{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9990884229717412,
  "eval_steps": 500,
  "global_step": 274,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0036463081130355514, "grad_norm": 1.5564329067676301, "learning_rate": 1.818181818181818e-06, "loss": 1.699, "step": 1 },
    { "epoch": 0.007292616226071103, "grad_norm": 44902.29063456451, "learning_rate": 3.636363636363636e-06, "loss": 1.7642, "step": 2 },
    { "epoch": 0.014585232452142206, "grad_norm": 1.8353451125445739, "learning_rate": 7.272727272727272e-06, "loss": 1.7172, "step": 4 },
    { "epoch": 0.02187784867821331, "grad_norm": 1.8226533138713261, "learning_rate": 1.0909090909090909e-05, "loss": 1.7472, "step": 6 },
    { "epoch": 0.02917046490428441, "grad_norm": 51.31275799965548, "learning_rate": 1.4545454545454545e-05, "loss": 1.6743, "step": 8 },
    { "epoch": 0.03646308113035551, "grad_norm": 2.4177896118244924, "learning_rate": 1.8181818181818182e-05, "loss": 1.6505, "step": 10 },
    { "epoch": 0.04375569735642662, "grad_norm": 1.4056780909392894, "learning_rate": 2.1818181818181818e-05, "loss": 1.662, "step": 12 },
    { "epoch": 0.05104831358249772, "grad_norm": 1.123663560927214, "learning_rate": 2.5454545454545454e-05, "loss": 1.6014, "step": 14 },
    { "epoch": 0.05834092980856882, "grad_norm": 0.660170892307739, "learning_rate": 2.909090909090909e-05, "loss": 1.5094, "step": 16 },
    { "epoch": 0.06563354603463993, "grad_norm": 0.42755156023092894, "learning_rate": 3.272727272727273e-05, "loss": 1.5272, "step": 18 },
    { "epoch": 0.07292616226071102, "grad_norm": 0.3829807576012777, "learning_rate": 3.6363636363636364e-05, "loss": 1.4067, "step": 20 },
    { "epoch": 0.08021877848678213, "grad_norm": 0.41242113134912567, "learning_rate": 4e-05, "loss": 1.3437, "step": 22 },
    { "epoch": 0.08751139471285324, "grad_norm": 0.4213871682292563, "learning_rate": 4.3636363636363636e-05, "loss": 1.3318, "step": 24 },
    { "epoch": 0.09480401093892434, "grad_norm": 0.3915162267949715, "learning_rate": 4.7272727272727275e-05, "loss": 1.3009, "step": 26 },
    { "epoch": 0.10209662716499544, "grad_norm": 0.37294398118323746, "learning_rate": 5.090909090909091e-05, "loss": 1.2709, "step": 28 },
    { "epoch": 0.10938924339106655, "grad_norm": 0.3180941760614004, "learning_rate": 5.4545454545454546e-05, "loss": 1.1872, "step": 30 },
    { "epoch": 0.11668185961713765, "grad_norm": 0.22743090816603856, "learning_rate": 5.818181818181818e-05, "loss": 1.1793, "step": 32 },
    { "epoch": 0.12397447584320875, "grad_norm": 0.15136600844382636, "learning_rate": 6.181818181818182e-05, "loss": 1.137, "step": 34 },
    { "epoch": 0.13126709206927986, "grad_norm": 0.14922343265678947, "learning_rate": 6.545454545454546e-05, "loss": 1.1545, "step": 36 },
    { "epoch": 0.13855970829535097, "grad_norm": 0.11950915139776307, "learning_rate": 6.90909090909091e-05, "loss": 1.0923, "step": 38 },
    { "epoch": 0.14585232452142205, "grad_norm": 0.12975286429339442, "learning_rate": 7.272727272727273e-05, "loss": 1.0946, "step": 40 },
    { "epoch": 0.15314494074749316, "grad_norm": 0.11636352912910496, "learning_rate": 7.636363636363637e-05, "loss": 1.0651, "step": 42 },
    { "epoch": 0.16043755697356427, "grad_norm": 0.12013810386339561, "learning_rate": 8e-05, "loss": 1.0725, "step": 44 },
    { "epoch": 0.16773017319963537, "grad_norm": 0.09545625176758893, "learning_rate": 8.363636363636364e-05, "loss": 1.1092, "step": 46 },
    { "epoch": 0.17502278942570648, "grad_norm": 0.08240688428637283, "learning_rate": 8.727272727272727e-05, "loss": 1.0449, "step": 48 },
    { "epoch": 0.18231540565177756, "grad_norm": 0.08336026564156031, "learning_rate": 9.090909090909092e-05, "loss": 1.053, "step": 50 },
    { "epoch": 0.18960802187784867, "grad_norm": 0.08733239972775936, "learning_rate": 9.454545454545455e-05, "loss": 1.0629, "step": 52 },
    { "epoch": 0.19690063810391978, "grad_norm": 0.07845494536138678, "learning_rate": 9.818181818181818e-05, "loss": 1.0642, "step": 54 },
    { "epoch": 0.2041932543299909, "grad_norm": 0.0792882467275035, "learning_rate": 9.999977231314127e-05, "loss": 1.0824, "step": 56 },
    { "epoch": 0.211485870556062, "grad_norm": 0.07831448467807114, "learning_rate": 9.999795083071328e-05, "loss": 1.0554, "step": 58 },
    { "epoch": 0.2187784867821331, "grad_norm": 0.09325192568500952, "learning_rate": 9.999430793221355e-05, "loss": 1.0361, "step": 60 },
    { "epoch": 0.22607110300820418, "grad_norm": 0.08001023877218082, "learning_rate": 9.998884375035221e-05, "loss": 1.0501, "step": 62 },
    { "epoch": 0.2333637192342753, "grad_norm": 0.08525503053749453, "learning_rate": 9.99815584841884e-05, "loss": 1.0421, "step": 64 },
    { "epoch": 0.2406563354603464, "grad_norm": 0.18952595666861716, "learning_rate": 9.997245239912299e-05, "loss": 1.0598, "step": 66 },
    { "epoch": 0.2479489516864175, "grad_norm": 0.5356478609043368, "learning_rate": 9.996152582688898e-05, "loss": 1.0195, "step": 68 },
    { "epoch": 0.2552415679124886, "grad_norm": 0.07439723408480747, "learning_rate": 9.994877916553938e-05, "loss": 1.0119, "step": 70 },
    { "epoch": 0.2625341841385597, "grad_norm": 0.08769208744835676, "learning_rate": 9.993421287943269e-05, "loss": 1.0367, "step": 72 },
    { "epoch": 0.2698268003646308, "grad_norm": 0.07727464499454006, "learning_rate": 9.991782749921601e-05, "loss": 1.0222, "step": 74 },
    { "epoch": 0.27711941659070194, "grad_norm": 0.07684989155110282, "learning_rate": 9.98996236218057e-05, "loss": 1.0291, "step": 76 },
    { "epoch": 0.284412032816773, "grad_norm": 0.074797273929808, "learning_rate": 9.987960191036562e-05, "loss": 1.0265, "step": 78 },
    { "epoch": 0.2917046490428441, "grad_norm": 0.07808867527853167, "learning_rate": 9.985776309428305e-05, "loss": 1.0394, "step": 80 },
    { "epoch": 0.29899726526891524, "grad_norm": 0.08366367602394825, "learning_rate": 9.983410796914196e-05, "loss": 0.9918, "step": 82 },
    { "epoch": 0.3062898814949863, "grad_norm": 0.08235936275195418, "learning_rate": 9.98086373966942e-05, "loss": 1.0093, "step": 84 },
    { "epoch": 0.31358249772105745, "grad_norm": 0.07114879422193107, "learning_rate": 9.978135230482797e-05, "loss": 1.0331, "step": 86 },
    { "epoch": 0.32087511394712853, "grad_norm": 0.0789659528845, "learning_rate": 9.975225368753412e-05, "loss": 1.0083, "step": 88 },
    { "epoch": 0.3281677301731996, "grad_norm": 0.12722043213844833, "learning_rate": 9.972134260486988e-05, "loss": 0.9868, "step": 90 },
    { "epoch": 0.33546034639927075, "grad_norm": 0.07460212363414274, "learning_rate": 9.968862018292026e-05, "loss": 0.9787, "step": 92 },
    { "epoch": 0.34275296262534183, "grad_norm": 0.0794228233950649, "learning_rate": 9.965408761375701e-05, "loss": 0.9983, "step": 94 },
    { "epoch": 0.35004557885141296, "grad_norm": 0.07805234354253066, "learning_rate": 9.961774615539522e-05, "loss": 0.9712, "step": 96 },
    { "epoch": 0.35733819507748404, "grad_norm": 0.07531557083466454, "learning_rate": 9.957959713174748e-05, "loss": 0.9888, "step": 98 },
    { "epoch": 0.3646308113035551, "grad_norm": 0.07667315677213174, "learning_rate": 9.953964193257563e-05, "loss": 0.9782, "step": 100 },
    { "epoch": 0.37192342752962626, "grad_norm": 0.0769553697010856, "learning_rate": 9.949788201344019e-05, "loss": 0.9615, "step": 102 },
    { "epoch": 0.37921604375569734, "grad_norm": 0.08212042393870912, "learning_rate": 9.945431889564723e-05, "loss": 0.9755, "step": 104 },
    { "epoch": 0.3865086599817685, "grad_norm": 0.08314540306193914, "learning_rate": 9.940895416619309e-05, "loss": 1.0069, "step": 106 },
    { "epoch": 0.39380127620783956, "grad_norm": 0.07970050714561497, "learning_rate": 9.936178947770641e-05, "loss": 0.9686, "step": 108 },
    { "epoch": 0.4010938924339107, "grad_norm": 0.2747778373047559, "learning_rate": 9.931282654838803e-05, "loss": 0.9878, "step": 110 },
    { "epoch": 0.4083865086599818, "grad_norm": 0.08106771126002589, "learning_rate": 9.926206716194842e-05, "loss": 0.9948, "step": 112 },
    { "epoch": 0.41567912488605285, "grad_norm": 0.08133105923791639, "learning_rate": 9.920951316754259e-05, "loss": 0.9621, "step": 114 },
    { "epoch": 0.422971741112124, "grad_norm": 0.07751316025392309, "learning_rate": 9.915516647970282e-05, "loss": 1.009, "step": 116 },
    { "epoch": 0.43026435733819507, "grad_norm": 0.07429775187060293, "learning_rate": 9.909902907826884e-05, "loss": 0.9564, "step": 118 },
    { "epoch": 0.4375569735642662, "grad_norm": 0.06885396447375916, "learning_rate": 9.904110300831577e-05, "loss": 0.9516, "step": 120 },
    { "epoch": 0.4448495897903373, "grad_norm": 0.11911641611028377, "learning_rate": 9.898139038007961e-05, "loss": 0.9501, "step": 122 },
    { "epoch": 0.45214220601640837, "grad_norm": 0.08134204746867607, "learning_rate": 9.891989336888032e-05, "loss": 0.9787, "step": 124 },
    { "epoch": 0.4594348222424795, "grad_norm": 0.06703677306759141, "learning_rate": 9.88566142150426e-05, "loss": 0.9383, "step": 126 },
    { "epoch": 0.4667274384685506, "grad_norm": 0.08875939081498214, "learning_rate": 9.87915552238143e-05, "loss": 0.9565, "step": 128 },
    { "epoch": 0.4740200546946217, "grad_norm": 0.06975398482601901, "learning_rate": 9.872471876528236e-05, "loss": 0.9351, "step": 130 },
    { "epoch": 0.4813126709206928, "grad_norm": 0.08668686890512382, "learning_rate": 9.865610727428661e-05, "loss": 0.9619, "step": 132 },
    { "epoch": 0.4886052871467639, "grad_norm": 0.07299762770901298, "learning_rate": 9.858572325033089e-05, "loss": 0.9666, "step": 134 },
    { "epoch": 0.495897903372835, "grad_norm": 0.22619188588419292, "learning_rate": 9.851356925749217e-05, "loss": 0.9564, "step": 136 },
    { "epoch": 0.5031905195989061, "grad_norm": 0.06929553898073587, "learning_rate": 9.843964792432702e-05, "loss": 0.9416, "step": 138 },
    { "epoch": 0.5104831358249772, "grad_norm": 0.07296219603236209, "learning_rate": 9.836396194377586e-05, "loss": 0.9606, "step": 140 },
    { "epoch": 0.5177757520510483, "grad_norm": 0.17258152427782444, "learning_rate": 9.828651407306495e-05, "loss": 0.9405, "step": 142 },
    { "epoch": 0.5250683682771194, "grad_norm": 0.08048053517750381, "learning_rate": 9.820730713360584e-05, "loss": 0.9308, "step": 144 },
    { "epoch": 0.5323609845031905, "grad_norm": 0.07478421518396797, "learning_rate": 9.812634401089265e-05, "loss": 0.9433, "step": 146 },
    { "epoch": 0.5396536007292616, "grad_norm": 0.0744608512059185, "learning_rate": 9.804362765439688e-05, "loss": 0.9545, "step": 148 },
    { "epoch": 0.5469462169553327, "grad_norm": 0.07446299592606315, "learning_rate": 9.795916107746009e-05, "loss": 0.925, "step": 150 },
    { "epoch": 0.5542388331814039, "grad_norm": 0.07071717466745271, "learning_rate": 9.787294735718397e-05, "loss": 0.9173, "step": 152 },
    { "epoch": 0.5615314494074749, "grad_norm": 0.0701984606480579, "learning_rate": 9.778498963431837e-05, "loss": 0.9082, "step": 154 },
    { "epoch": 0.568824065633546, "grad_norm": 0.07669702930168845, "learning_rate": 9.769529111314682e-05, "loss": 0.9497, "step": 156 },
    { "epoch": 0.5761166818596172, "grad_norm": 0.11697404287375447, "learning_rate": 9.76038550613698e-05, "loss": 0.9144, "step": 158 },
    { "epoch": 0.5834092980856882, "grad_norm": 0.07825369109874582, "learning_rate": 9.75106848099857e-05, "loss": 0.8793, "step": 160 },
    { "epoch": 0.5907019143117593, "grad_norm": 1.1005245093691354, "learning_rate": 9.741578375316952e-05, "loss": 0.9145, "step": 162 },
    { "epoch": 0.5979945305378305, "grad_norm": 0.08370330168443421, "learning_rate": 9.731915534814912e-05, "loss": 0.9511, "step": 164 },
    { "epoch": 0.6052871467639015, "grad_norm": 0.08122736176843745, "learning_rate": 9.722080311507937e-05, "loss": 0.9584, "step": 166 },
    { "epoch": 0.6125797629899726, "grad_norm": 0.07648634858832534, "learning_rate": 9.712073063691386e-05, "loss": 0.9262, "step": 168 },
    { "epoch": 0.6198723792160438, "grad_norm": 0.08053849014890098, "learning_rate": 9.701894155927445e-05, "loss": 0.9313, "step": 170 },
    { "epoch": 0.6271649954421149, "grad_norm": 0.07210396245886411, "learning_rate": 9.69154395903183e-05, "loss": 0.9174, "step": 172 },
    { "epoch": 0.6344576116681859, "grad_norm": 0.07286947087570095, "learning_rate": 9.681022850060296e-05, "loss": 0.8893, "step": 174 },
    { "epoch": 0.6417502278942571, "grad_norm": 0.06923686161728522, "learning_rate": 9.670331212294889e-05, "loss": 0.9395, "step": 176 },
    { "epoch": 0.6490428441203282, "grad_norm": 0.0673865610450944, "learning_rate": 9.659469435229992e-05, "loss": 0.91, "step": 178 },
    { "epoch": 0.6563354603463992, "grad_norm": 0.06948906848590711, "learning_rate": 9.648437914558124e-05, "loss": 0.9168, "step": 180 },
    { "epoch": 0.6636280765724704, "grad_norm": 0.07289843532121563, "learning_rate": 9.63723705215554e-05, "loss": 0.9552, "step": 182 },
    { "epoch": 0.6709206927985415, "grad_norm": 0.06990802284713049, "learning_rate": 9.625867256067578e-05, "loss": 0.9033, "step": 184 },
    { "epoch": 0.6782133090246126, "grad_norm": 0.07555680912326869, "learning_rate": 9.614328940493798e-05, "loss": 0.9206, "step": 186 },
    { "epoch": 0.6855059252506837, "grad_norm": 0.07743540693217674, "learning_rate": 9.602622525772895e-05, "loss": 0.9005, "step": 188 },
    { "epoch": 0.6927985414767548, "grad_norm": 0.07177992266999077, "learning_rate": 9.590748438367388e-05, "loss": 0.9041, "step": 190 },
    { "epoch": 0.7000911577028259, "grad_norm": 0.06957828378926412, "learning_rate": 9.578707110848078e-05, "loss": 0.8957, "step": 192 },
    { "epoch": 0.707383773928897, "grad_norm": 0.06622017193744313, "learning_rate": 9.56649898187829e-05, "loss": 0.9108, "step": 194 },
    { "epoch": 0.7146763901549681, "grad_norm": 0.06939738922827116, "learning_rate": 9.554124496197898e-05, "loss": 0.9467, "step": 196 },
    { "epoch": 0.7219690063810392, "grad_norm": 0.06510687273754218, "learning_rate": 9.54158410460712e-05, "loss": 0.9101, "step": 198 },
    { "epoch": 0.7292616226071102, "grad_norm": 0.12122407982819745, "learning_rate": 9.528878263950094e-05, "loss": 0.9271, "step": 200 },
    { "epoch": 0.7365542388331814, "grad_norm": 0.0703901780763089, "learning_rate": 9.516007437098237e-05, "loss": 0.9162, "step": 202 },
    { "epoch": 0.7438468550592525, "grad_norm": 0.13768411678101625, "learning_rate": 9.502972092933384e-05, "loss": 0.8917, "step": 204 },
    { "epoch": 0.7511394712853237, "grad_norm": 0.13103063463158354, "learning_rate": 9.489772706330706e-05, "loss": 0.9074, "step": 206 },
    { "epoch": 0.7584320875113947, "grad_norm": 0.07056505425629164, "learning_rate": 9.476409758141405e-05, "loss": 0.9288, "step": 208 },
    { "epoch": 0.7657247037374658, "grad_norm": 0.07383435280021707, "learning_rate": 9.462883735175205e-05, "loss": 0.9059, "step": 210 },
    { "epoch": 0.773017319963537, "grad_norm": 0.1015687322518938, "learning_rate": 9.449195130182613e-05, "loss": 0.9193, "step": 212 },
    { "epoch": 0.780309936189608, "grad_norm": 0.07104451767439372, "learning_rate": 9.435344441836968e-05, "loss": 0.9097, "step": 214 },
    { "epoch": 0.7876025524156791, "grad_norm": 0.06852107601664209, "learning_rate": 9.42133217471628e-05, "loss": 0.9126, "step": 216 },
    { "epoch": 0.7948951686417502, "grad_norm": 0.07368568637779982, "learning_rate": 9.407158839284835e-05, "loss": 0.945, "step": 218 },
    { "epoch": 0.8021877848678214, "grad_norm": 0.0648178325805628, "learning_rate": 9.392824951874617e-05, "loss": 0.8912, "step": 220 },
    { "epoch": 0.8094804010938924, "grad_norm": 0.06456581944586823, "learning_rate": 9.378331034666484e-05, "loss": 0.8899, "step": 222 },
    { "epoch": 0.8167730173199635, "grad_norm": 0.06900542842249857, "learning_rate": 9.363677615671148e-05, "loss": 0.9119, "step": 224 },
    { "epoch": 0.8240656335460347, "grad_norm": 0.06967833214883962, "learning_rate": 9.348865228709947e-05, "loss": 0.889, "step": 226 },
    { "epoch": 0.8313582497721057, "grad_norm": 0.08360540263683244, "learning_rate": 9.333894413395387e-05, "loss": 0.865, "step": 228 },
    { "epoch": 0.8386508659981768, "grad_norm": 0.07306384866137224, "learning_rate": 9.318765715111497e-05, "loss": 0.9074, "step": 230 },
    { "epoch": 0.845943482224248, "grad_norm": 0.07245184365008314, "learning_rate": 9.303479684993942e-05, "loss": 0.908, "step": 232 },
    { "epoch": 0.853236098450319, "grad_norm": 0.08363970365051233, "learning_rate": 9.288036879909968e-05, "loss": 0.8873, "step": 234 },
    { "epoch": 0.8605287146763901, "grad_norm": 0.07030389813918717, "learning_rate": 9.272437862438094e-05, "loss": 0.8869, "step": 236 },
    { "epoch": 0.8678213309024613, "grad_norm": 0.08293652025190876, "learning_rate": 9.256683200847638e-05, "loss": 0.871, "step": 238 },
    { "epoch": 0.8751139471285324, "grad_norm": 0.07471078064157549, "learning_rate": 9.240773469077993e-05, "loss": 0.8742, "step": 240 },
    { "epoch": 0.8824065633546034, "grad_norm": 0.07710454753144116, "learning_rate": 9.22470924671774e-05, "loss": 0.8743, "step": 242 },
    { "epoch": 0.8896991795806746, "grad_norm": 0.06889403226940291, "learning_rate": 9.208491118983514e-05, "loss": 0.8367, "step": 244 },
    { "epoch": 0.8969917958067457, "grad_norm": 0.07079796419035034, "learning_rate": 9.192119676698703e-05, "loss": 0.8699, "step": 246 },
    { "epoch": 0.9042844120328167, "grad_norm": 0.08107811116365189, "learning_rate": 9.17559551627191e-05, "loss": 0.8794, "step": 248 },
    { "epoch": 0.9115770282588879, "grad_norm": 0.07964800640935288, "learning_rate": 9.158919239675236e-05, "loss": 0.9364, "step": 250 },
    { "epoch": 0.918869644484959, "grad_norm": 0.07802559563260855, "learning_rate": 9.14209145442234e-05, "loss": 0.8561, "step": 252 },
    { "epoch": 0.92616226071103, "grad_norm": 0.08673021093662099, "learning_rate": 9.125112773546315e-05, "loss": 0.8854, "step": 254 },
    { "epoch": 0.9334548769371012, "grad_norm": 0.073859691158192, "learning_rate": 9.107983815577359e-05, "loss": 0.8949, "step": 256 },
    { "epoch": 0.9407474931631723, "grad_norm": 0.06585824434560679, "learning_rate": 9.090705204520231e-05, "loss": 0.8642, "step": 258 },
    { "epoch": 0.9480401093892434, "grad_norm": 0.07928660195604606, "learning_rate": 9.073277569831526e-05, "loss": 0.8936, "step": 260 },
    { "epoch": 0.9553327256153145, "grad_norm": 0.07629458882620191, "learning_rate": 9.05570154639674e-05, "loss": 0.889, "step": 262 },
    { "epoch": 0.9626253418413856, "grad_norm": 0.12177477305372565, "learning_rate": 9.03797777450715e-05, "loss": 0.8869, "step": 264 },
    { "epoch": 0.9699179580674567, "grad_norm": 0.07512333956042923, "learning_rate": 9.020106899836472e-05, "loss": 0.8821, "step": 266 },
    { "epoch": 0.9772105742935278, "grad_norm": 0.07673096729175252, "learning_rate": 9.002089573417356e-05, "loss": 0.8406, "step": 268 },
    { "epoch": 0.9845031905195989, "grad_norm": 0.13567692134785442, "learning_rate": 8.983926451617664e-05, "loss": 0.8644, "step": 270 },
    { "epoch": 0.99179580674567, "grad_norm": 0.08759393012824235, "learning_rate": 8.965618196116549e-05, "loss": 0.844, "step": 272 },
    { "epoch": 0.9990884229717412, "grad_norm": 0.08369780561981159, "learning_rate": 8.947165473880363e-05, "loss": 0.8516, "step": 274 }
  ],
  "logging_steps": 2,
  "max_steps": 1096,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1698152759427072.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}