{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9398359161349132,
  "eval_steps": 1.0,
  "global_step": 129,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 9.694531367351816,
      "learning_rate": 0.0,
      "loss": 0.9609,
      "step": 1
    },
    {
      "epoch": 0.05,
      "grad_norm": 6.0698578450559015,
      "learning_rate": 5e-06,
      "loss": 0.8693,
      "step": 2
    },
    {
      "epoch": 0.07,
      "grad_norm": 6.5557897846625535,
      "learning_rate": 7.924812503605782e-06,
      "loss": 0.8571,
      "step": 3
    },
    {
      "epoch": 0.09,
      "grad_norm": 5.8249565635426235,
      "learning_rate": 1e-05,
      "loss": 0.8098,
      "step": 4
    },
    {
      "epoch": 0.11,
      "grad_norm": 7.157097194857861,
      "learning_rate": 1e-05,
      "loss": 0.7421,
      "step": 5
    },
    {
      "epoch": 0.14,
      "grad_norm": 6.7490063693246185,
      "learning_rate": 1e-05,
      "loss": 0.7184,
      "step": 6
    },
    {
      "epoch": 0.16,
      "grad_norm": 6.989582963023535,
      "learning_rate": 1e-05,
      "loss": 0.7101,
      "step": 7
    },
    {
      "epoch": 0.18,
      "grad_norm": 4.856325406983797,
      "learning_rate": 1e-05,
      "loss": 0.6815,
      "step": 8
    },
    {
      "epoch": 0.21,
      "grad_norm": 4.893753189326447,
      "learning_rate": 1e-05,
      "loss": 0.6975,
      "step": 9
    },
    {
      "epoch": 0.23,
      "grad_norm": 4.8564394287744745,
      "learning_rate": 1e-05,
      "loss": 0.65,
      "step": 10
    },
    {
      "epoch": 0.25,
      "grad_norm": 4.638777805285931,
      "learning_rate": 1e-05,
      "loss": 0.5817,
      "step": 11
    },
    {
      "epoch": 0.27,
      "grad_norm": 3.701804268792858,
      "learning_rate": 1e-05,
      "loss": 0.6005,
      "step": 12
    },
    {
      "epoch": 0.3,
      "grad_norm": 3.08244950250501,
      "learning_rate": 1e-05,
      "loss": 0.5529,
      "step": 13
    },
    {
      "epoch": 0.32,
      "grad_norm": 3.210541622720776,
      "learning_rate": 1e-05,
      "loss": 0.5654,
      "step": 14
    },
    {
      "epoch": 0.34,
      "grad_norm": 3.0384959005478867,
      "learning_rate": 1e-05,
      "loss": 0.5184,
      "step": 15
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.6850393439015092,
      "learning_rate": 1e-05,
      "loss": 0.5234,
      "step": 16
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.8596154120661748,
      "learning_rate": 1e-05,
      "loss": 0.5005,
      "step": 17
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.726391191331952,
      "learning_rate": 1e-05,
      "loss": 0.4691,
      "step": 18
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.934188314387199,
      "learning_rate": 1e-05,
      "loss": 0.5265,
      "step": 19
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.496046815968877,
      "learning_rate": 1e-05,
      "loss": 0.46,
      "step": 20
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.6527493698931703,
      "learning_rate": 1e-05,
      "loss": 0.4633,
      "step": 21
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.3371216207823364,
      "learning_rate": 1e-05,
      "loss": 0.4512,
      "step": 22
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.5029789662415576,
      "learning_rate": 1e-05,
      "loss": 0.4538,
      "step": 23
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.3654775885807435,
      "learning_rate": 1e-05,
      "loss": 0.4503,
      "step": 24
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.520897440850751,
      "learning_rate": 1e-05,
      "loss": 0.4393,
      "step": 25
    },
    {
      "epoch": 0.59,
      "grad_norm": 2.2973148314047998,
      "learning_rate": 1e-05,
      "loss": 0.4246,
      "step": 26
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.064310434148654,
      "learning_rate": 1e-05,
      "loss": 0.4466,
      "step": 27
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.1518579274793614,
      "learning_rate": 1e-05,
      "loss": 0.4157,
      "step": 28
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.101663348817292,
      "learning_rate": 1e-05,
      "loss": 0.4142,
      "step": 29
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.0792549392682624,
      "learning_rate": 1e-05,
      "loss": 0.3814,
      "step": 30
    },
    {
      "epoch": 0.71,
      "grad_norm": 2.4317303715104868,
      "learning_rate": 1e-05,
      "loss": 0.4042,
      "step": 31
    },
    {
      "epoch": 0.73,
      "grad_norm": 2.6323607639867443,
      "learning_rate": 1e-05,
      "loss": 0.3745,
      "step": 32
    },
    {
      "epoch": 0.75,
      "grad_norm": 2.2663973989673987,
      "learning_rate": 1e-05,
      "loss": 0.3781,
      "step": 33
    },
    {
      "epoch": 0.77,
      "grad_norm": 2.2288967526596193,
      "learning_rate": 1e-05,
      "loss": 0.3894,
      "step": 34
    },
    {
      "epoch": 0.8,
      "grad_norm": 2.2598649631650223,
      "learning_rate": 1e-05,
      "loss": 0.3734,
      "step": 35
    },
    {
      "epoch": 0.82,
      "grad_norm": 2.046710491098762,
      "learning_rate": 1e-05,
      "loss": 0.3436,
      "step": 36
    },
    {
      "epoch": 0.84,
      "grad_norm": 2.104660502205773,
      "learning_rate": 1e-05,
      "loss": 0.3636,
      "step": 37
    },
    {
      "epoch": 0.87,
      "grad_norm": 2.2194274529750335,
      "learning_rate": 1e-05,
      "loss": 0.3687,
      "step": 38
    },
    {
      "epoch": 0.89,
      "grad_norm": 2.380578087514372,
      "learning_rate": 1e-05,
      "loss": 0.3386,
      "step": 39
    },
    {
      "epoch": 0.91,
      "grad_norm": 2.1539059012744675,
      "learning_rate": 1e-05,
      "loss": 0.3663,
      "step": 40
    },
    {
      "epoch": 0.93,
      "grad_norm": 2.2975008924550884,
      "learning_rate": 1e-05,
      "loss": 0.3529,
      "step": 41
    },
    {
      "epoch": 0.96,
      "grad_norm": 2.0215009917813864,
      "learning_rate": 1e-05,
      "loss": 0.3255,
      "step": 42
    },
    {
      "epoch": 0.98,
      "grad_norm": 2.260279893370542,
      "learning_rate": 1e-05,
      "loss": 0.3496,
      "step": 43
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.1998031778618596,
      "learning_rate": 1e-05,
      "loss": 0.3295,
      "step": 44
    },
    {
      "epoch": 1.03,
      "grad_norm": 1.9950078809811038,
      "learning_rate": 1e-05,
      "loss": 0.2919,
      "step": 45
    },
    {
      "epoch": 1.05,
      "grad_norm": 1.8802987071836594,
      "learning_rate": 1e-05,
      "loss": 0.2774,
      "step": 46
    },
    {
      "epoch": 1.07,
      "grad_norm": 2.330358996793001,
      "learning_rate": 1e-05,
      "loss": 0.2953,
      "step": 47
    },
    {
      "epoch": 1.09,
      "grad_norm": 2.187255642034782,
      "learning_rate": 1e-05,
      "loss": 0.2637,
      "step": 48
    },
    {
      "epoch": 1.12,
      "grad_norm": 2.246935471483204,
      "learning_rate": 1e-05,
      "loss": 0.2806,
      "step": 49
    },
    {
      "epoch": 1.14,
      "grad_norm": 2.129600005729623,
      "learning_rate": 1e-05,
      "loss": 0.2566,
      "step": 50
    },
    {
      "epoch": 1.16,
      "grad_norm": 2.1613544928929347,
      "learning_rate": 1e-05,
      "loss": 0.2703,
      "step": 51
    },
    {
      "epoch": 1.19,
      "grad_norm": 2.097651271845284,
      "learning_rate": 1e-05,
      "loss": 0.2578,
      "step": 52
    },
    {
      "epoch": 1.21,
      "grad_norm": 2.1698187501885404,
      "learning_rate": 1e-05,
      "loss": 0.266,
      "step": 53
    },
    {
      "epoch": 1.23,
      "grad_norm": 2.0534602028652733,
      "learning_rate": 1e-05,
      "loss": 0.2617,
      "step": 54
    },
    {
      "epoch": 1.25,
      "grad_norm": 2.280247094380378,
      "learning_rate": 1e-05,
      "loss": 0.2725,
      "step": 55
    },
    {
      "epoch": 1.28,
      "grad_norm": 2.282981855579156,
      "learning_rate": 1e-05,
      "loss": 0.2696,
      "step": 56
    },
    {
      "epoch": 1.3,
      "grad_norm": 2.1723199295603246,
      "learning_rate": 1e-05,
      "loss": 0.2801,
      "step": 57
    },
    {
      "epoch": 1.32,
      "grad_norm": 2.1013790638047833,
      "learning_rate": 1e-05,
      "loss": 0.2462,
      "step": 58
    },
    {
      "epoch": 1.34,
      "grad_norm": 2.1486192844939187,
      "learning_rate": 1e-05,
      "loss": 0.2634,
      "step": 59
    },
    {
      "epoch": 1.37,
      "grad_norm": 2.2011921615871874,
      "learning_rate": 1e-05,
      "loss": 0.2644,
      "step": 60
    },
    {
      "epoch": 1.39,
      "grad_norm": 2.0994036223733907,
      "learning_rate": 1e-05,
      "loss": 0.2366,
      "step": 61
    },
    {
      "epoch": 1.41,
      "grad_norm": 1.8682173502881247,
      "learning_rate": 1e-05,
      "loss": 0.2369,
      "step": 62
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.9469094762833548,
      "learning_rate": 1e-05,
      "loss": 0.243,
      "step": 63
    },
    {
      "epoch": 1.46,
      "grad_norm": 2.198810958517713,
      "learning_rate": 1e-05,
      "loss": 0.2538,
      "step": 64
    },
    {
      "epoch": 1.48,
      "grad_norm": 1.9614171222986219,
      "learning_rate": 1e-05,
      "loss": 0.2209,
      "step": 65
    },
    {
      "epoch": 1.5,
      "grad_norm": 2.1909792689278924,
      "learning_rate": 1e-05,
      "loss": 0.2289,
      "step": 66
    },
    {
      "epoch": 1.53,
      "grad_norm": 2.219018585854138,
      "learning_rate": 1e-05,
      "loss": 0.2438,
      "step": 67
    },
    {
      "epoch": 1.55,
      "grad_norm": 2.095328922178155,
      "learning_rate": 1e-05,
      "loss": 0.2384,
      "step": 68
    },
    {
      "epoch": 1.57,
      "grad_norm": 2.1047747885459596,
      "learning_rate": 1e-05,
      "loss": 0.2511,
      "step": 69
    },
    {
      "epoch": 1.6,
      "grad_norm": 2.217218508465867,
      "learning_rate": 1e-05,
      "loss": 0.2326,
      "step": 70
    },
    {
      "epoch": 1.62,
      "grad_norm": 2.2940614424982364,
      "learning_rate": 1e-05,
      "loss": 0.2095,
      "step": 71
    },
    {
      "epoch": 1.64,
      "grad_norm": 2.0023231563553012,
      "learning_rate": 1e-05,
      "loss": 0.2178,
      "step": 72
    },
    {
      "epoch": 1.66,
      "grad_norm": 1.8803020902826912,
      "learning_rate": 1e-05,
      "loss": 0.2299,
      "step": 73
    },
    {
      "epoch": 1.69,
      "grad_norm": 1.9874036980658476,
      "learning_rate": 1e-05,
      "loss": 0.2085,
      "step": 74
    },
    {
      "epoch": 1.71,
      "grad_norm": 1.9993267804137187,
      "learning_rate": 1e-05,
      "loss": 0.2189,
      "step": 75
    },
    {
      "epoch": 1.73,
      "grad_norm": 2.075032021080106,
      "learning_rate": 1e-05,
      "loss": 0.2122,
      "step": 76
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.9908316147368204,
      "learning_rate": 1e-05,
      "loss": 0.2169,
      "step": 77
    },
    {
      "epoch": 1.78,
      "grad_norm": 1.9737029328293805,
      "learning_rate": 1e-05,
      "loss": 0.1976,
      "step": 78
    },
    {
      "epoch": 1.8,
      "grad_norm": 2.069169880586868,
      "learning_rate": 1e-05,
      "loss": 0.2193,
      "step": 79
    },
    {
      "epoch": 1.82,
      "grad_norm": 1.9730042900890021,
      "learning_rate": 1e-05,
      "loss": 0.1959,
      "step": 80
    },
    {
      "epoch": 1.85,
      "grad_norm": 1.9487990391437768,
      "learning_rate": 1e-05,
      "loss": 0.2099,
      "step": 81
    },
    {
      "epoch": 1.87,
      "grad_norm": 2.2136709023857923,
      "learning_rate": 1e-05,
      "loss": 0.2064,
      "step": 82
    },
    {
      "epoch": 1.89,
      "grad_norm": 1.867453149915136,
      "learning_rate": 1e-05,
      "loss": 0.2148,
      "step": 83
    },
    {
      "epoch": 1.91,
      "grad_norm": 2.2341488312462374,
      "learning_rate": 1e-05,
      "loss": 0.2152,
      "step": 84
    },
    {
      "epoch": 1.94,
      "grad_norm": 1.917448307480571,
      "learning_rate": 1e-05,
      "loss": 0.2087,
      "step": 85
    },
    {
      "epoch": 1.96,
      "grad_norm": 2.1010824249510938,
      "learning_rate": 1e-05,
      "loss": 0.1989,
      "step": 86
    },
    {
      "epoch": 1.98,
      "grad_norm": 1.9948658919746771,
      "learning_rate": 1e-05,
      "loss": 0.184,
      "step": 87
    },
    {
      "epoch": 2.01,
      "grad_norm": 1.7700018111142861,
      "learning_rate": 1e-05,
      "loss": 0.1755,
      "step": 88
    },
    {
      "epoch": 2.03,
      "grad_norm": 1.7706928580089987,
      "learning_rate": 1e-05,
      "loss": 0.1524,
      "step": 89
    },
    {
      "epoch": 2.05,
      "grad_norm": 1.924950952533756,
      "learning_rate": 1e-05,
      "loss": 0.1639,
      "step": 90
    },
    {
      "epoch": 2.07,
      "grad_norm": 1.8520437499999318,
      "learning_rate": 1e-05,
      "loss": 0.1628,
      "step": 91
    },
    {
      "epoch": 2.1,
      "grad_norm": 1.9282660088754877,
      "learning_rate": 1e-05,
      "loss": 0.1738,
      "step": 92
    },
    {
      "epoch": 2.12,
      "grad_norm": 2.152060500211321,
      "learning_rate": 1e-05,
      "loss": 0.1533,
      "step": 93
    },
    {
      "epoch": 2.14,
      "grad_norm": 2.1072066102911204,
      "learning_rate": 1e-05,
      "loss": 0.1699,
      "step": 94
    },
    {
      "epoch": 2.16,
      "grad_norm": 1.9056526630981285,
      "learning_rate": 1e-05,
      "loss": 0.1542,
      "step": 95
    },
    {
      "epoch": 2.19,
      "grad_norm": 2.0541561878206815,
      "learning_rate": 1e-05,
      "loss": 0.1654,
      "step": 96
    },
    {
      "epoch": 2.21,
      "grad_norm": 1.9010874038699952,
      "learning_rate": 1e-05,
      "loss": 0.1284,
      "step": 97
    },
    {
      "epoch": 2.23,
      "grad_norm": 1.8180452474072457,
      "learning_rate": 1e-05,
      "loss": 0.1294,
      "step": 98
    },
    {
      "epoch": 2.26,
      "grad_norm": 1.9510507921351066,
      "learning_rate": 1e-05,
      "loss": 0.1555,
      "step": 99
    },
    {
      "epoch": 2.28,
      "grad_norm": 2.16385305667214,
      "learning_rate": 1e-05,
      "loss": 0.1568,
      "step": 100
    },
    {
      "epoch": 2.3,
      "grad_norm": 1.778578736374391,
      "learning_rate": 1e-05,
      "loss": 0.1493,
      "step": 101
    },
    {
      "epoch": 2.32,
      "grad_norm": 1.7020886640236346,
      "learning_rate": 1e-05,
      "loss": 0.1365,
      "step": 102
    },
    {
      "epoch": 2.35,
      "grad_norm": 1.9239197373726553,
      "learning_rate": 1e-05,
      "loss": 0.1529,
      "step": 103
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.8300125885434804,
      "learning_rate": 1e-05,
      "loss": 0.1507,
      "step": 104
    },
    {
      "epoch": 2.39,
      "grad_norm": 1.7740533757030397,
      "learning_rate": 1e-05,
      "loss": 0.1404,
      "step": 105
    },
    {
      "epoch": 2.42,
      "grad_norm": 1.921607088125746,
      "learning_rate": 1e-05,
      "loss": 0.1403,
      "step": 106
    },
    {
      "epoch": 2.44,
      "grad_norm": 1.7244219031686432,
      "learning_rate": 1e-05,
      "loss": 0.1493,
      "step": 107
    },
    {
      "epoch": 2.46,
      "grad_norm": 1.840637334327782,
      "learning_rate": 1e-05,
      "loss": 0.1555,
      "step": 108
    },
    {
      "epoch": 2.48,
      "grad_norm": 1.7138834314865232,
      "learning_rate": 1e-05,
      "loss": 0.1507,
      "step": 109
    },
    {
      "epoch": 2.51,
      "grad_norm": 1.8869772327906467,
      "learning_rate": 1e-05,
      "loss": 0.1519,
      "step": 110
    },
    {
      "epoch": 2.53,
      "grad_norm": 1.722031710475424,
      "learning_rate": 1e-05,
      "loss": 0.1388,
      "step": 111
    },
    {
      "epoch": 2.55,
      "grad_norm": 1.8294129933303556,
      "learning_rate": 1e-05,
      "loss": 0.1429,
      "step": 112
    },
    {
      "epoch": 2.58,
      "grad_norm": 1.913994003850658,
      "learning_rate": 1e-05,
      "loss": 0.1391,
      "step": 113
    },
    {
      "epoch": 2.6,
      "grad_norm": 1.9673824351683147,
      "learning_rate": 1e-05,
      "loss": 0.1374,
      "step": 114
    },
    {
      "epoch": 2.62,
      "grad_norm": 1.9673399496447597,
      "learning_rate": 1e-05,
      "loss": 0.1458,
      "step": 115
    },
    {
      "epoch": 2.64,
      "grad_norm": 1.8027392959029291,
      "learning_rate": 1e-05,
      "loss": 0.1351,
      "step": 116
    },
    {
      "epoch": 2.67,
      "grad_norm": 1.667565922975832,
      "learning_rate": 1e-05,
      "loss": 0.1455,
      "step": 117
    },
    {
      "epoch": 2.69,
      "grad_norm": 1.8116940749067432,
      "learning_rate": 1e-05,
      "loss": 0.1502,
      "step": 118
    },
    {
      "epoch": 2.71,
      "grad_norm": 1.9922161139058774,
      "learning_rate": 1e-05,
      "loss": 0.1422,
      "step": 119
    },
    {
      "epoch": 2.73,
      "grad_norm": 2.0124481055349044,
      "learning_rate": 1e-05,
      "loss": 0.1373,
      "step": 120
    },
    {
      "epoch": 2.76,
      "grad_norm": 1.800014793589776,
      "learning_rate": 1e-05,
      "loss": 0.1342,
      "step": 121
    },
    {
      "epoch": 2.78,
      "grad_norm": 1.7532349977825892,
      "learning_rate": 1e-05,
      "loss": 0.1262,
      "step": 122
    },
    {
      "epoch": 2.8,
      "grad_norm": 1.7857219830778754,
      "learning_rate": 1e-05,
      "loss": 0.1375,
      "step": 123
    },
    {
      "epoch": 2.83,
      "grad_norm": 1.7910722257498661,
      "learning_rate": 1e-05,
      "loss": 0.148,
      "step": 124
    },
    {
      "epoch": 2.85,
      "grad_norm": 1.6958921797341877,
      "learning_rate": 1e-05,
      "loss": 0.1287,
      "step": 125
    },
    {
      "epoch": 2.87,
      "grad_norm": 1.9268954167687131,
      "learning_rate": 1e-05,
      "loss": 0.1334,
      "step": 126
    },
    {
      "epoch": 2.89,
      "grad_norm": 1.7925408289852067,
      "learning_rate": 1e-05,
      "loss": 0.1445,
      "step": 127
    },
    {
      "epoch": 2.92,
      "grad_norm": 1.9380903694792881,
      "learning_rate": 1e-05,
      "loss": 0.1478,
      "step": 128
    },
    {
      "epoch": 2.94,
      "grad_norm": 1.8550832947077076,
      "learning_rate": 1e-05,
      "loss": 0.1185,
      "step": 129
    },
    {
      "epoch": 2.94,
      "step": 129,
      "total_flos": 56540883877888.0,
      "train_loss": 0.30087860679441647,
      "train_runtime": 2488.5787,
      "train_samples_per_second": 6.613,
      "train_steps_per_second": 0.052
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 129,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1.0,
  "total_flos": 56540883877888.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}