|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.982608695652174, |
|
"eval_steps": 87, |
|
"global_step": 690, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002898550724637681, |
|
"grad_norm": 0.44052618741989136, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.4473, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.002898550724637681, |
|
"eval_loss": 1.4117156267166138, |
|
"eval_runtime": 46.1446, |
|
"eval_samples_per_second": 5.548, |
|
"eval_steps_per_second": 0.693, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005797101449275362, |
|
"grad_norm": 0.4932183027267456, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.3923, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.008695652173913044, |
|
"grad_norm": 0.4844379723072052, |
|
"learning_rate": 3e-06, |
|
"loss": 1.4468, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.011594202898550725, |
|
"grad_norm": 0.5023930668830872, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.3773, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.014492753623188406, |
|
"grad_norm": 0.483876496553421, |
|
"learning_rate": 5e-06, |
|
"loss": 1.4103, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.017391304347826087, |
|
"grad_norm": 0.4460753798484802, |
|
"learning_rate": 6e-06, |
|
"loss": 1.4707, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.020289855072463767, |
|
"grad_norm": 0.4342319369316101, |
|
"learning_rate": 7e-06, |
|
"loss": 1.3563, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02318840579710145, |
|
"grad_norm": 0.479257196187973, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.414, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.02608695652173913, |
|
"grad_norm": 0.5028970241546631, |
|
"learning_rate": 9e-06, |
|
"loss": 1.4601, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.028985507246376812, |
|
"grad_norm": 0.49131453037261963, |
|
"learning_rate": 1e-05, |
|
"loss": 1.4364, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03188405797101449, |
|
"grad_norm": 0.5517832040786743, |
|
"learning_rate": 9.999946639344475e-06, |
|
"loss": 1.4873, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.034782608695652174, |
|
"grad_norm": 0.5310211181640625, |
|
"learning_rate": 9.99978655851684e-06, |
|
"loss": 1.4346, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03768115942028986, |
|
"grad_norm": 0.4639141857624054, |
|
"learning_rate": 9.999519760933905e-06, |
|
"loss": 1.4402, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04057971014492753, |
|
"grad_norm": 0.47811073064804077, |
|
"learning_rate": 9.999146252290264e-06, |
|
"loss": 1.4106, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.043478260869565216, |
|
"grad_norm": 0.5223386883735657, |
|
"learning_rate": 9.998666040558187e-06, |
|
"loss": 1.3732, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0463768115942029, |
|
"grad_norm": 0.5601791143417358, |
|
"learning_rate": 9.998079135987437e-06, |
|
"loss": 1.4166, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04927536231884058, |
|
"grad_norm": 0.5459745526313782, |
|
"learning_rate": 9.997385551105061e-06, |
|
"loss": 1.4501, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.05217391304347826, |
|
"grad_norm": 0.6155043244361877, |
|
"learning_rate": 9.996585300715117e-06, |
|
"loss": 1.3987, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05507246376811594, |
|
"grad_norm": 0.539135754108429, |
|
"learning_rate": 9.995678401898354e-06, |
|
"loss": 1.3943, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.057971014492753624, |
|
"grad_norm": 0.5232663154602051, |
|
"learning_rate": 9.994664874011864e-06, |
|
"loss": 1.3742, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06086956521739131, |
|
"grad_norm": 0.4995758533477783, |
|
"learning_rate": 9.993544738688647e-06, |
|
"loss": 1.3969, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.06376811594202898, |
|
"grad_norm": 0.5397970080375671, |
|
"learning_rate": 9.992318019837171e-06, |
|
"loss": 1.3238, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06666666666666667, |
|
"grad_norm": 0.5533668994903564, |
|
"learning_rate": 9.990984743640839e-06, |
|
"loss": 1.3717, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06956521739130435, |
|
"grad_norm": 0.5304050445556641, |
|
"learning_rate": 9.989544938557453e-06, |
|
"loss": 1.3565, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.07246376811594203, |
|
"grad_norm": 0.5658550262451172, |
|
"learning_rate": 9.987998635318586e-06, |
|
"loss": 1.3075, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.07536231884057971, |
|
"grad_norm": 0.5798805952072144, |
|
"learning_rate": 9.98634586692894e-06, |
|
"loss": 1.4202, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0782608695652174, |
|
"grad_norm": 0.49352607131004333, |
|
"learning_rate": 9.984586668665641e-06, |
|
"loss": 1.3172, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.08115942028985507, |
|
"grad_norm": 0.576454222202301, |
|
"learning_rate": 9.982721078077474e-06, |
|
"loss": 1.3633, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.08405797101449275, |
|
"grad_norm": 0.5843266248703003, |
|
"learning_rate": 9.980749134984094e-06, |
|
"loss": 1.3031, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.08695652173913043, |
|
"grad_norm": 0.5863199234008789, |
|
"learning_rate": 9.978670881475173e-06, |
|
"loss": 1.3228, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08985507246376812, |
|
"grad_norm": 0.6071418523788452, |
|
"learning_rate": 9.9764863619095e-06, |
|
"loss": 1.3277, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0927536231884058, |
|
"grad_norm": 0.5361754298210144, |
|
"learning_rate": 9.97419562291403e-06, |
|
"loss": 1.3189, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.09565217391304348, |
|
"grad_norm": 0.6043053865432739, |
|
"learning_rate": 9.971798713382896e-06, |
|
"loss": 1.2567, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.09855072463768116, |
|
"grad_norm": 0.4795907139778137, |
|
"learning_rate": 9.96929568447637e-06, |
|
"loss": 1.33, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.10144927536231885, |
|
"grad_norm": 0.5752019882202148, |
|
"learning_rate": 9.96668658961975e-06, |
|
"loss": 1.1915, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.10434782608695652, |
|
"grad_norm": 0.47888195514678955, |
|
"learning_rate": 9.963971484502247e-06, |
|
"loss": 1.2753, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.1072463768115942, |
|
"grad_norm": 0.5371452569961548, |
|
"learning_rate": 9.96115042707577e-06, |
|
"loss": 1.2659, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.11014492753623188, |
|
"grad_norm": 0.6198606491088867, |
|
"learning_rate": 9.958223477553715e-06, |
|
"loss": 1.2166, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.11304347826086956, |
|
"grad_norm": 0.4718591272830963, |
|
"learning_rate": 9.955190698409656e-06, |
|
"loss": 1.2708, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.11594202898550725, |
|
"grad_norm": 0.5691114068031311, |
|
"learning_rate": 9.952052154376027e-06, |
|
"loss": 1.2074, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11884057971014493, |
|
"grad_norm": 0.515771210193634, |
|
"learning_rate": 9.948807912442735e-06, |
|
"loss": 1.1958, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.12173913043478261, |
|
"grad_norm": 0.6830301880836487, |
|
"learning_rate": 9.945458041855732e-06, |
|
"loss": 1.2992, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1246376811594203, |
|
"grad_norm": 0.5583641529083252, |
|
"learning_rate": 9.94200261411553e-06, |
|
"loss": 1.2654, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.12753623188405797, |
|
"grad_norm": 0.5985351800918579, |
|
"learning_rate": 9.938441702975689e-06, |
|
"loss": 1.2064, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.13043478260869565, |
|
"grad_norm": 0.5092725157737732, |
|
"learning_rate": 9.93477538444123e-06, |
|
"loss": 1.1477, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 0.5719948410987854, |
|
"learning_rate": 9.931003736767013e-06, |
|
"loss": 1.3045, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.13623188405797101, |
|
"grad_norm": 0.5000984072685242, |
|
"learning_rate": 9.92712684045608e-06, |
|
"loss": 1.2954, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1391304347826087, |
|
"grad_norm": 0.6268609762191772, |
|
"learning_rate": 9.923144778257918e-06, |
|
"loss": 1.2742, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.14202898550724638, |
|
"grad_norm": 0.5395749807357788, |
|
"learning_rate": 9.91905763516671e-06, |
|
"loss": 1.1651, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.14492753623188406, |
|
"grad_norm": 0.6797102689743042, |
|
"learning_rate": 9.91486549841951e-06, |
|
"loss": 1.2083, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.14782608695652175, |
|
"grad_norm": 0.554821252822876, |
|
"learning_rate": 9.91056845749438e-06, |
|
"loss": 1.1623, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.15072463768115943, |
|
"grad_norm": 0.6033896803855896, |
|
"learning_rate": 9.906166604108494e-06, |
|
"loss": 1.2135, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1536231884057971, |
|
"grad_norm": 0.568701446056366, |
|
"learning_rate": 9.901660032216159e-06, |
|
"loss": 1.1956, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.1565217391304348, |
|
"grad_norm": 0.6862343549728394, |
|
"learning_rate": 9.89704883800683e-06, |
|
"loss": 1.1992, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.15942028985507245, |
|
"grad_norm": 0.49399352073669434, |
|
"learning_rate": 9.892333119903045e-06, |
|
"loss": 1.1711, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.16231884057971013, |
|
"grad_norm": 0.5683416724205017, |
|
"learning_rate": 9.887512978558329e-06, |
|
"loss": 1.2608, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.16521739130434782, |
|
"grad_norm": 0.4855175018310547, |
|
"learning_rate": 9.88258851685504e-06, |
|
"loss": 1.1652, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.1681159420289855, |
|
"grad_norm": 0.5765471458435059, |
|
"learning_rate": 9.877559839902185e-06, |
|
"loss": 1.2653, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.17101449275362318, |
|
"grad_norm": 0.5921582579612732, |
|
"learning_rate": 9.872427055033156e-06, |
|
"loss": 1.1191, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 0.5046260356903076, |
|
"learning_rate": 9.867190271803466e-06, |
|
"loss": 1.1824, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.17681159420289855, |
|
"grad_norm": 0.5180432796478271, |
|
"learning_rate": 9.861849601988384e-06, |
|
"loss": 1.1736, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.17971014492753623, |
|
"grad_norm": 0.65400230884552, |
|
"learning_rate": 9.85640515958057e-06, |
|
"loss": 1.1129, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1826086956521739, |
|
"grad_norm": 0.5726003646850586, |
|
"learning_rate": 9.85085706078763e-06, |
|
"loss": 1.1567, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.1855072463768116, |
|
"grad_norm": 0.5297178030014038, |
|
"learning_rate": 9.845205424029639e-06, |
|
"loss": 1.101, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.18840579710144928, |
|
"grad_norm": 0.5242377519607544, |
|
"learning_rate": 9.839450369936615e-06, |
|
"loss": 1.174, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.19130434782608696, |
|
"grad_norm": 0.5277882218360901, |
|
"learning_rate": 9.833592021345938e-06, |
|
"loss": 1.1772, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.19420289855072465, |
|
"grad_norm": 0.5334244966506958, |
|
"learning_rate": 9.827630503299741e-06, |
|
"loss": 1.1722, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.19710144927536233, |
|
"grad_norm": 0.6054286360740662, |
|
"learning_rate": 9.821565943042225e-06, |
|
"loss": 1.2022, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5691675543785095, |
|
"learning_rate": 9.815398470016957e-06, |
|
"loss": 1.1256, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.2028985507246377, |
|
"grad_norm": 0.4579974114894867, |
|
"learning_rate": 9.809128215864096e-06, |
|
"loss": 1.1548, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.20579710144927535, |
|
"grad_norm": 0.605627715587616, |
|
"learning_rate": 9.802755314417592e-06, |
|
"loss": 1.0972, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.20869565217391303, |
|
"grad_norm": 0.5655208826065063, |
|
"learning_rate": 9.796279901702326e-06, |
|
"loss": 1.0902, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.21159420289855072, |
|
"grad_norm": 0.570743978023529, |
|
"learning_rate": 9.789702115931202e-06, |
|
"loss": 1.0654, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.2144927536231884, |
|
"grad_norm": 0.7513704895973206, |
|
"learning_rate": 9.783022097502204e-06, |
|
"loss": 1.1348, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.21739130434782608, |
|
"grad_norm": 0.592363715171814, |
|
"learning_rate": 9.776239988995401e-06, |
|
"loss": 1.1733, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.22028985507246376, |
|
"grad_norm": 0.5394357442855835, |
|
"learning_rate": 9.76935593516989e-06, |
|
"loss": 1.1313, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.22318840579710145, |
|
"grad_norm": 0.598983108997345, |
|
"learning_rate": 9.762370082960727e-06, |
|
"loss": 1.1077, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.22608695652173913, |
|
"grad_norm": 0.5635719895362854, |
|
"learning_rate": 9.755282581475769e-06, |
|
"loss": 1.0393, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.2289855072463768, |
|
"grad_norm": 0.5638449788093567, |
|
"learning_rate": 9.748093581992506e-06, |
|
"loss": 1.1126, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.2318840579710145, |
|
"grad_norm": 0.5267054438591003, |
|
"learning_rate": 9.74080323795483e-06, |
|
"loss": 1.108, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.23478260869565218, |
|
"grad_norm": 0.69565749168396, |
|
"learning_rate": 9.733411704969754e-06, |
|
"loss": 1.1065, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.23768115942028986, |
|
"grad_norm": 0.5769387483596802, |
|
"learning_rate": 9.7259191408041e-06, |
|
"loss": 1.0892, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.24057971014492754, |
|
"grad_norm": 0.4646681845188141, |
|
"learning_rate": 9.718325705381115e-06, |
|
"loss": 1.0984, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.24347826086956523, |
|
"grad_norm": 0.5441101789474487, |
|
"learning_rate": 9.710631560777082e-06, |
|
"loss": 1.134, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2463768115942029, |
|
"grad_norm": 0.6711792349815369, |
|
"learning_rate": 9.702836871217838e-06, |
|
"loss": 1.118, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2492753623188406, |
|
"grad_norm": 0.6086435914039612, |
|
"learning_rate": 9.694941803075285e-06, |
|
"loss": 1.1332, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.25217391304347825, |
|
"grad_norm": 0.6047069430351257, |
|
"learning_rate": 9.686946524863821e-06, |
|
"loss": 1.0948, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.25217391304347825, |
|
"eval_loss": 1.093648910522461, |
|
"eval_runtime": 46.2827, |
|
"eval_samples_per_second": 5.531, |
|
"eval_steps_per_second": 0.691, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.25507246376811593, |
|
"grad_norm": 0.5494099259376526, |
|
"learning_rate": 9.678851207236764e-06, |
|
"loss": 1.0677, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2579710144927536, |
|
"grad_norm": 0.6029177308082581, |
|
"learning_rate": 9.670656022982696e-06, |
|
"loss": 1.1122, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.2608695652173913, |
|
"grad_norm": 0.6882422566413879, |
|
"learning_rate": 9.66236114702178e-06, |
|
"loss": 1.131, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.263768115942029, |
|
"grad_norm": 0.5858222246170044, |
|
"learning_rate": 9.65396675640202e-06, |
|
"loss": 1.0904, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.6096974611282349, |
|
"learning_rate": 9.645473030295496e-06, |
|
"loss": 1.1001, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.26956521739130435, |
|
"grad_norm": 0.5705183148384094, |
|
"learning_rate": 9.636880149994518e-06, |
|
"loss": 1.1159, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.27246376811594203, |
|
"grad_norm": 0.5896604061126709, |
|
"learning_rate": 9.628188298907782e-06, |
|
"loss": 1.0236, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2753623188405797, |
|
"grad_norm": 0.6060263514518738, |
|
"learning_rate": 9.619397662556434e-06, |
|
"loss": 1.0991, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2782608695652174, |
|
"grad_norm": 0.6302357316017151, |
|
"learning_rate": 9.610508428570122e-06, |
|
"loss": 1.073, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2811594202898551, |
|
"grad_norm": 0.6086059212684631, |
|
"learning_rate": 9.601520786682989e-06, |
|
"loss": 1.1556, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.28405797101449276, |
|
"grad_norm": 0.5601389408111572, |
|
"learning_rate": 9.592434928729617e-06, |
|
"loss": 1.0691, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.28695652173913044, |
|
"grad_norm": 0.6236623525619507, |
|
"learning_rate": 9.583251048640941e-06, |
|
"loss": 1.0293, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.2898550724637681, |
|
"grad_norm": 0.661264181137085, |
|
"learning_rate": 9.573969342440107e-06, |
|
"loss": 1.0597, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2927536231884058, |
|
"grad_norm": 0.5187559127807617, |
|
"learning_rate": 9.564590008238284e-06, |
|
"loss": 1.0152, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.2956521739130435, |
|
"grad_norm": 0.7033849358558655, |
|
"learning_rate": 9.555113246230443e-06, |
|
"loss": 1.0583, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2985507246376812, |
|
"grad_norm": 0.6243430376052856, |
|
"learning_rate": 9.545539258691076e-06, |
|
"loss": 1.0415, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.30144927536231886, |
|
"grad_norm": 0.7448285222053528, |
|
"learning_rate": 9.535868249969882e-06, |
|
"loss": 1.1665, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.30434782608695654, |
|
"grad_norm": 0.7407688498497009, |
|
"learning_rate": 9.52610042648741e-06, |
|
"loss": 1.0805, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3072463768115942, |
|
"grad_norm": 0.6399569511413574, |
|
"learning_rate": 9.516235996730645e-06, |
|
"loss": 1.0622, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.3101449275362319, |
|
"grad_norm": 0.6391183733940125, |
|
"learning_rate": 9.50627517124856e-06, |
|
"loss": 1.0988, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.3130434782608696, |
|
"grad_norm": 0.6799684166908264, |
|
"learning_rate": 9.496218162647629e-06, |
|
"loss": 1.0667, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.3159420289855073, |
|
"grad_norm": 0.6955932378768921, |
|
"learning_rate": 9.486065185587278e-06, |
|
"loss": 1.0475, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.3188405797101449, |
|
"grad_norm": 0.6768685579299927, |
|
"learning_rate": 9.475816456775313e-06, |
|
"loss": 1.0906, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3217391304347826, |
|
"grad_norm": 0.6448860168457031, |
|
"learning_rate": 9.465472194963287e-06, |
|
"loss": 1.0725, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.32463768115942027, |
|
"grad_norm": 0.654137909412384, |
|
"learning_rate": 9.45503262094184e-06, |
|
"loss": 1.0477, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.32753623188405795, |
|
"grad_norm": 0.5668336749076843, |
|
"learning_rate": 9.444497957535975e-06, |
|
"loss": 1.0419, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.33043478260869563, |
|
"grad_norm": 0.8345162868499756, |
|
"learning_rate": 9.43386842960031e-06, |
|
"loss": 1.1125, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.5995410084724426, |
|
"learning_rate": 9.423144264014278e-06, |
|
"loss": 1.048, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.336231884057971, |
|
"grad_norm": 0.6526032090187073, |
|
"learning_rate": 9.41232568967728e-06, |
|
"loss": 1.0868, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.3391304347826087, |
|
"grad_norm": 0.7131723165512085, |
|
"learning_rate": 9.401412937503802e-06, |
|
"loss": 1.0154, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.34202898550724636, |
|
"grad_norm": 0.7425084114074707, |
|
"learning_rate": 9.39040624041849e-06, |
|
"loss": 1.1046, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.34492753623188405, |
|
"grad_norm": 0.6741538643836975, |
|
"learning_rate": 9.379305833351174e-06, |
|
"loss": 1.0884, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": 0.6611533164978027, |
|
"learning_rate": 9.368111953231849e-06, |
|
"loss": 1.1291, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3507246376811594, |
|
"grad_norm": 0.6605979204177856, |
|
"learning_rate": 9.35682483898563e-06, |
|
"loss": 1.0354, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.3536231884057971, |
|
"grad_norm": 0.7649601101875305, |
|
"learning_rate": 9.345444731527642e-06, |
|
"loss": 1.0705, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3565217391304348, |
|
"grad_norm": 0.6104558110237122, |
|
"learning_rate": 9.333971873757885e-06, |
|
"loss": 1.0221, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.35942028985507246, |
|
"grad_norm": 0.5945985913276672, |
|
"learning_rate": 9.32240651055604e-06, |
|
"loss": 1.0352, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.36231884057971014, |
|
"grad_norm": 0.7351408004760742, |
|
"learning_rate": 9.310748888776254e-06, |
|
"loss": 1.0283, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3652173913043478, |
|
"grad_norm": 0.6751654148101807, |
|
"learning_rate": 9.298999257241862e-06, |
|
"loss": 1.1355, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3681159420289855, |
|
"grad_norm": 0.6744984984397888, |
|
"learning_rate": 9.287157866740082e-06, |
|
"loss": 1.097, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3710144927536232, |
|
"grad_norm": 0.6096031665802002, |
|
"learning_rate": 9.275224970016656e-06, |
|
"loss": 0.9879, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3739130434782609, |
|
"grad_norm": 0.6282311081886292, |
|
"learning_rate": 9.263200821770462e-06, |
|
"loss": 1.0088, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.37681159420289856, |
|
"grad_norm": 0.6340439319610596, |
|
"learning_rate": 9.251085678648072e-06, |
|
"loss": 1.0314, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.37971014492753624, |
|
"grad_norm": 0.6008773446083069, |
|
"learning_rate": 9.238879799238278e-06, |
|
"loss": 1.0304, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.3826086956521739, |
|
"grad_norm": 0.83261638879776, |
|
"learning_rate": 9.22658344406657e-06, |
|
"loss": 1.0767, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3855072463768116, |
|
"grad_norm": 0.6942703127861023, |
|
"learning_rate": 9.214196875589577e-06, |
|
"loss": 1.0238, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.3884057971014493, |
|
"grad_norm": 0.6649532914161682, |
|
"learning_rate": 9.201720358189464e-06, |
|
"loss": 1.0353, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.391304347826087, |
|
"grad_norm": 0.6827482581138611, |
|
"learning_rate": 9.189154158168293e-06, |
|
"loss": 1.0123, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.39420289855072466, |
|
"grad_norm": 0.8225923776626587, |
|
"learning_rate": 9.176498543742328e-06, |
|
"loss": 1.0894, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.39710144927536234, |
|
"grad_norm": 0.7622413635253906, |
|
"learning_rate": 9.163753785036324e-06, |
|
"loss": 1.0987, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.729880690574646, |
|
"learning_rate": 9.150920154077753e-06, |
|
"loss": 1.0686, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.4028985507246377, |
|
"grad_norm": 0.5569338798522949, |
|
"learning_rate": 9.137997924791e-06, |
|
"loss": 1.0554, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.4057971014492754, |
|
"grad_norm": 0.7127766013145447, |
|
"learning_rate": 9.124987372991512e-06, |
|
"loss": 1.0878, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.40869565217391307, |
|
"grad_norm": 0.6865119338035583, |
|
"learning_rate": 9.11188877637992e-06, |
|
"loss": 1.078, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.4115942028985507, |
|
"grad_norm": 0.7496594786643982, |
|
"learning_rate": 9.098702414536107e-06, |
|
"loss": 1.1678, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.4144927536231884, |
|
"grad_norm": 0.7547608017921448, |
|
"learning_rate": 9.085428568913233e-06, |
|
"loss": 1.0282, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.41739130434782606, |
|
"grad_norm": 0.6696781516075134, |
|
"learning_rate": 9.072067522831743e-06, |
|
"loss": 1.0529, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.42028985507246375, |
|
"grad_norm": 0.6223747134208679, |
|
"learning_rate": 9.058619561473308e-06, |
|
"loss": 1.0101, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.42318840579710143, |
|
"grad_norm": 0.6682969331741333, |
|
"learning_rate": 9.045084971874738e-06, |
|
"loss": 1.0669, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.4260869565217391, |
|
"grad_norm": 0.702489972114563, |
|
"learning_rate": 9.031464042921866e-06, |
|
"loss": 1.0696, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.4289855072463768, |
|
"grad_norm": 0.6877920031547546, |
|
"learning_rate": 9.017757065343368e-06, |
|
"loss": 1.0181, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.4318840579710145, |
|
"grad_norm": 0.7262343168258667, |
|
"learning_rate": 9.003964331704574e-06, |
|
"loss": 1.0869, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 0.6435033082962036, |
|
"learning_rate": 8.990086136401199e-06, |
|
"loss": 1.0943, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.43768115942028984, |
|
"grad_norm": 0.8294116854667664, |
|
"learning_rate": 8.976122775653087e-06, |
|
"loss": 1.0053, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.4405797101449275, |
|
"grad_norm": 0.7582129240036011, |
|
"learning_rate": 8.96207454749787e-06, |
|
"loss": 1.0255, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.4434782608695652, |
|
"grad_norm": 0.7421862483024597, |
|
"learning_rate": 8.947941751784614e-06, |
|
"loss": 0.995, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.4463768115942029, |
|
"grad_norm": 0.6562067866325378, |
|
"learning_rate": 8.933724690167417e-06, |
|
"loss": 1.0051, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.4492753623188406, |
|
"grad_norm": 0.7008780241012573, |
|
"learning_rate": 8.91942366609897e-06, |
|
"loss": 1.0224, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.45217391304347826, |
|
"grad_norm": 0.8320948481559753, |
|
"learning_rate": 8.905038984824079e-06, |
|
"loss": 1.0867, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.45507246376811594, |
|
"grad_norm": 0.7078688740730286, |
|
"learning_rate": 8.890570953373152e-06, |
|
"loss": 1.0233, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.4579710144927536, |
|
"grad_norm": 0.602080225944519, |
|
"learning_rate": 8.87601988055565e-06, |
|
"loss": 1.033, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4608695652173913, |
|
"grad_norm": 0.6947946548461914, |
|
"learning_rate": 8.861386076953485e-06, |
|
"loss": 1.0056, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.463768115942029, |
|
"grad_norm": 0.7520703673362732, |
|
"learning_rate": 8.846669854914395e-06, |
|
"loss": 1.0129, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4666666666666667, |
|
"grad_norm": 0.8198053240776062, |
|
"learning_rate": 8.831871528545286e-06, |
|
"loss": 1.0554, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.46956521739130436, |
|
"grad_norm": 0.8595309257507324, |
|
"learning_rate": 8.816991413705515e-06, |
|
"loss": 0.9769, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.47246376811594204, |
|
"grad_norm": 0.7658084034919739, |
|
"learning_rate": 8.802029828000157e-06, |
|
"loss": 1.0942, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.4753623188405797, |
|
"grad_norm": 0.779561460018158, |
|
"learning_rate": 8.786987090773214e-06, |
|
"loss": 1.0526, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4782608695652174, |
|
"grad_norm": 0.7491458654403687, |
|
"learning_rate": 8.771863523100821e-06, |
|
"loss": 1.076, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.4811594202898551, |
|
"grad_norm": 0.7698597311973572, |
|
"learning_rate": 8.756659447784367e-06, |
|
"loss": 1.0513, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.48405797101449277, |
|
"grad_norm": 0.7076740860939026, |
|
"learning_rate": 8.741375189343625e-06, |
|
"loss": 0.952, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.48695652173913045, |
|
"grad_norm": 0.8549159169197083, |
|
"learning_rate": 8.726011074009813e-06, |
|
"loss": 1.0062, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.48985507246376814, |
|
"grad_norm": 0.7257103323936462, |
|
"learning_rate": 8.71056742971864e-06, |
|
"loss": 1.0124, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4927536231884058, |
|
"grad_norm": 0.6643837094306946, |
|
"learning_rate": 8.695044586103297e-06, |
|
"loss": 1.0646, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4956521739130435, |
|
"grad_norm": 0.6454336643218994, |
|
"learning_rate": 8.679442874487427e-06, |
|
"loss": 1.0482, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.4985507246376812, |
|
"grad_norm": 0.6484606266021729, |
|
"learning_rate": 8.663762627878059e-06, |
|
"loss": 1.0361, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.5014492753623189, |
|
"grad_norm": 0.8437646627426147, |
|
"learning_rate": 8.64800418095848e-06, |
|
"loss": 1.1064, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.5043478260869565, |
|
"grad_norm": 0.8865697979927063, |
|
"learning_rate": 8.632167870081122e-06, |
|
"loss": 1.0187, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.5043478260869565, |
|
"eval_loss": 1.0253716707229614, |
|
"eval_runtime": 46.4716, |
|
"eval_samples_per_second": 5.509, |
|
"eval_steps_per_second": 0.689, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.5072463768115942, |
|
"grad_norm": 0.6522702574729919, |
|
"learning_rate": 8.616254033260351e-06, |
|
"loss": 1.0466, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5101449275362319, |
|
"grad_norm": 0.7485548257827759, |
|
"learning_rate": 8.600263010165275e-06, |
|
"loss": 1.051, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.5130434782608696, |
|
"grad_norm": 0.7864269614219666, |
|
"learning_rate": 8.584195142112482e-06, |
|
"loss": 0.9823, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.5159420289855072, |
|
"grad_norm": 0.669228732585907, |
|
"learning_rate": 8.568050772058763e-06, |
|
"loss": 0.9959, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.518840579710145, |
|
"grad_norm": 0.7351509928703308, |
|
"learning_rate": 8.551830244593785e-06, |
|
"loss": 1.0523, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.5217391304347826, |
|
"grad_norm": 0.6464654207229614, |
|
"learning_rate": 8.535533905932739e-06, |
|
"loss": 1.0576, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5246376811594203, |
|
"grad_norm": 0.6708983182907104, |
|
"learning_rate": 8.519162103908951e-06, |
|
"loss": 1.0036, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.527536231884058, |
|
"grad_norm": 0.6712408661842346, |
|
"learning_rate": 8.502715187966455e-06, |
|
"loss": 0.9567, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.5304347826086957, |
|
"grad_norm": 0.8165604472160339, |
|
"learning_rate": 8.48619350915254e-06, |
|
"loss": 1.0074, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.8015124797821045, |
|
"learning_rate": 8.469597420110249e-06, |
|
"loss": 1.04, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.5362318840579711, |
|
"grad_norm": 0.6764898896217346, |
|
"learning_rate": 8.452927275070858e-06, |
|
"loss": 1.0259, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5391304347826087, |
|
"grad_norm": 0.7508796453475952, |
|
"learning_rate": 8.436183429846314e-06, |
|
"loss": 1.0153, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.5420289855072464, |
|
"grad_norm": 0.7400704026222229, |
|
"learning_rate": 8.41936624182164e-06, |
|
"loss": 1.0302, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.5449275362318841, |
|
"grad_norm": 0.7747941017150879, |
|
"learning_rate": 8.402476069947309e-06, |
|
"loss": 1.0516, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.5478260869565217, |
|
"grad_norm": 0.6391712427139282, |
|
"learning_rate": 8.385513274731574e-06, |
|
"loss": 0.9144, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.5507246376811594, |
|
"grad_norm": 0.7723587155342102, |
|
"learning_rate": 8.368478218232787e-06, |
|
"loss": 1.038, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.553623188405797, |
|
"grad_norm": 0.6703996062278748, |
|
"learning_rate": 8.351371264051659e-06, |
|
"loss": 0.9767, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.5565217391304348, |
|
"grad_norm": 0.6496030688285828, |
|
"learning_rate": 8.334192777323508e-06, |
|
"loss": 1.0139, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.5594202898550724, |
|
"grad_norm": 0.9179766178131104, |
|
"learning_rate": 8.316943124710457e-06, |
|
"loss": 1.0217, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.5623188405797102, |
|
"grad_norm": 0.739105761051178, |
|
"learning_rate": 8.299622674393615e-06, |
|
"loss": 1.0097, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.5652173913043478, |
|
"grad_norm": 0.6799715757369995, |
|
"learning_rate": 8.282231796065215e-06, |
|
"loss": 0.9814, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5681159420289855, |
|
"grad_norm": 0.7482266426086426, |
|
"learning_rate": 8.264770860920722e-06, |
|
"loss": 0.9651, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.5710144927536231, |
|
"grad_norm": 0.7226840853691101, |
|
"learning_rate": 8.247240241650918e-06, |
|
"loss": 1.0257, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.5739130434782609, |
|
"grad_norm": 0.8682334423065186, |
|
"learning_rate": 8.229640312433938e-06, |
|
"loss": 0.9359, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.5768115942028985, |
|
"grad_norm": 0.7574880123138428, |
|
"learning_rate": 8.21197144892728e-06, |
|
"loss": 1.0316, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5797101449275363, |
|
"grad_norm": 0.6719037890434265, |
|
"learning_rate": 8.194234028259806e-06, |
|
"loss": 0.9718, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5826086956521739, |
|
"grad_norm": 0.7872765064239502, |
|
"learning_rate": 8.176428429023674e-06, |
|
"loss": 1.0055, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.5855072463768116, |
|
"grad_norm": 0.8982404470443726, |
|
"learning_rate": 8.158555031266255e-06, |
|
"loss": 1.0763, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5884057971014492, |
|
"grad_norm": 0.7265183925628662, |
|
"learning_rate": 8.140614216482046e-06, |
|
"loss": 0.9921, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.591304347826087, |
|
"grad_norm": 0.7971622943878174, |
|
"learning_rate": 8.122606367604497e-06, |
|
"loss": 0.9986, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5942028985507246, |
|
"grad_norm": 0.689160943031311, |
|
"learning_rate": 8.104531868997858e-06, |
|
"loss": 0.9896, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5971014492753624, |
|
"grad_norm": 0.8191243410110474, |
|
"learning_rate": 8.086391106448965e-06, |
|
"loss": 1.0141, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.860882043838501, |
|
"learning_rate": 8.068184467159014e-06, |
|
"loss": 0.9608, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.6028985507246377, |
|
"grad_norm": 0.7216934561729431, |
|
"learning_rate": 8.049912339735284e-06, |
|
"loss": 0.9898, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.6057971014492753, |
|
"grad_norm": 0.685965359210968, |
|
"learning_rate": 8.031575114182856e-06, |
|
"loss": 0.9532, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.6086956521739131, |
|
"grad_norm": 0.6752814054489136, |
|
"learning_rate": 8.013173181896283e-06, |
|
"loss": 1.0043, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6115942028985507, |
|
"grad_norm": 0.815260112285614, |
|
"learning_rate": 7.994706935651228e-06, |
|
"loss": 1.0049, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.6144927536231884, |
|
"grad_norm": 0.729771077632904, |
|
"learning_rate": 7.976176769596095e-06, |
|
"loss": 1.0003, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.6173913043478261, |
|
"grad_norm": 0.6407178044319153, |
|
"learning_rate": 7.957583079243607e-06, |
|
"loss": 1.0197, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.6202898550724638, |
|
"grad_norm": 0.6758530735969543, |
|
"learning_rate": 7.938926261462366e-06, |
|
"loss": 1.0632, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.6231884057971014, |
|
"grad_norm": 0.7678017616271973, |
|
"learning_rate": 7.920206714468383e-06, |
|
"loss": 1.004, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.6260869565217392, |
|
"grad_norm": 0.6864491105079651, |
|
"learning_rate": 7.90142483781658e-06, |
|
"loss": 0.9798, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.6289855072463768, |
|
"grad_norm": 0.7141516804695129, |
|
"learning_rate": 7.882581032392252e-06, |
|
"loss": 0.9969, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.6318840579710145, |
|
"grad_norm": 0.7497020363807678, |
|
"learning_rate": 7.863675700402527e-06, |
|
"loss": 0.9951, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.6347826086956522, |
|
"grad_norm": 0.7010701894760132, |
|
"learning_rate": 7.844709245367766e-06, |
|
"loss": 1.0164, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.6376811594202898, |
|
"grad_norm": 0.8556409478187561, |
|
"learning_rate": 7.82568207211296e-06, |
|
"loss": 1.0079, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6405797101449275, |
|
"grad_norm": 0.8755605816841125, |
|
"learning_rate": 7.806594586759083e-06, |
|
"loss": 1.0401, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.6434782608695652, |
|
"grad_norm": 0.7478286623954773, |
|
"learning_rate": 7.787447196714428e-06, |
|
"loss": 0.9966, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.6463768115942029, |
|
"grad_norm": 0.6972207427024841, |
|
"learning_rate": 7.768240310665909e-06, |
|
"loss": 1.0277, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.6492753623188405, |
|
"grad_norm": 0.7753648161888123, |
|
"learning_rate": 7.748974338570337e-06, |
|
"loss": 1.0531, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.6521739130434783, |
|
"grad_norm": 0.8420187830924988, |
|
"learning_rate": 7.729649691645673e-06, |
|
"loss": 1.0101, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6550724637681159, |
|
"grad_norm": 0.7467186450958252, |
|
"learning_rate": 7.710266782362248e-06, |
|
"loss": 1.086, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.6579710144927536, |
|
"grad_norm": 0.679282009601593, |
|
"learning_rate": 7.69082602443396e-06, |
|
"loss": 1.0756, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.6608695652173913, |
|
"grad_norm": 0.8682421445846558, |
|
"learning_rate": 7.671327832809442e-06, |
|
"loss": 1.0337, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.663768115942029, |
|
"grad_norm": 0.9190111756324768, |
|
"learning_rate": 7.651772623663212e-06, |
|
"loss": 1.0412, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.7419721484184265, |
|
"learning_rate": 7.63216081438678e-06, |
|
"loss": 0.9895, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6695652173913044, |
|
"grad_norm": 0.7735477685928345, |
|
"learning_rate": 7.612492823579744e-06, |
|
"loss": 1.0109, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.672463768115942, |
|
"grad_norm": 0.6718391180038452, |
|
"learning_rate": 7.5927690710408606e-06, |
|
"loss": 1.0699, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6753623188405797, |
|
"grad_norm": 0.8104904890060425, |
|
"learning_rate": 7.572989977759073e-06, |
|
"loss": 0.9957, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.6782608695652174, |
|
"grad_norm": 0.8718286752700806, |
|
"learning_rate": 7.553155965904535e-06, |
|
"loss": 0.9674, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.6811594202898551, |
|
"grad_norm": 0.727627158164978, |
|
"learning_rate": 7.533267458819597e-06, |
|
"loss": 1.0256, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6840579710144927, |
|
"grad_norm": 0.6747854948043823, |
|
"learning_rate": 7.513324881009769e-06, |
|
"loss": 0.9956, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.6869565217391305, |
|
"grad_norm": 0.8896199464797974, |
|
"learning_rate": 7.49332865813466e-06, |
|
"loss": 1.052, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.6898550724637681, |
|
"grad_norm": 0.8011343479156494, |
|
"learning_rate": 7.473279216998896e-06, |
|
"loss": 0.9809, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6927536231884058, |
|
"grad_norm": 0.7936311960220337, |
|
"learning_rate": 7.453176985543002e-06, |
|
"loss": 0.9491, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.6956521739130435, |
|
"grad_norm": 0.783686101436615, |
|
"learning_rate": 7.4330223928342814e-06, |
|
"loss": 1.0627, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6985507246376812, |
|
"grad_norm": 0.6777355670928955, |
|
"learning_rate": 7.412815869057644e-06, |
|
"loss": 0.9836, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.7014492753623188, |
|
"grad_norm": 0.8609856367111206, |
|
"learning_rate": 7.392557845506433e-06, |
|
"loss": 1.0383, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.7043478260869566, |
|
"grad_norm": 0.7346140146255493, |
|
"learning_rate": 7.372248754573213e-06, |
|
"loss": 1.0237, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.7072463768115942, |
|
"grad_norm": 0.8134037852287292, |
|
"learning_rate": 7.351889029740548e-06, |
|
"loss": 1.0051, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.7101449275362319, |
|
"grad_norm": 0.7623313069343567, |
|
"learning_rate": 7.33147910557174e-06, |
|
"loss": 0.966, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.7130434782608696, |
|
"grad_norm": 0.8289423584938049, |
|
"learning_rate": 7.311019417701567e-06, |
|
"loss": 1.0162, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.7159420289855073, |
|
"grad_norm": 0.6778679490089417, |
|
"learning_rate": 7.290510402826967e-06, |
|
"loss": 1.042, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.7188405797101449, |
|
"grad_norm": 0.7705609798431396, |
|
"learning_rate": 7.269952498697734e-06, |
|
"loss": 0.9979, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.7217391304347827, |
|
"grad_norm": 0.8417146801948547, |
|
"learning_rate": 7.249346144107165e-06, |
|
"loss": 0.9937, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.7246376811594203, |
|
"grad_norm": 0.6634312868118286, |
|
"learning_rate": 7.2286917788826926e-06, |
|
"loss": 1.0299, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7275362318840579, |
|
"grad_norm": 0.7162610292434692, |
|
"learning_rate": 7.207989843876505e-06, |
|
"loss": 0.9627, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.7304347826086957, |
|
"grad_norm": 0.886674165725708, |
|
"learning_rate": 7.187240780956133e-06, |
|
"loss": 0.9804, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.7333333333333333, |
|
"grad_norm": 0.8589048385620117, |
|
"learning_rate": 7.166445032995013e-06, |
|
"loss": 0.9972, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.736231884057971, |
|
"grad_norm": 0.792225182056427, |
|
"learning_rate": 7.145603043863045e-06, |
|
"loss": 1.0047, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.7391304347826086, |
|
"grad_norm": 0.7787736654281616, |
|
"learning_rate": 7.124715258417111e-06, |
|
"loss": 0.974, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.7420289855072464, |
|
"grad_norm": 0.7716973423957825, |
|
"learning_rate": 7.103782122491577e-06, |
|
"loss": 0.9476, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.744927536231884, |
|
"grad_norm": 0.8235695958137512, |
|
"learning_rate": 7.082804082888787e-06, |
|
"loss": 1.0303, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.7478260869565218, |
|
"grad_norm": 0.8061054944992065, |
|
"learning_rate": 7.061781587369518e-06, |
|
"loss": 1.0254, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.7507246376811594, |
|
"grad_norm": 0.8522235751152039, |
|
"learning_rate": 7.040715084643429e-06, |
|
"loss": 1.0196, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.7536231884057971, |
|
"grad_norm": 0.8005476593971252, |
|
"learning_rate": 7.019605024359475e-06, |
|
"loss": 1.052, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7565217391304347, |
|
"grad_norm": 0.9044481515884399, |
|
"learning_rate": 6.998451857096321e-06, |
|
"loss": 1.04, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.7565217391304347, |
|
"eval_loss": 0.9999631643295288, |
|
"eval_runtime": 46.2792, |
|
"eval_samples_per_second": 5.532, |
|
"eval_steps_per_second": 0.691, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.7594202898550725, |
|
"grad_norm": 0.6946824193000793, |
|
"learning_rate": 6.977256034352713e-06, |
|
"loss": 0.9869, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.7623188405797101, |
|
"grad_norm": 0.8048357963562012, |
|
"learning_rate": 6.956018008537852e-06, |
|
"loss": 0.9773, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.7652173913043478, |
|
"grad_norm": 0.7211609482765198, |
|
"learning_rate": 6.934738232961728e-06, |
|
"loss": 0.9727, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.7681159420289855, |
|
"grad_norm": 0.7225235104560852, |
|
"learning_rate": 6.913417161825449e-06, |
|
"loss": 1.0209, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7710144927536232, |
|
"grad_norm": 0.6443622708320618, |
|
"learning_rate": 6.892055250211552e-06, |
|
"loss": 1.0398, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.7739130434782608, |
|
"grad_norm": 0.8570783138275146, |
|
"learning_rate": 6.8706529540742775e-06, |
|
"loss": 0.9883, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.7768115942028986, |
|
"grad_norm": 0.9808831810951233, |
|
"learning_rate": 6.849210730229846e-06, |
|
"loss": 1.0847, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.7797101449275362, |
|
"grad_norm": 0.8551820516586304, |
|
"learning_rate": 6.827729036346706e-06, |
|
"loss": 0.9621, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.782608695652174, |
|
"grad_norm": 0.8964309692382812, |
|
"learning_rate": 6.806208330935766e-06, |
|
"loss": 0.9886, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7855072463768116, |
|
"grad_norm": 0.8737574219703674, |
|
"learning_rate": 6.784649073340601e-06, |
|
"loss": 1.0019, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.7884057971014493, |
|
"grad_norm": 0.7480164170265198, |
|
"learning_rate": 6.763051723727663e-06, |
|
"loss": 0.9987, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7913043478260869, |
|
"grad_norm": 0.7155961990356445, |
|
"learning_rate": 6.741416743076443e-06, |
|
"loss": 1.0043, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.7942028985507247, |
|
"grad_norm": 0.8288201093673706, |
|
"learning_rate": 6.719744593169642e-06, |
|
"loss": 0.9703, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.7971014492753623, |
|
"grad_norm": 0.7403139472007751, |
|
"learning_rate": 6.698035736583307e-06, |
|
"loss": 0.9453, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.7977936863899231, |
|
"learning_rate": 6.67629063667697e-06, |
|
"loss": 1.0091, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.8028985507246377, |
|
"grad_norm": 0.8381959795951843, |
|
"learning_rate": 6.6545097575837405e-06, |
|
"loss": 1.0001, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.8057971014492754, |
|
"grad_norm": 0.7988629937171936, |
|
"learning_rate": 6.6326935642004165e-06, |
|
"loss": 1.0053, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.808695652173913, |
|
"grad_norm": 0.8848451375961304, |
|
"learning_rate": 6.610842522177549e-06, |
|
"loss": 1.021, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.8115942028985508, |
|
"grad_norm": 0.8423268795013428, |
|
"learning_rate": 6.588957097909509e-06, |
|
"loss": 1.0245, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8144927536231884, |
|
"grad_norm": 0.6828733682632446, |
|
"learning_rate": 6.567037758524529e-06, |
|
"loss": 0.9966, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.8173913043478261, |
|
"grad_norm": 0.8118813633918762, |
|
"learning_rate": 6.545084971874738e-06, |
|
"loss": 0.9777, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.8202898550724638, |
|
"grad_norm": 0.8288912773132324, |
|
"learning_rate": 6.5230992065261685e-06, |
|
"loss": 1.0158, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.8231884057971014, |
|
"grad_norm": 0.7110708951950073, |
|
"learning_rate": 6.501080931748764e-06, |
|
"loss": 0.9331, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.8260869565217391, |
|
"grad_norm": 0.767749011516571, |
|
"learning_rate": 6.4790306175063535e-06, |
|
"loss": 0.8917, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.8289855072463768, |
|
"grad_norm": 0.8519418835639954, |
|
"learning_rate": 6.456948734446624e-06, |
|
"loss": 1.0296, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.8318840579710145, |
|
"grad_norm": 0.7988749742507935, |
|
"learning_rate": 6.43483575389108e-06, |
|
"loss": 0.9296, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.8347826086956521, |
|
"grad_norm": 0.8312949538230896, |
|
"learning_rate": 6.412692147824976e-06, |
|
"loss": 1.0632, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.8376811594202899, |
|
"grad_norm": 0.9024953246116638, |
|
"learning_rate": 6.390518388887246e-06, |
|
"loss": 1.0013, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.8405797101449275, |
|
"grad_norm": 0.6774289011955261, |
|
"learning_rate": 6.368314950360416e-06, |
|
"loss": 0.954, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8434782608695652, |
|
"grad_norm": 0.739329993724823, |
|
"learning_rate": 6.3460823061604984e-06, |
|
"loss": 0.9453, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.8463768115942029, |
|
"grad_norm": 0.7888621687889099, |
|
"learning_rate": 6.323820930826879e-06, |
|
"loss": 0.9672, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.8492753623188406, |
|
"grad_norm": 0.7777626514434814, |
|
"learning_rate": 6.301531299512195e-06, |
|
"loss": 1.0118, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.8521739130434782, |
|
"grad_norm": 0.8532302975654602, |
|
"learning_rate": 6.279213887972179e-06, |
|
"loss": 0.9837, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.855072463768116, |
|
"grad_norm": 0.8223821520805359, |
|
"learning_rate": 6.2568691725555144e-06, |
|
"loss": 0.9786, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.8579710144927536, |
|
"grad_norm": 0.7102084755897522, |
|
"learning_rate": 6.234497630193666e-06, |
|
"loss": 0.9634, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.8608695652173913, |
|
"grad_norm": 0.7488099932670593, |
|
"learning_rate": 6.2120997383907015e-06, |
|
"loss": 1.0271, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.863768115942029, |
|
"grad_norm": 0.755387008190155, |
|
"learning_rate": 6.189675975213094e-06, |
|
"loss": 1.0068, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.8666666666666667, |
|
"grad_norm": 0.7323296666145325, |
|
"learning_rate": 6.1672268192795285e-06, |
|
"loss": 1.0177, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 0.7505559325218201, |
|
"learning_rate": 6.144752749750671e-06, |
|
"loss": 1.0031, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8724637681159421, |
|
"grad_norm": 0.8251679539680481, |
|
"learning_rate": 6.122254246318957e-06, |
|
"loss": 1.0281, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.8753623188405797, |
|
"grad_norm": 0.7030305862426758, |
|
"learning_rate": 6.099731789198344e-06, |
|
"loss": 0.977, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.8782608695652174, |
|
"grad_norm": 0.872175931930542, |
|
"learning_rate": 6.077185859114059e-06, |
|
"loss": 1.0279, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.881159420289855, |
|
"grad_norm": 0.6906105279922485, |
|
"learning_rate": 6.05461693729235e-06, |
|
"loss": 0.9747, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.8840579710144928, |
|
"grad_norm": 0.8041731119155884, |
|
"learning_rate": 6.0320255054501985e-06, |
|
"loss": 0.9706, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.8869565217391304, |
|
"grad_norm": 0.9219099283218384, |
|
"learning_rate": 6.009412045785051e-06, |
|
"loss": 1.0192, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.8898550724637682, |
|
"grad_norm": 0.5931650996208191, |
|
"learning_rate": 5.986777040964521e-06, |
|
"loss": 1.0064, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.8927536231884058, |
|
"grad_norm": 0.9496859908103943, |
|
"learning_rate": 5.964120974116085e-06, |
|
"loss": 1.0138, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.8956521739130435, |
|
"grad_norm": 0.719667375087738, |
|
"learning_rate": 5.941444328816775e-06, |
|
"loss": 1.0213, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.8985507246376812, |
|
"grad_norm": 0.8299076557159424, |
|
"learning_rate": 5.918747589082853e-06, |
|
"loss": 0.9931, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9014492753623189, |
|
"grad_norm": 0.8233078718185425, |
|
"learning_rate": 5.896031239359485e-06, |
|
"loss": 0.9789, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.9043478260869565, |
|
"grad_norm": 0.6814295649528503, |
|
"learning_rate": 5.8732957645103946e-06, |
|
"loss": 1.0711, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.9072463768115943, |
|
"grad_norm": 0.786590039730072, |
|
"learning_rate": 5.85054164980752e-06, |
|
"loss": 1.0282, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.9101449275362319, |
|
"grad_norm": 0.7114934921264648, |
|
"learning_rate": 5.82776938092065e-06, |
|
"loss": 1.0125, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.9130434782608695, |
|
"grad_norm": 0.8856657147407532, |
|
"learning_rate": 5.804979443907065e-06, |
|
"loss": 1.0325, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.9159420289855073, |
|
"grad_norm": 0.9123273491859436, |
|
"learning_rate": 5.782172325201155e-06, |
|
"loss": 1.0696, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.9188405797101449, |
|
"grad_norm": 0.7296032905578613, |
|
"learning_rate": 5.7593485116040425e-06, |
|
"loss": 1.0004, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.9217391304347826, |
|
"grad_norm": 0.8410807847976685, |
|
"learning_rate": 5.736508490273189e-06, |
|
"loss": 0.9547, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.9246376811594202, |
|
"grad_norm": 1.0709190368652344, |
|
"learning_rate": 5.713652748711997e-06, |
|
"loss": 0.9583, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.927536231884058, |
|
"grad_norm": 0.6270896196365356, |
|
"learning_rate": 5.690781774759412e-06, |
|
"loss": 1.0024, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9304347826086956, |
|
"grad_norm": 0.7849041223526001, |
|
"learning_rate": 5.667896056579495e-06, |
|
"loss": 0.9477, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 0.7513189315795898, |
|
"learning_rate": 5.644996082651018e-06, |
|
"loss": 0.9937, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.936231884057971, |
|
"grad_norm": 0.8150386214256287, |
|
"learning_rate": 5.622082341757027e-06, |
|
"loss": 1.0589, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.9391304347826087, |
|
"grad_norm": 0.8518944978713989, |
|
"learning_rate": 5.5991553229744166e-06, |
|
"loss": 1.0393, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.9420289855072463, |
|
"grad_norm": 0.814802885055542, |
|
"learning_rate": 5.576215515663489e-06, |
|
"loss": 1.0186, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.9449275362318841, |
|
"grad_norm": 0.9456635117530823, |
|
"learning_rate": 5.553263409457504e-06, |
|
"loss": 0.9657, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.9478260869565217, |
|
"grad_norm": 0.7259712815284729, |
|
"learning_rate": 5.530299494252238e-06, |
|
"loss": 1.0066, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.9507246376811594, |
|
"grad_norm": 0.7462155818939209, |
|
"learning_rate": 5.507324260195516e-06, |
|
"loss": 0.9246, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.9536231884057971, |
|
"grad_norm": 0.9022188782691956, |
|
"learning_rate": 5.484338197676757e-06, |
|
"loss": 0.9624, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.9565217391304348, |
|
"grad_norm": 0.8874835968017578, |
|
"learning_rate": 5.46134179731651e-06, |
|
"loss": 0.9851, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9594202898550724, |
|
"grad_norm": 0.7534209489822388, |
|
"learning_rate": 5.4383355499559734e-06, |
|
"loss": 0.9761, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.9623188405797102, |
|
"grad_norm": 0.9121699929237366, |
|
"learning_rate": 5.41531994664652e-06, |
|
"loss": 0.9994, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.9652173913043478, |
|
"grad_norm": 0.774753212928772, |
|
"learning_rate": 5.392295478639226e-06, |
|
"loss": 1.0218, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.9681159420289855, |
|
"grad_norm": 0.7575943470001221, |
|
"learning_rate": 5.36926263737437e-06, |
|
"loss": 0.9855, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.9710144927536232, |
|
"grad_norm": 0.8202754259109497, |
|
"learning_rate": 5.346221914470959e-06, |
|
"loss": 1.0112, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9739130434782609, |
|
"grad_norm": 0.8952569961547852, |
|
"learning_rate": 5.323173801716222e-06, |
|
"loss": 0.9722, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.9768115942028985, |
|
"grad_norm": 0.7153046727180481, |
|
"learning_rate": 5.300118791055122e-06, |
|
"loss": 0.9847, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.9797101449275363, |
|
"grad_norm": 0.7900391221046448, |
|
"learning_rate": 5.27705737457985e-06, |
|
"loss": 1.0324, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.9826086956521739, |
|
"grad_norm": 0.8250629305839539, |
|
"learning_rate": 5.253990044519329e-06, |
|
"loss": 0.9764, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.9855072463768116, |
|
"grad_norm": 0.8809992671012878, |
|
"learning_rate": 5.230917293228699e-06, |
|
"loss": 1.0198, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9884057971014493, |
|
"grad_norm": 0.7209755778312683, |
|
"learning_rate": 5.207839613178814e-06, |
|
"loss": 1.0253, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.991304347826087, |
|
"grad_norm": 0.8488002419471741, |
|
"learning_rate": 5.184757496945726e-06, |
|
"loss": 0.9333, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.9942028985507246, |
|
"grad_norm": 0.8114776611328125, |
|
"learning_rate": 5.161671437200179e-06, |
|
"loss": 1.0026, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.9971014492753624, |
|
"grad_norm": 0.8550688028335571, |
|
"learning_rate": 5.138581926697083e-06, |
|
"loss": 1.0057, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.9187963008880615, |
|
"learning_rate": 5.115489458265006e-06, |
|
"loss": 1.0037, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.0028985507246377, |
|
"grad_norm": 0.8499656915664673, |
|
"learning_rate": 5.09239452479565e-06, |
|
"loss": 0.9793, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.0057971014492753, |
|
"grad_norm": 0.9663048982620239, |
|
"learning_rate": 5.0692976192333295e-06, |
|
"loss": 0.9337, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.008695652173913, |
|
"grad_norm": 0.8095614910125732, |
|
"learning_rate": 5.046199234564455e-06, |
|
"loss": 0.9461, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.008695652173913, |
|
"eval_loss": 0.9858289361000061, |
|
"eval_runtime": 46.4396, |
|
"eval_samples_per_second": 5.513, |
|
"eval_steps_per_second": 0.689, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.0115942028985507, |
|
"grad_norm": 0.839413046836853, |
|
"learning_rate": 5.0230998638070024e-06, |
|
"loss": 0.9702, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.0144927536231885, |
|
"grad_norm": 0.8220239877700806, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9403, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.017391304347826, |
|
"grad_norm": 0.8942255973815918, |
|
"learning_rate": 4.976900136192998e-06, |
|
"loss": 0.9763, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.0028985507246377, |
|
"grad_norm": 0.785389244556427, |
|
"learning_rate": 4.953800765435547e-06, |
|
"loss": 1.0033, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.0057971014492753, |
|
"grad_norm": 0.9310470223426819, |
|
"learning_rate": 4.930702380766671e-06, |
|
"loss": 0.9569, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.008695652173913, |
|
"grad_norm": 0.9420292377471924, |
|
"learning_rate": 4.907605475204352e-06, |
|
"loss": 1.0085, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.0115942028985507, |
|
"grad_norm": 0.8762017488479614, |
|
"learning_rate": 4.8845105417349955e-06, |
|
"loss": 1.0225, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.0144927536231885, |
|
"grad_norm": 0.8962522149085999, |
|
"learning_rate": 4.861418073302919e-06, |
|
"loss": 0.9543, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.017391304347826, |
|
"grad_norm": 0.8070088028907776, |
|
"learning_rate": 4.838328562799824e-06, |
|
"loss": 0.9334, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.0202898550724637, |
|
"grad_norm": 0.8407843708992004, |
|
"learning_rate": 4.815242503054277e-06, |
|
"loss": 0.9499, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.0231884057971015, |
|
"grad_norm": 0.8197099566459656, |
|
"learning_rate": 4.79216038682119e-06, |
|
"loss": 1.0039, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.0260869565217392, |
|
"grad_norm": 0.7919727563858032, |
|
"learning_rate": 4.7690827067713035e-06, |
|
"loss": 0.9731, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0289855072463767, |
|
"grad_norm": 0.7514965534210205, |
|
"learning_rate": 4.746009955480672e-06, |
|
"loss": 0.9124, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.0318840579710145, |
|
"grad_norm": 0.7958142757415771, |
|
"learning_rate": 4.7229426254201504e-06, |
|
"loss": 0.9836, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.0347826086956522, |
|
"grad_norm": 0.9223296642303467, |
|
"learning_rate": 4.69988120894488e-06, |
|
"loss": 1.0372, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.03768115942029, |
|
"grad_norm": 0.7448701858520508, |
|
"learning_rate": 4.676826198283779e-06, |
|
"loss": 0.9189, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.0405797101449274, |
|
"grad_norm": 0.731107771396637, |
|
"learning_rate": 4.653778085529043e-06, |
|
"loss": 0.9632, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.0434782608695652, |
|
"grad_norm": 0.8460220694541931, |
|
"learning_rate": 4.630737362625631e-06, |
|
"loss": 0.9794, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.046376811594203, |
|
"grad_norm": 0.8166036605834961, |
|
"learning_rate": 4.6077045213607765e-06, |
|
"loss": 0.9976, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.0492753623188407, |
|
"grad_norm": 0.6962491869926453, |
|
"learning_rate": 4.584680053353481e-06, |
|
"loss": 0.9374, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.0521739130434782, |
|
"grad_norm": 0.8353239893913269, |
|
"learning_rate": 4.561664450044029e-06, |
|
"loss": 0.991, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.055072463768116, |
|
"grad_norm": 0.8190463781356812, |
|
"learning_rate": 4.53865820268349e-06, |
|
"loss": 0.9971, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0579710144927537, |
|
"grad_norm": 0.904393196105957, |
|
"learning_rate": 4.515661802323244e-06, |
|
"loss": 0.9548, |
|
"step": 371 |
|
}, |
|
    {
      "epoch": 1.0608695652173914,
      "grad_norm": 0.7582879066467285,
      "learning_rate": 4.492675739804486e-06,
      "loss": 0.934,
      "step": 372
    },
    {
      "epoch": 1.063768115942029,
      "grad_norm": 0.7787836194038391,
      "learning_rate": 4.4697005057477634e-06,
      "loss": 0.973,
      "step": 373
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 0.7273504137992859,
      "learning_rate": 4.446736590542497e-06,
      "loss": 1.0166,
      "step": 374
    },
    {
      "epoch": 1.0695652173913044,
      "grad_norm": 0.7512848377227783,
      "learning_rate": 4.4237844843365126e-06,
      "loss": 0.9951,
      "step": 375
    },
    {
      "epoch": 1.0724637681159421,
      "grad_norm": 0.8715952038764954,
      "learning_rate": 4.400844677025585e-06,
      "loss": 1.0384,
      "step": 376
    },
    {
      "epoch": 1.0753623188405796,
      "grad_norm": 1.1643601655960083,
      "learning_rate": 4.377917658242975e-06,
      "loss": 0.9725,
      "step": 377
    },
    {
      "epoch": 1.0782608695652174,
      "grad_norm": 1.0170421600341797,
      "learning_rate": 4.355003917348985e-06,
      "loss": 0.9877,
      "step": 378
    },
    {
      "epoch": 1.0811594202898551,
      "grad_norm": 0.8441584706306458,
      "learning_rate": 4.332103943420507e-06,
      "loss": 0.9795,
      "step": 379
    },
    {
      "epoch": 1.0840579710144929,
      "grad_norm": 0.9508838057518005,
      "learning_rate": 4.309218225240591e-06,
      "loss": 1.0274,
      "step": 380
    },
    {
      "epoch": 1.0869565217391304,
      "grad_norm": 0.9078054428100586,
      "learning_rate": 4.286347251288004e-06,
      "loss": 1.0117,
      "step": 381
    },
    {
      "epoch": 1.0898550724637681,
      "grad_norm": 1.056804895401001,
      "learning_rate": 4.263491509726812e-06,
      "loss": 0.9588,
      "step": 382
    },
    {
      "epoch": 1.0927536231884059,
      "grad_norm": 0.8957586288452148,
      "learning_rate": 4.240651488395958e-06,
      "loss": 0.9644,
      "step": 383
    },
    {
      "epoch": 1.0956521739130434,
      "grad_norm": 0.9251319169998169,
      "learning_rate": 4.217827674798845e-06,
      "loss": 0.9764,
      "step": 384
    },
    {
      "epoch": 1.098550724637681,
      "grad_norm": 0.8325505256652832,
      "learning_rate": 4.195020556092935e-06,
      "loss": 0.987,
      "step": 385
    },
    {
      "epoch": 1.1014492753623188,
      "grad_norm": 0.8144704699516296,
      "learning_rate": 4.17223061907935e-06,
      "loss": 0.9898,
      "step": 386
    },
    {
      "epoch": 1.1043478260869566,
      "grad_norm": 0.8545647859573364,
      "learning_rate": 4.14945835019248e-06,
      "loss": 0.9214,
      "step": 387
    },
    {
      "epoch": 1.107246376811594,
      "grad_norm": 0.8896581530570984,
      "learning_rate": 4.126704235489606e-06,
      "loss": 0.9432,
      "step": 388
    },
    {
      "epoch": 1.1101449275362318,
      "grad_norm": 0.8762820959091187,
      "learning_rate": 4.103968760640516e-06,
      "loss": 0.9754,
      "step": 389
    },
    {
      "epoch": 1.1130434782608696,
      "grad_norm": 0.7869084477424622,
      "learning_rate": 4.081252410917148e-06,
      "loss": 0.9655,
      "step": 390
    },
    {
      "epoch": 1.1159420289855073,
      "grad_norm": 0.9484694600105286,
      "learning_rate": 4.058555671183227e-06,
      "loss": 0.9461,
      "step": 391
    },
    {
      "epoch": 1.1188405797101448,
      "grad_norm": 0.8366033434867859,
      "learning_rate": 4.035879025883916e-06,
      "loss": 0.9745,
      "step": 392
    },
    {
      "epoch": 1.1217391304347826,
      "grad_norm": 0.8974631428718567,
      "learning_rate": 4.013222959035481e-06,
      "loss": 1.003,
      "step": 393
    },
    {
      "epoch": 1.1246376811594203,
      "grad_norm": 0.9970961809158325,
      "learning_rate": 3.99058795421495e-06,
      "loss": 0.9548,
      "step": 394
    },
    {
      "epoch": 1.127536231884058,
      "grad_norm": 0.8342113494873047,
      "learning_rate": 3.967974494549803e-06,
      "loss": 0.8879,
      "step": 395
    },
    {
      "epoch": 1.1304347826086956,
      "grad_norm": 0.7740679383277893,
      "learning_rate": 3.945383062707652e-06,
      "loss": 1.0181,
      "step": 396
    },
    {
      "epoch": 1.1333333333333333,
      "grad_norm": 0.8080225586891174,
      "learning_rate": 3.922814140885942e-06,
      "loss": 0.9629,
      "step": 397
    },
    {
      "epoch": 1.136231884057971,
      "grad_norm": 0.745694637298584,
      "learning_rate": 3.9002682108016585e-06,
      "loss": 0.9725,
      "step": 398
    },
    {
      "epoch": 1.1391304347826088,
      "grad_norm": 0.93767249584198,
      "learning_rate": 3.8777457536810446e-06,
      "loss": 0.9411,
      "step": 399
    },
    {
      "epoch": 1.1420289855072463,
      "grad_norm": 0.7331735491752625,
      "learning_rate": 3.855247250249331e-06,
      "loss": 0.9187,
      "step": 400
    },
    {
      "epoch": 1.144927536231884,
      "grad_norm": 1.1504460573196411,
      "learning_rate": 3.832773180720475e-06,
      "loss": 1.0038,
      "step": 401
    },
    {
      "epoch": 1.1478260869565218,
      "grad_norm": 0.7792490124702454,
      "learning_rate": 3.8103240247869077e-06,
      "loss": 0.9583,
      "step": 402
    },
    {
      "epoch": 1.1507246376811595,
      "grad_norm": 0.8607194423675537,
      "learning_rate": 3.7879002616093015e-06,
      "loss": 0.9608,
      "step": 403
    },
    {
      "epoch": 1.153623188405797,
      "grad_norm": 0.7470278143882751,
      "learning_rate": 3.765502369806334e-06,
      "loss": 1.0097,
      "step": 404
    },
    {
      "epoch": 1.1565217391304348,
      "grad_norm": 0.8549491763114929,
      "learning_rate": 3.743130827444487e-06,
      "loss": 0.9707,
      "step": 405
    },
    {
      "epoch": 1.1594202898550725,
      "grad_norm": 0.8472537398338318,
      "learning_rate": 3.720786112027822e-06,
      "loss": 0.9746,
      "step": 406
    },
    {
      "epoch": 1.1623188405797102,
      "grad_norm": 0.7988584637641907,
      "learning_rate": 3.6984687004878052e-06,
      "loss": 0.9883,
      "step": 407
    },
    {
      "epoch": 1.1652173913043478,
      "grad_norm": 0.823165774345398,
      "learning_rate": 3.6761790691731207e-06,
      "loss": 1.013,
      "step": 408
    },
    {
      "epoch": 1.1681159420289855,
      "grad_norm": 0.7537344694137573,
      "learning_rate": 3.6539176938395037e-06,
      "loss": 1.0081,
      "step": 409
    },
    {
      "epoch": 1.1710144927536232,
      "grad_norm": 0.7858260273933411,
      "learning_rate": 3.6316850496395863e-06,
      "loss": 0.9688,
      "step": 410
    },
    {
      "epoch": 1.1739130434782608,
      "grad_norm": 0.8715892434120178,
      "learning_rate": 3.609481611112755e-06,
      "loss": 1.0181,
      "step": 411
    },
    {
      "epoch": 1.1768115942028985,
      "grad_norm": 0.816693127155304,
      "learning_rate": 3.587307852175025e-06,
      "loss": 0.9505,
      "step": 412
    },
    {
      "epoch": 1.1797101449275362,
      "grad_norm": 0.9773905277252197,
      "learning_rate": 3.5651642461089207e-06,
      "loss": 0.9745,
      "step": 413
    },
    {
      "epoch": 1.182608695652174,
      "grad_norm": 0.7822540998458862,
      "learning_rate": 3.5430512655533774e-06,
      "loss": 0.9977,
      "step": 414
    },
    {
      "epoch": 1.1855072463768117,
      "grad_norm": 0.9197254180908203,
      "learning_rate": 3.5209693824936486e-06,
      "loss": 0.9955,
      "step": 415
    },
    {
      "epoch": 1.1884057971014492,
      "grad_norm": 0.8545462489128113,
      "learning_rate": 3.498919068251237e-06,
      "loss": 1.0544,
      "step": 416
    },
    {
      "epoch": 1.191304347826087,
      "grad_norm": 0.8395746350288391,
      "learning_rate": 3.476900793473832e-06,
      "loss": 0.9757,
      "step": 417
    },
    {
      "epoch": 1.1942028985507247,
      "grad_norm": 0.8740842938423157,
      "learning_rate": 3.4549150281252635e-06,
      "loss": 0.9468,
      "step": 418
    },
    {
      "epoch": 1.1971014492753622,
      "grad_norm": 0.7521042823791504,
      "learning_rate": 3.4329622414754728e-06,
      "loss": 0.9432,
      "step": 419
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.713711142539978,
      "learning_rate": 3.4110429020904924e-06,
      "loss": 0.9838,
      "step": 420
    },
    {
      "epoch": 1.2028985507246377,
      "grad_norm": 0.8481893539428711,
      "learning_rate": 3.3891574778224524e-06,
      "loss": 0.9489,
      "step": 421
    },
    {
      "epoch": 1.2057971014492754,
      "grad_norm": 0.863029420375824,
      "learning_rate": 3.3673064357995844e-06,
      "loss": 1.0462,
      "step": 422
    },
    {
      "epoch": 1.208695652173913,
      "grad_norm": 0.8649914860725403,
      "learning_rate": 3.3454902424162603e-06,
      "loss": 1.0085,
      "step": 423
    },
    {
      "epoch": 1.2115942028985507,
      "grad_norm": 0.8374588489532471,
      "learning_rate": 3.3237093633230323e-06,
      "loss": 1.0425,
      "step": 424
    },
    {
      "epoch": 1.2144927536231884,
      "grad_norm": 0.9396947026252747,
      "learning_rate": 3.301964263416693e-06,
      "loss": 1.0303,
      "step": 425
    },
    {
      "epoch": 1.2173913043478262,
      "grad_norm": 0.8101410865783691,
      "learning_rate": 3.2802554068303595e-06,
      "loss": 0.9747,
      "step": 426
    },
    {
      "epoch": 1.2202898550724637,
      "grad_norm": 0.9860018491744995,
      "learning_rate": 3.2585832569235576e-06,
      "loss": 0.9533,
      "step": 427
    },
    {
      "epoch": 1.2231884057971014,
      "grad_norm": 0.950383186340332,
      "learning_rate": 3.236948276272337e-06,
      "loss": 0.9562,
      "step": 428
    },
    {
      "epoch": 1.2260869565217392,
      "grad_norm": 0.8197913765907288,
      "learning_rate": 3.2153509266593984e-06,
      "loss": 0.9588,
      "step": 429
    },
    {
      "epoch": 1.228985507246377,
      "grad_norm": 0.8033617734909058,
      "learning_rate": 3.1937916690642356e-06,
      "loss": 1.0014,
      "step": 430
    },
    {
      "epoch": 1.2318840579710144,
      "grad_norm": 0.8451259732246399,
      "learning_rate": 3.1722709636532944e-06,
      "loss": 0.9428,
      "step": 431
    },
    {
      "epoch": 1.2347826086956522,
      "grad_norm": 0.7560276985168457,
      "learning_rate": 3.150789269770155e-06,
      "loss": 1.002,
      "step": 432
    },
    {
      "epoch": 1.23768115942029,
      "grad_norm": 0.918804943561554,
      "learning_rate": 3.1293470459257237e-06,
      "loss": 0.9653,
      "step": 433
    },
    {
      "epoch": 1.2405797101449276,
      "grad_norm": 0.8339065313339233,
      "learning_rate": 3.107944749788449e-06,
      "loss": 0.9407,
      "step": 434
    },
    {
      "epoch": 1.2434782608695651,
      "grad_norm": 0.7564199566841125,
      "learning_rate": 3.0865828381745515e-06,
      "loss": 1.012,
      "step": 435
    },
    {
      "epoch": 1.2434782608695651,
      "eval_loss": 0.9773865938186646,
      "eval_runtime": 46.2701,
      "eval_samples_per_second": 5.533,
      "eval_steps_per_second": 0.692,
      "step": 435
    },
    {
      "epoch": 1.2463768115942029,
      "grad_norm": 0.7768362164497375,
      "learning_rate": 3.0652617670382745e-06,
      "loss": 0.9642,
      "step": 436
    },
    {
      "epoch": 1.2492753623188406,
      "grad_norm": 0.8295703530311584,
      "learning_rate": 3.04398199146215e-06,
      "loss": 1.0002,
      "step": 437
    },
    {
      "epoch": 1.2521739130434781,
      "grad_norm": 0.8403414487838745,
      "learning_rate": 3.0227439656472878e-06,
      "loss": 0.9772,
      "step": 438
    },
    {
      "epoch": 1.2550724637681159,
      "grad_norm": 0.8178934454917908,
      "learning_rate": 3.0015481429036807e-06,
      "loss": 1.0126,
      "step": 439
    },
    {
      "epoch": 1.2579710144927536,
      "grad_norm": 0.8231812119483948,
      "learning_rate": 2.980394975640526e-06,
      "loss": 0.9118,
      "step": 440
    },
    {
      "epoch": 1.2608695652173914,
      "grad_norm": 0.8780835270881653,
      "learning_rate": 2.9592849153565727e-06,
      "loss": 0.9549,
      "step": 441
    },
    {
      "epoch": 1.263768115942029,
      "grad_norm": 1.000675916671753,
      "learning_rate": 2.9382184126304834e-06,
      "loss": 1.0483,
      "step": 442
    },
    {
      "epoch": 1.2666666666666666,
      "grad_norm": 0.8840986490249634,
      "learning_rate": 2.917195917111215e-06,
      "loss": 0.9931,
      "step": 443
    },
    {
      "epoch": 1.2695652173913043,
      "grad_norm": 0.8707259297370911,
      "learning_rate": 2.8962178775084267e-06,
      "loss": 0.8975,
      "step": 444
    },
    {
      "epoch": 1.272463768115942,
      "grad_norm": 0.7439221739768982,
      "learning_rate": 2.8752847415828923e-06,
      "loss": 0.9453,
      "step": 445
    },
    {
      "epoch": 1.2753623188405796,
      "grad_norm": 0.9899610280990601,
      "learning_rate": 2.8543969561369556e-06,
      "loss": 0.9426,
      "step": 446
    },
    {
      "epoch": 1.2782608695652173,
      "grad_norm": 0.9144057035446167,
      "learning_rate": 2.8335549670049866e-06,
      "loss": 0.9453,
      "step": 447
    },
    {
      "epoch": 1.281159420289855,
      "grad_norm": 0.9034680128097534,
      "learning_rate": 2.812759219043869e-06,
      "loss": 0.9258,
      "step": 448
    },
    {
      "epoch": 1.2840579710144928,
      "grad_norm": 0.9689735174179077,
      "learning_rate": 2.7920101561234954e-06,
      "loss": 0.993,
      "step": 449
    },
    {
      "epoch": 1.2869565217391306,
      "grad_norm": 0.6610868573188782,
      "learning_rate": 2.771308221117309e-06,
      "loss": 0.9506,
      "step": 450
    },
    {
      "epoch": 1.289855072463768,
      "grad_norm": 0.829849362373352,
      "learning_rate": 2.750653855892836e-06,
      "loss": 0.9609,
      "step": 451
    },
    {
      "epoch": 1.2927536231884058,
      "grad_norm": 0.7730438709259033,
      "learning_rate": 2.7300475013022666e-06,
      "loss": 0.9859,
      "step": 452
    },
    {
      "epoch": 1.2956521739130435,
      "grad_norm": 0.925363302230835,
      "learning_rate": 2.7094895971730326e-06,
      "loss": 1.0286,
      "step": 453
    },
    {
      "epoch": 1.298550724637681,
      "grad_norm": 0.886048436164856,
      "learning_rate": 2.6889805822984348e-06,
      "loss": 0.952,
      "step": 454
    },
    {
      "epoch": 1.3014492753623188,
      "grad_norm": 1.1092323064804077,
      "learning_rate": 2.668520894428259e-06,
      "loss": 1.0032,
      "step": 455
    },
    {
      "epoch": 1.3043478260869565,
      "grad_norm": 0.7811794877052307,
      "learning_rate": 2.648110970259454e-06,
      "loss": 0.9296,
      "step": 456
    },
    {
      "epoch": 1.3072463768115943,
      "grad_norm": 0.8023120164871216,
      "learning_rate": 2.6277512454267874e-06,
      "loss": 0.9304,
      "step": 457
    },
    {
      "epoch": 1.310144927536232,
      "grad_norm": 0.7649518251419067,
      "learning_rate": 2.607442154493568e-06,
      "loss": 0.9441,
      "step": 458
    },
    {
      "epoch": 1.3130434782608695,
      "grad_norm": 0.8725413680076599,
      "learning_rate": 2.5871841309423557e-06,
      "loss": 0.9637,
      "step": 459
    },
    {
      "epoch": 1.3159420289855073,
      "grad_norm": 0.7210862636566162,
      "learning_rate": 2.5669776071657194e-06,
      "loss": 0.9869,
      "step": 460
    },
    {
      "epoch": 1.318840579710145,
      "grad_norm": 0.8270391821861267,
      "learning_rate": 2.546823014456998e-06,
      "loss": 0.9164,
      "step": 461
    },
    {
      "epoch": 1.3217391304347825,
      "grad_norm": 0.829223096370697,
      "learning_rate": 2.526720783001107e-06,
      "loss": 1.0128,
      "step": 462
    },
    {
      "epoch": 1.3246376811594203,
      "grad_norm": 0.9681026935577393,
      "learning_rate": 2.506671341865341e-06,
      "loss": 0.9768,
      "step": 463
    },
    {
      "epoch": 1.327536231884058,
      "grad_norm": 0.840314507484436,
      "learning_rate": 2.486675118990233e-06,
      "loss": 0.9359,
      "step": 464
    },
    {
      "epoch": 1.3304347826086955,
      "grad_norm": 0.659677267074585,
      "learning_rate": 2.466732541180404e-06,
      "loss": 0.965,
      "step": 465
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.9055850505828857,
      "learning_rate": 2.4468440340954664e-06,
      "loss": 0.9557,
      "step": 466
    },
    {
      "epoch": 1.336231884057971,
      "grad_norm": 0.8318009972572327,
      "learning_rate": 2.4270100222409275e-06,
      "loss": 0.9111,
      "step": 467
    },
    {
      "epoch": 1.3391304347826087,
      "grad_norm": 0.9112004041671753,
      "learning_rate": 2.4072309289591394e-06,
      "loss": 0.9243,
      "step": 468
    },
    {
      "epoch": 1.3420289855072465,
      "grad_norm": 0.8032493591308594,
      "learning_rate": 2.387507176420256e-06,
      "loss": 0.9228,
      "step": 469
    },
    {
      "epoch": 1.344927536231884,
      "grad_norm": 0.662981390953064,
      "learning_rate": 2.3678391856132203e-06,
      "loss": 0.9778,
      "step": 470
    },
    {
      "epoch": 1.3478260869565217,
      "grad_norm": 0.8368533849716187,
      "learning_rate": 2.348227376336789e-06,
      "loss": 1.0145,
      "step": 471
    },
    {
      "epoch": 1.3507246376811595,
      "grad_norm": 0.9046915769577026,
      "learning_rate": 2.328672167190558e-06,
      "loss": 0.9393,
      "step": 472
    },
    {
      "epoch": 1.353623188405797,
      "grad_norm": 0.9030489921569824,
      "learning_rate": 2.3091739755660425e-06,
      "loss": 0.9636,
      "step": 473
    },
    {
      "epoch": 1.3565217391304347,
      "grad_norm": 0.8339246511459351,
      "learning_rate": 2.289733217637753e-06,
      "loss": 0.9395,
      "step": 474
    },
    {
      "epoch": 1.3594202898550725,
      "grad_norm": 0.7877910733222961,
      "learning_rate": 2.2703503083543288e-06,
      "loss": 0.9454,
      "step": 475
    },
    {
      "epoch": 1.3623188405797102,
      "grad_norm": 0.9808143377304077,
      "learning_rate": 2.2510256614296638e-06,
      "loss": 0.9968,
      "step": 476
    },
    {
      "epoch": 1.365217391304348,
      "grad_norm": 1.2518080472946167,
      "learning_rate": 2.2317596893340924e-06,
      "loss": 0.9732,
      "step": 477
    },
    {
      "epoch": 1.3681159420289855,
      "grad_norm": 0.8053367137908936,
      "learning_rate": 2.2125528032855727e-06,
      "loss": 0.9803,
      "step": 478
    },
    {
      "epoch": 1.3710144927536232,
      "grad_norm": 0.9491231441497803,
      "learning_rate": 2.1934054132409183e-06,
      "loss": 0.9332,
      "step": 479
    },
    {
      "epoch": 1.373913043478261,
      "grad_norm": 0.7503049373626709,
      "learning_rate": 2.174317927887041e-06,
      "loss": 0.9591,
      "step": 480
    },
    {
      "epoch": 1.3768115942028984,
      "grad_norm": 0.819608211517334,
      "learning_rate": 2.1552907546322356e-06,
      "loss": 0.9795,
      "step": 481
    },
    {
      "epoch": 1.3797101449275362,
      "grad_norm": 0.8053436279296875,
      "learning_rate": 2.136324299597474e-06,
      "loss": 1.0053,
      "step": 482
    },
    {
      "epoch": 1.382608695652174,
      "grad_norm": 0.7377948760986328,
      "learning_rate": 2.11741896760775e-06,
      "loss": 1.0277,
      "step": 483
    },
    {
      "epoch": 1.3855072463768117,
      "grad_norm": 0.865705668926239,
      "learning_rate": 2.098575162183422e-06,
      "loss": 0.9952,
      "step": 484
    },
    {
      "epoch": 1.3884057971014494,
      "grad_norm": 0.8623892664909363,
      "learning_rate": 2.0797932855316183e-06,
      "loss": 1.0304,
      "step": 485
    },
    {
      "epoch": 1.391304347826087,
      "grad_norm": 0.803113579750061,
      "learning_rate": 2.061073738537635e-06,
      "loss": 0.993,
      "step": 486
    },
    {
      "epoch": 1.3942028985507247,
      "grad_norm": 0.7748633623123169,
      "learning_rate": 2.0424169207563954e-06,
      "loss": 0.9103,
      "step": 487
    },
    {
      "epoch": 1.3971014492753624,
      "grad_norm": 0.9022510051727295,
      "learning_rate": 2.023823230403907e-06,
      "loss": 0.9125,
      "step": 488
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.8588757514953613,
      "learning_rate": 2.005293064348773e-06,
      "loss": 1.0259,
      "step": 489
    },
    {
      "epoch": 1.4028985507246376,
      "grad_norm": 0.8985849618911743,
      "learning_rate": 1.9868268181037186e-06,
      "loss": 0.9839,
      "step": 490
    },
    {
      "epoch": 1.4057971014492754,
      "grad_norm": 0.8959106802940369,
      "learning_rate": 1.968424885817143e-06,
      "loss": 0.9752,
      "step": 491
    },
    {
      "epoch": 1.4086956521739131,
      "grad_norm": 0.9213183522224426,
      "learning_rate": 1.9500876602647167e-06,
      "loss": 0.9053,
      "step": 492
    },
    {
      "epoch": 1.4115942028985506,
      "grad_norm": 0.8219558596611023,
      "learning_rate": 1.931815532840987e-06,
      "loss": 0.9522,
      "step": 493
    },
    {
      "epoch": 1.4144927536231884,
      "grad_norm": 0.8716898560523987,
      "learning_rate": 1.913608893551036e-06,
      "loss": 0.9858,
      "step": 494
    },
    {
      "epoch": 1.4173913043478261,
      "grad_norm": 0.9072102904319763,
      "learning_rate": 1.8954681310021434e-06,
      "loss": 0.9382,
      "step": 495
    },
    {
      "epoch": 1.4202898550724639,
      "grad_norm": 0.8592570424079895,
      "learning_rate": 1.8773936323955055e-06,
      "loss": 1.0004,
      "step": 496
    },
    {
      "epoch": 1.4231884057971014,
      "grad_norm": 0.8882102966308594,
      "learning_rate": 1.8593857835179557e-06,
      "loss": 0.9862,
      "step": 497
    },
    {
      "epoch": 1.4260869565217391,
      "grad_norm": 0.851216197013855,
      "learning_rate": 1.8414449687337467e-06,
      "loss": 1.0109,
      "step": 498
    },
    {
      "epoch": 1.4289855072463769,
      "grad_norm": 0.7851223349571228,
      "learning_rate": 1.8235715709763285e-06,
      "loss": 0.9404,
      "step": 499
    },
    {
      "epoch": 1.4318840579710144,
      "grad_norm": 0.7435230612754822,
      "learning_rate": 1.8057659717401948e-06,
      "loss": 1.0388,
      "step": 500
    },
    {
      "epoch": 1.434782608695652,
      "grad_norm": 0.795467734336853,
      "learning_rate": 1.7880285510727197e-06,
      "loss": 1.0,
      "step": 501
    },
    {
      "epoch": 1.4376811594202898,
      "grad_norm": 0.8847975730895996,
      "learning_rate": 1.7703596875660645e-06,
      "loss": 1.0182,
      "step": 502
    },
    {
      "epoch": 1.4405797101449276,
      "grad_norm": 1.0256052017211914,
      "learning_rate": 1.7527597583490825e-06,
      "loss": 0.9573,
      "step": 503
    },
    {
      "epoch": 1.4434782608695653,
      "grad_norm": 0.7743212580680847,
      "learning_rate": 1.7352291390792798e-06,
      "loss": 0.9831,
      "step": 504
    },
    {
      "epoch": 1.4463768115942028,
      "grad_norm": 0.9608955979347229,
      "learning_rate": 1.7177682039347875e-06,
      "loss": 0.9683,
      "step": 505
    },
    {
      "epoch": 1.4492753623188406,
      "grad_norm": 0.899786651134491,
      "learning_rate": 1.7003773256063882e-06,
      "loss": 1.0373,
      "step": 506
    },
    {
      "epoch": 1.4521739130434783,
      "grad_norm": 0.933459997177124,
      "learning_rate": 1.6830568752895455e-06,
      "loss": 1.0065,
      "step": 507
    },
    {
      "epoch": 1.4550724637681158,
      "grad_norm": 0.7607547640800476,
      "learning_rate": 1.6658072226764949e-06,
      "loss": 0.9652,
      "step": 508
    },
    {
      "epoch": 1.4579710144927536,
      "grad_norm": 0.7857306599617004,
      "learning_rate": 1.6486287359483422e-06,
      "loss": 0.9943,
      "step": 509
    },
    {
      "epoch": 1.4608695652173913,
      "grad_norm": 0.9342886209487915,
      "learning_rate": 1.6315217817672142e-06,
      "loss": 1.028,
      "step": 510
    },
    {
      "epoch": 1.463768115942029,
      "grad_norm": 1.0333482027053833,
      "learning_rate": 1.614486725268426e-06,
      "loss": 0.9296,
      "step": 511
    },
    {
      "epoch": 1.4666666666666668,
      "grad_norm": 0.7788994908332825,
      "learning_rate": 1.5975239300526924e-06,
      "loss": 0.9871,
      "step": 512
    },
    {
      "epoch": 1.4695652173913043,
      "grad_norm": 0.764268159866333,
      "learning_rate": 1.5806337581783593e-06,
      "loss": 0.9603,
      "step": 513
    },
    {
      "epoch": 1.472463768115942,
      "grad_norm": 0.9053126573562622,
      "learning_rate": 1.5638165701536866e-06,
      "loss": 1.003,
      "step": 514
    },
    {
      "epoch": 1.4753623188405798,
      "grad_norm": 0.890696108341217,
      "learning_rate": 1.5470727249291423e-06,
      "loss": 0.9894,
      "step": 515
    },
    {
      "epoch": 1.4782608695652173,
      "grad_norm": 0.755885124206543,
      "learning_rate": 1.5304025798897521e-06,
      "loss": 0.9355,
      "step": 516
    },
    {
      "epoch": 1.481159420289855,
      "grad_norm": 0.8839924931526184,
      "learning_rate": 1.5138064908474603e-06,
      "loss": 0.9879,
      "step": 517
    },
    {
      "epoch": 1.4840579710144928,
      "grad_norm": 0.919336199760437,
      "learning_rate": 1.4972848120335453e-06,
      "loss": 1.042,
      "step": 518
    },
    {
      "epoch": 1.4869565217391305,
      "grad_norm": 1.0073022842407227,
      "learning_rate": 1.4808378960910502e-06,
      "loss": 1.0537,
      "step": 519
    },
    {
      "epoch": 1.4898550724637682,
      "grad_norm": 0.9994317293167114,
      "learning_rate": 1.4644660940672628e-06,
      "loss": 1.042,
      "step": 520
    },
    {
      "epoch": 1.4927536231884058,
      "grad_norm": 0.8237168788909912,
      "learning_rate": 1.448169755406218e-06,
      "loss": 0.9449,
      "step": 521
    },
    {
      "epoch": 1.4956521739130435,
      "grad_norm": 0.8838447332382202,
      "learning_rate": 1.4319492279412388e-06,
      "loss": 0.9789,
      "step": 522
    },
    {
      "epoch": 1.4956521739130435,
      "eval_loss": 0.9736447334289551,
      "eval_runtime": 46.3906,
      "eval_samples_per_second": 5.518,
      "eval_steps_per_second": 0.69,
      "step": 522
    },
    {
      "epoch": 1.4985507246376812,
      "grad_norm": 0.7661985754966736,
      "learning_rate": 1.4158048578875211e-06,
      "loss": 0.9991,
      "step": 523
    },
    {
      "epoch": 1.5014492753623188,
      "grad_norm": 0.8049348592758179,
      "learning_rate": 1.399736989834728e-06,
      "loss": 0.9455,
      "step": 524
    },
    {
      "epoch": 1.5043478260869565,
      "grad_norm": 0.8575480580329895,
      "learning_rate": 1.383745966739652e-06,
      "loss": 0.9764,
      "step": 525
    },
    {
      "epoch": 1.5072463768115942,
      "grad_norm": 0.7336897253990173,
      "learning_rate": 1.3678321299188802e-06,
      "loss": 0.9613,
      "step": 526
    },
    {
      "epoch": 1.5101449275362318,
      "grad_norm": 0.8718299865722656,
      "learning_rate": 1.351995819041521e-06,
      "loss": 0.9923,
      "step": 527
    },
    {
      "epoch": 1.5130434782608697,
      "grad_norm": 0.9166209101676941,
      "learning_rate": 1.336237372121944e-06,
      "loss": 1.069,
      "step": 528
    },
    {
      "epoch": 1.5159420289855072,
      "grad_norm": 0.9382581114768982,
      "learning_rate": 1.320557125512575e-06,
      "loss": 0.9671,
      "step": 529
    },
    {
      "epoch": 1.518840579710145,
      "grad_norm": 0.8037452101707458,
      "learning_rate": 1.3049554138967052e-06,
      "loss": 0.9395,
      "step": 530
    },
    {
      "epoch": 1.5217391304347827,
      "grad_norm": 0.6627395749092102,
      "learning_rate": 1.289432570281361e-06,
      "loss": 0.9025,
      "step": 531
    },
    {
      "epoch": 1.5246376811594202,
      "grad_norm": 0.7865214943885803,
      "learning_rate": 1.2739889259901866e-06,
      "loss": 0.9021,
      "step": 532
    },
    {
      "epoch": 1.527536231884058,
      "grad_norm": 0.8900570273399353,
      "learning_rate": 1.258624810656376e-06,
      "loss": 0.946,
      "step": 533
    },
    {
      "epoch": 1.5304347826086957,
      "grad_norm": 0.8942597508430481,
      "learning_rate": 1.2433405522156334e-06,
      "loss": 1.0141,
      "step": 534
    },
    {
      "epoch": 1.5333333333333332,
      "grad_norm": 0.8667037487030029,
      "learning_rate": 1.2281364768991804e-06,
      "loss": 1.0092,
      "step": 535
    },
    {
      "epoch": 1.5362318840579712,
      "grad_norm": 0.7895119190216064,
      "learning_rate": 1.213012909226786e-06,
      "loss": 0.9251,
      "step": 536
    },
    {
      "epoch": 1.5391304347826087,
      "grad_norm": 0.8225801587104797,
      "learning_rate": 1.1979701719998454e-06,
      "loss": 0.9449,
      "step": 537
    },
    {
      "epoch": 1.5420289855072464,
      "grad_norm": 0.8342156410217285,
      "learning_rate": 1.1830085862944851e-06,
      "loss": 0.9676,
      "step": 538
    },
    {
      "epoch": 1.5449275362318842,
      "grad_norm": 0.7941964864730835,
      "learning_rate": 1.1681284714547147e-06,
      "loss": 0.9907,
      "step": 539
    },
    {
      "epoch": 1.5478260869565217,
      "grad_norm": 0.9655299782752991,
      "learning_rate": 1.1533301450856054e-06,
      "loss": 1.0126,
      "step": 540
    },
    {
      "epoch": 1.5507246376811594,
      "grad_norm": 0.8632703423500061,
      "learning_rate": 1.1386139230465176e-06,
      "loss": 0.9452,
      "step": 541
    },
    {
      "epoch": 1.5536231884057972,
      "grad_norm": 0.8908371329307556,
      "learning_rate": 1.1239801194443507e-06,
      "loss": 0.9821,
      "step": 542
    },
    {
      "epoch": 1.5565217391304347,
      "grad_norm": 0.873409628868103,
      "learning_rate": 1.1094290466268493e-06,
      "loss": 0.969,
      "step": 543
    },
    {
      "epoch": 1.5594202898550724,
      "grad_norm": 0.8888543844223022,
      "learning_rate": 1.0949610151759233e-06,
      "loss": 0.9593,
      "step": 544
    },
    {
      "epoch": 1.5623188405797102,
      "grad_norm": 0.7646573781967163,
      "learning_rate": 1.0805763339010329e-06,
      "loss": 0.9287,
      "step": 545
    },
    {
      "epoch": 1.5652173913043477,
      "grad_norm": 0.835421085357666,
      "learning_rate": 1.066275309832584e-06,
      "loss": 0.9732,
      "step": 546
    },
    {
      "epoch": 1.5681159420289856,
      "grad_norm": 0.9228112697601318,
      "learning_rate": 1.0520582482153874e-06,
      "loss": 0.9675,
      "step": 547
    },
    {
      "epoch": 1.5710144927536231,
      "grad_norm": 0.7750451564788818,
      "learning_rate": 1.037925452502131e-06,
      "loss": 0.9938,
      "step": 548
    },
    {
      "epoch": 1.5739130434782609,
      "grad_norm": 0.8366883397102356,
      "learning_rate": 1.0238772243469153e-06,
      "loss": 0.962,
      "step": 549
    },
    {
      "epoch": 1.5768115942028986,
      "grad_norm": 0.933855414390564,
      "learning_rate": 1.0099138635988026e-06,
      "loss": 0.9732,
      "step": 550
    },
    {
      "epoch": 1.5797101449275361,
      "grad_norm": 0.9288073778152466,
      "learning_rate": 9.960356682954293e-07,
      "loss": 0.9958,
      "step": 551
    },
    {
      "epoch": 1.5826086956521739,
      "grad_norm": 0.7197360992431641,
      "learning_rate": 9.822429346566314e-07,
      "loss": 0.9266,
      "step": 552
    },
    {
      "epoch": 1.5855072463768116,
      "grad_norm": 0.8900216817855835,
      "learning_rate": 9.685359570781344e-07,
      "loss": 1.0006,
      "step": 553
    },
    {
      "epoch": 1.5884057971014491,
      "grad_norm": 0.7970424294471741,
      "learning_rate": 9.549150281252633e-07,
      "loss": 0.968,
      "step": 554
    },
    {
      "epoch": 1.591304347826087,
      "grad_norm": 0.9357386231422424,
      "learning_rate": 9.41380438526694e-07,
      "loss": 1.0361,
      "step": 555
    },
    {
      "epoch": 1.5942028985507246,
      "grad_norm": 0.740880012512207,
      "learning_rate": 9.279324771682586e-07,
      "loss": 0.9492,
      "step": 556
    },
    {
      "epoch": 1.5971014492753624,
      "grad_norm": 0.9611430764198303,
      "learning_rate": 9.145714310867676e-07,
      "loss": 0.9559,
      "step": 557
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.9163907170295715,
      "learning_rate": 9.01297585463895e-07,
      "loss": 1.0112,
      "step": 558
    },
    {
      "epoch": 1.6028985507246376,
      "grad_norm": 0.9926815032958984,
      "learning_rate": 8.881112236200795e-07,
      "loss": 1.0813,
      "step": 559
    },
    {
      "epoch": 1.6057971014492753,
      "grad_norm": 0.8820666074752808,
      "learning_rate": 8.750126270084891e-07,
      "loss": 0.9911,
      "step": 560
    },
    {
      "epoch": 1.608695652173913,
      "grad_norm": 0.817694365978241,
      "learning_rate": 8.620020752090008e-07,
      "loss": 0.9162,
      "step": 561
    },
    {
      "epoch": 1.6115942028985506,
      "grad_norm": 0.9005435109138489,
      "learning_rate": 8.490798459222477e-07,
      "loss": 1.015,
      "step": 562
    },
    {
      "epoch": 1.6144927536231886,
      "grad_norm": 0.8248128890991211,
      "learning_rate": 8.362462149636757e-07,
      "loss": 0.9976,
      "step": 563
    },
    {
      "epoch": 1.617391304347826,
      "grad_norm": 0.8286884427070618,
      "learning_rate": 8.235014562576732e-07,
      "loss": 0.992,
      "step": 564
    },
    {
      "epoch": 1.6202898550724638,
      "grad_norm": 0.8723387718200684,
      "learning_rate": 8.108458418317089e-07,
      "loss": 0.9381,
      "step": 565
    },
    {
      "epoch": 1.6231884057971016,
      "grad_norm": 0.9833754897117615,
      "learning_rate": 7.98279641810537e-07,
      "loss": 0.9435,
      "step": 566
    },
    {
      "epoch": 1.626086956521739,
      "grad_norm": 0.9212725162506104,
      "learning_rate": 7.858031244104247e-07,
      "loss": 0.9611,
      "step": 567
    },
    {
      "epoch": 1.6289855072463768,
      "grad_norm": 0.852350115776062,
      "learning_rate": 7.734165559334327e-07,
      "loss": 0.9064,
      "step": 568
    },
    {
      "epoch": 1.6318840579710145,
      "grad_norm": 0.8955137729644775,
      "learning_rate": 7.611202007617241e-07,
      "loss": 0.9547,
      "step": 569
    },
    {
      "epoch": 1.634782608695652,
      "grad_norm": 0.8889902830123901,
      "learning_rate": 7.489143213519301e-07,
      "loss": 0.9533,
      "step": 570
    },
    {
      "epoch": 1.6376811594202898,
      "grad_norm": 0.9037710428237915,
      "learning_rate": 7.367991782295392e-07,
      "loss": 0.9213,
      "step": 571
    },
    {
      "epoch": 1.6405797101449275,
      "grad_norm": 0.8594886064529419,
      "learning_rate": 7.24775029983345e-07,
      "loss": 0.9765,
      "step": 572
    },
    {
      "epoch": 1.643478260869565,
      "grad_norm": 0.7082343101501465,
      "learning_rate": 7.128421332599189e-07,
      "loss": 0.9871,
      "step": 573
    },
    {
      "epoch": 1.646376811594203,
      "grad_norm": 0.878217339515686,
      "learning_rate": 7.010007427581378e-07,
      "loss": 0.9366,
      "step": 574
    },
    {
      "epoch": 1.6492753623188405,
      "grad_norm": 0.9462459087371826,
      "learning_rate": 6.892511112237472e-07,
      "loss": 0.9505,
      "step": 575
    },
    {
      "epoch": 1.6521739130434783,
      "grad_norm": 0.7900387644767761,
      "learning_rate": 6.775934894439606e-07,
      "loss": 0.9554,
      "step": 576
    },
    {
      "epoch": 1.655072463768116,
      "grad_norm": 0.8542242050170898,
      "learning_rate": 6.66028126242117e-07,
      "loss": 0.9331,
      "step": 577
    },
    {
      "epoch": 1.6579710144927535,
      "grad_norm": 0.9795560836791992,
      "learning_rate": 6.545552684723583e-07,
      "loss": 0.9203,
      "step": 578
    },
    {
      "epoch": 1.6608695652173913,
      "grad_norm": 0.7833444476127625,
      "learning_rate": 6.431751610143716e-07,
      "loss": 0.9977,
      "step": 579
    },
    {
      "epoch": 1.663768115942029,
      "grad_norm": 0.8404137492179871,
      "learning_rate": 6.318880467681527e-07,
      "loss": 0.9981,
      "step": 580
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.9158584475517273,
      "learning_rate": 6.206941666488287e-07,
      "loss": 0.9584,
      "step": 581
    },
    {
      "epoch": 1.6695652173913045,
      "grad_norm": 0.7720228433609009,
      "learning_rate": 6.095937595815104e-07,
      "loss": 0.9284,
      "step": 582
    },
    {
      "epoch": 1.672463768115942,
      "grad_norm": 0.9077423214912415,
      "learning_rate": 5.985870624961993e-07,
      "loss": 1.0104,
      "step": 583
    },
    {
      "epoch": 1.6753623188405797,
      "grad_norm": 0.7142834663391113,
      "learning_rate": 5.876743103227217e-07,
      "loss": 0.9617,
      "step": 584
    },
    {
      "epoch": 1.6782608695652175,
      "grad_norm": 0.9244917035102844,
      "learning_rate": 5.768557359857241e-07,
      "loss": 0.9534,
      "step": 585
    },
    {
      "epoch": 1.681159420289855,
      "grad_norm": 0.8961134552955627,
      "learning_rate": 5.661315703996905e-07,
      "loss": 0.9462,
      "step": 586
    },
    {
      "epoch": 1.6840579710144927,
      "grad_norm": 0.9584707021713257,
      "learning_rate": 5.555020424640267e-07,
      "loss": 0.9483,
      "step": 587
    },
    {
      "epoch": 1.6869565217391305,
      "grad_norm": 0.8094743490219116,
      "learning_rate": 5.449673790581611e-07,
      "loss": 0.9564,
      "step": 588
    },
    {
      "epoch": 1.689855072463768,
      "grad_norm": 0.886703610420227,
      "learning_rate": 5.345278050367142e-07,
      "loss": 1.0153,
      "step": 589
    },
    {
      "epoch": 1.692753623188406,
      "grad_norm": 0.9125918745994568,
      "learning_rate": 5.241835432246888e-07,
      "loss": 0.9749,
      "step": 590
    },
    {
      "epoch": 1.6956521739130435,
      "grad_norm": 0.8972467184066772,
      "learning_rate": 5.139348144127237e-07,
      "loss": 1.0084,
      "step": 591
    },
    {
      "epoch": 1.6985507246376812,
      "grad_norm": 0.7566870450973511,
      "learning_rate": 5.037818373523723e-07,
      "loss": 0.9932,
      "step": 592
    },
    {
      "epoch": 1.701449275362319,
      "grad_norm": 0.8601511716842651,
      "learning_rate": 4.937248287514407e-07,
      "loss": 0.9747,
      "step": 593
    },
    {
      "epoch": 1.7043478260869565,
      "grad_norm": 0.8272446393966675,
      "learning_rate": 4.837640032693558e-07,
      "loss": 1.0065,
      "step": 594
    },
    {
      "epoch": 1.7072463768115942,
      "grad_norm": 0.7029653191566467,
      "learning_rate": 4.738995735125895e-07,
      "loss": 0.9384,
      "step": 595
    },
    {
      "epoch": 1.710144927536232,
      "grad_norm": 0.913718044757843,
      "learning_rate": 4.641317500301173e-07,
      "loss": 0.9563,
      "step": 596
    },
    {
      "epoch": 1.7130434782608694,
      "grad_norm": 0.9736040830612183,
      "learning_rate": 4.5446074130892525e-07,
      "loss": 0.9455,
      "step": 597
    },
    {
      "epoch": 1.7159420289855074,
      "grad_norm": 0.8182763457298279,
      "learning_rate": 4.448867537695578e-07,
      "loss": 0.944,
      "step": 598
    },
    {
      "epoch": 1.718840579710145,
      "grad_norm": 0.8536428213119507,
      "learning_rate": 4.3540999176171717e-07,
      "loss": 0.9029,
      "step": 599
    },
    {
      "epoch": 1.7217391304347827,
      "grad_norm": 0.8713299036026001,
      "learning_rate": 4.2603065755989493e-07,
      "loss": 0.9448,
      "step": 600
    },
    {
      "epoch": 1.7246376811594204,
      "grad_norm": 0.9857087135314941,
      "learning_rate": 4.167489513590611e-07,
      "loss": 1.0004,
      "step": 601
    },
    {
      "epoch": 1.727536231884058,
      "grad_norm": 0.9195379018783569,
      "learning_rate": 4.0756507127038494e-07,
      "loss": 1.0247,
      "step": 602
    },
    {
      "epoch": 1.7304347826086957,
      "grad_norm": 0.8422645926475525,
      "learning_rate": 3.984792133170129e-07,
      "loss": 1.0087,
      "step": 603
    },
    {
      "epoch": 1.7333333333333334,
      "grad_norm": 0.8902682662010193,
      "learning_rate": 3.894915714298775e-07,
      "loss": 0.8793,
      "step": 604
    },
    {
      "epoch": 1.736231884057971,
      "grad_norm": 0.8859000205993652,
      "learning_rate": 3.8060233744356634e-07,
      "loss": 1.0018,
      "step": 605
    },
    {
      "epoch": 1.7391304347826086,
      "grad_norm": 0.8340051174163818,
      "learning_rate": 3.71811701092219e-07,
      "loss": 0.9534,
      "step": 606
    },
    {
      "epoch": 1.7420289855072464,
      "grad_norm": 0.8677003979682922,
      "learning_rate": 3.6311985000548223e-07,
      "loss": 0.9525,
      "step": 607
    },
    {
      "epoch": 1.744927536231884,
      "grad_norm": 0.932613730430603,
      "learning_rate": 3.5452696970450674e-07,
      "loss": 0.9257,
      "step": 608
    },
    {
      "epoch": 1.7478260869565219,
      "grad_norm": 0.9657606482505798,
      "learning_rate": 3.4603324359798016e-07,
      "loss": 1.0033,
      "step": 609
    },
    {
      "epoch": 1.7478260869565219,
      "eval_loss": 0.9723503589630127,
      "eval_runtime": 46.2237,
      "eval_samples_per_second": 5.538,
      "eval_steps_per_second": 0.692,
      "step": 609
    },
    {
      "epoch": 1.7507246376811594,
      "grad_norm": 0.860346257686615,
      "learning_rate": 3.3763885297822153e-07,
      "loss": 0.986,
      "step": 610
    },
    {
      "epoch": 1.7536231884057971,
      "grad_norm": 0.8614711165428162,
      "learning_rate": 3.293439770173046e-07,
      "loss": 0.9976,
      "step": 611
    },
    {
      "epoch": 1.7565217391304349,
      "grad_norm": 0.7311533689498901,
      "learning_rate": 3.2114879276323783e-07,
      "loss": 0.908,
      "step": 612
    },
    {
      "epoch": 1.7594202898550724,
      "grad_norm": 0.9412534236907959,
      "learning_rate": 3.130534751361808e-07,
      "loss": 0.977,
      "step": 613
    },
    {
      "epoch": 1.76231884057971,
      "grad_norm": 0.911098062992096,
      "learning_rate": 3.0505819692471797e-07,
      "loss": 0.9387,
      "step": 614
    },
    {
      "epoch": 1.7652173913043478,
      "grad_norm": 0.8363705277442932,
      "learning_rate": 2.9716312878216194e-07,
      "loss": 0.9538,
      "step": 615
    },
    {
      "epoch": 1.7681159420289854,
      "grad_norm": 0.9569475650787354,
      "learning_rate": 2.893684392229185e-07,
      "loss": 0.998,
      "step": 616
    },
    {
      "epoch": 1.7710144927536233,
      "grad_norm": 0.8830727338790894,
      "learning_rate": 2.8167429461888496e-07,
      "loss": 0.9277,
      "step": 617
    },
    {
      "epoch": 1.7739130434782608,
      "grad_norm": 0.9968934059143066,
      "learning_rate": 2.7408085919590265e-07,
      "loss": 1.0167,
      "step": 618
    },
    {
      "epoch": 1.7768115942028986,
      "grad_norm": 0.7348361611366272,
      "learning_rate": 2.6658829503024566e-07,
      "loss": 0.9224,
      "step": 619
    },
    {
      "epoch": 1.7797101449275363,
      "grad_norm": 0.9676991701126099,
      "learning_rate": 2.5919676204517073e-07,
      "loss": 0.9808,
      "step": 620
    },
    {
      "epoch": 1.7826086956521738,
      "grad_norm": 0.8737136125564575,
      "learning_rate": 2.5190641800749424e-07,
      "loss": 0.9436,
      "step": 621
    },
    {
      "epoch": 1.7855072463768116,
      "grad_norm": 0.8523948192596436,
      "learning_rate": 2.447174185242324e-07,
      "loss": 0.952,
      "step": 622
    },
    {
      "epoch": 1.7884057971014493,
      "grad_norm": 0.7342602610588074,
      "learning_rate": 2.3762991703927375e-07,
      "loss": 0.9682,
      "step": 623
    },
    {
      "epoch": 1.7913043478260868,
      "grad_norm": 1.044270634651184,
      "learning_rate": 2.3064406483010947e-07,
      "loss": 0.9725,
      "step": 624
    },
    {
      "epoch": 1.7942028985507248,
      "grad_norm": 0.9236974120140076,
      "learning_rate": 2.237600110046001e-07,
      "loss": 0.951,
      "step": 625
    },
    {
      "epoch": 1.7971014492753623,
      "grad_norm": 0.7988727688789368,
      "learning_rate": 2.1697790249779638e-07,
      "loss": 0.8851,
      "step": 626
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.7906875014305115,
      "learning_rate": 2.102978840687997e-07,
      "loss": 0.9162,
      "step": 627
    },
    {
      "epoch": 1.8028985507246378,
      "grad_norm": 0.7702775001525879,
      "learning_rate": 2.0372009829767558e-07,
      "loss": 0.9614,
      "step": 628
    },
    {
      "epoch": 1.8057971014492753,
      "grad_norm": 0.9317652583122253,
      "learning_rate": 1.9724468558240838e-07,
      "loss": 0.9105,
      "step": 629
    },
    {
      "epoch": 1.808695652173913,
      "grad_norm": 0.855368435382843,
      "learning_rate": 1.908717841359048e-07,
      "loss": 1.0019,
      "step": 630
    },
    {
      "epoch": 1.8115942028985508,
      "grad_norm": 0.761951744556427,
      "learning_rate": 1.8460152998304393e-07,
      "loss": 0.9267,
      "step": 631
    },
    {
      "epoch": 1.8144927536231883,
      "grad_norm": 0.8468912839889526,
      "learning_rate": 1.7843405695777582e-07,
      "loss": 1.0065,
      "step": 632
    },
    {
      "epoch": 1.8173913043478263,
      "grad_norm": 0.889159619808197,
      "learning_rate": 1.7236949670026037e-07,
      "loss": 0.9332,
      "step": 633
    },
    {
      "epoch": 1.8202898550724638,
      "grad_norm": 0.8339653015136719,
      "learning_rate": 1.664079786540629e-07,
      "loss": 0.9851,
      "step": 634
    },
    {
      "epoch": 1.8231884057971013,
      "grad_norm": 0.7670577764511108,
      "learning_rate": 1.6054963006338742e-07,
      "loss": 0.9354,
      "step": 635
    },
    {
      "epoch": 1.8260869565217392,
      "grad_norm": 0.8923590183258057,
      "learning_rate": 1.547945759703623e-07,
      "loss": 1.0162,
      "step": 636
    },
    {
      "epoch": 1.8289855072463768,
      "grad_norm": 0.7903847098350525,
      "learning_rate": 1.491429392123711e-07,
      "loss": 0.979,
      "step": 637
    },
    {
      "epoch": 1.8318840579710145,
      "grad_norm": 0.9351047873497009,
      "learning_rate": 1.435948404194304e-07,
      "loss": 0.9458,
      "step": 638
    },
    {
      "epoch": 1.8347826086956522,
      "grad_norm": 0.8081286549568176,
      "learning_rate": 1.3815039801161723e-07,
      "loss": 0.9246,
      "step": 639
    },
    {
      "epoch": 1.8376811594202898,
      "grad_norm": 0.752216100692749,
      "learning_rate": 1.328097281965357e-07,
      "loss": 0.9758,
      "step": 640
    },
    {
      "epoch": 1.8405797101449275,
      "grad_norm": 0.9659929871559143,
      "learning_rate": 1.2757294496684447e-07,
      "loss": 1.0107,
      "step": 641
    },
    {
      "epoch": 1.8434782608695652,
      "grad_norm": 1.0376217365264893,
      "learning_rate": 1.22440160097817e-07,
      "loss": 0.9631,
      "step": 642
    },
    {
      "epoch": 1.8463768115942027,
      "grad_norm": 0.9361832141876221,
      "learning_rate": 1.1741148314495965e-07,
      "loss": 0.9867,
      "step": 643
    },
    {
      "epoch": 1.8492753623188407,
      "grad_norm": 0.8664498329162598,
      "learning_rate": 1.1248702144167123e-07,
      "loss": 0.9703,
      "step": 644
    },
    {
      "epoch": 1.8521739130434782,
      "grad_norm": 0.9653159379959106,
      "learning_rate": 1.0766688009695548e-07,
      "loss": 0.9662,
      "step": 645
    },
    {
      "epoch": 1.855072463768116,
      "grad_norm": 1.0553069114685059,
      "learning_rate": 1.0295116199317057e-07,
      "loss": 0.9745,
      "step": 646
    },
    {
      "epoch": 1.8579710144927537,
      "grad_norm": 0.9453853964805603,
      "learning_rate": 9.833996778384259e-08,
      "loss": 0.9802,
      "step": 647
    },
    {
      "epoch": 1.8608695652173912,
      "grad_norm": 0.7949392795562744,
      "learning_rate": 9.383339589150776e-08,
      "loss": 0.9173,
      "step": 648
    },
    {
      "epoch": 1.863768115942029,
      "grad_norm": 0.7941511273384094,
      "learning_rate": 8.943154250562025e-08,
      "loss": 0.9633,
      "step": 649
    },
    {
      "epoch": 1.8666666666666667,
      "grad_norm": 0.8360518217086792,
      "learning_rate": 8.513450158049109e-08,
      "loss": 0.9565,
      "step": 650
    },
    {
      "epoch": 1.8695652173913042,
      "grad_norm": 0.9996237754821777,
      "learning_rate": 8.094236483329022e-08,
      "loss": 0.9999,
      "step": 651
    },
    {
      "epoch": 1.8724637681159422,
      "grad_norm": 0.7493065595626831,
      "learning_rate": 7.685522174208205e-08,
      "loss": 0.9733,
      "step": 652
    },
    {
      "epoch": 1.8753623188405797,
      "grad_norm": 0.8603729605674744,
      "learning_rate": 7.287315954392137e-08,
      "loss": 0.9624,
      "step": 653
    },
    {
      "epoch": 1.8782608695652174,
      "grad_norm": 0.7145766615867615,
      "learning_rate": 6.899626323298714e-08,
      "loss": 1.0049,
      "step": 654
    },
    {
      "epoch": 1.8811594202898552,
      "grad_norm": 0.9684036374092102,
      "learning_rate": 6.522461555877213e-08,
      "loss": 0.9562,
      "step": 655
    },
    {
      "epoch": 1.8840579710144927,
      "grad_norm": 0.8989734053611755,
      "learning_rate": 6.15582970243117e-08,
      "loss": 1.0268,
      "step": 656
    },
    {
      "epoch": 1.8869565217391304,
      "grad_norm": 0.9243214726448059,
      "learning_rate": 5.799738588447068e-08,
      "loss": 0.9643,
      "step": 657
    },
    {
      "epoch": 1.8898550724637682,
      "grad_norm": 0.9879785776138306,
      "learning_rate": 5.454195814427021e-08,
      "loss": 0.9417,
      "step": 658
    },
    {
      "epoch": 1.8927536231884057,
      "grad_norm": 0.9754204154014587,
      "learning_rate": 5.119208755726579e-08,
      "loss": 1.063,
      "step": 659
    },
    {
      "epoch": 1.8956521739130436,
      "grad_norm": 0.7662235498428345,
      "learning_rate": 4.794784562397459e-08,
      "loss": 0.9799,
      "step": 660
    },
    {
      "epoch": 1.8985507246376812,
      "grad_norm": 0.8312128782272339,
      "learning_rate": 4.4809301590345576e-08,
      "loss": 0.9671,
      "step": 661
    },
    {
      "epoch": 1.901449275362319,
      "grad_norm": 0.8354112505912781,
      "learning_rate": 4.177652244628627e-08,
      "loss": 0.9688,
      "step": 662
    },
    {
      "epoch": 1.9043478260869566,
      "grad_norm": 0.9401686191558838,
      "learning_rate": 3.884957292422997e-08,
      "loss": 0.9989,
      "step": 663
    },
    {
      "epoch": 1.9072463768115941,
      "grad_norm": 0.8864877820014954,
      "learning_rate": 3.602851549775521e-08,
      "loss": 1.0094,
      "step": 664
    },
    {
      "epoch": 1.9101449275362319,
      "grad_norm": 0.9440781474113464,
      "learning_rate": 3.3313410380250157e-08,
      "loss": 0.9544,
      "step": 665
    },
    {
      "epoch": 1.9130434782608696,
      "grad_norm": 1.0098837614059448,
      "learning_rate": 3.0704315523631956e-08,
      "loss": 0.9209,
      "step": 666
    },
    {
      "epoch": 1.9159420289855071,
      "grad_norm": 0.9735342860221863,
      "learning_rate": 2.8201286617103863e-08,
      "loss": 1.0385,
      "step": 667
    },
    {
      "epoch": 1.9188405797101449,
      "grad_norm": 0.9122427105903625,
      "learning_rate": 2.5804377085972278e-08,
      "loss": 0.9844,
      "step": 668
    },
    {
      "epoch": 1.9217391304347826,
      "grad_norm": 0.8491829633712769,
      "learning_rate": 2.351363809050211e-08,
      "loss": 1.0045,
      "step": 669
    },
    {
      "epoch": 1.9246376811594201,
      "grad_norm": 0.83339524269104,
      "learning_rate": 2.1329118524827662e-08,
      "loss": 0.9844,
      "step": 670
    },
    {
      "epoch": 1.927536231884058,
      "grad_norm": 0.9295774102210999,
      "learning_rate": 1.9250865015906784e-08,
      "loss": 1.0247,
      "step": 671
    },
    {
      "epoch": 1.9304347826086956,
      "grad_norm": 0.8484298586845398,
      "learning_rate": 1.7278921922527224e-08,
      "loss": 1.0195,
      "step": 672
    },
    {
      "epoch": 1.9333333333333333,
      "grad_norm": 0.8862564563751221,
      "learning_rate": 1.541333133436018e-08,
      "loss": 0.9827,
      "step": 673
    },
    {
      "epoch": 1.936231884057971,
      "grad_norm": 0.8401779532432556,
      "learning_rate": 1.3654133071059894e-08,
      "loss": 1.0295,
      "step": 674
    },
    {
      "epoch": 1.9391304347826086,
      "grad_norm": 0.8818807005882263,
      "learning_rate": 1.200136468141544e-08,
      "loss": 0.9554,
      "step": 675
    },
    {
      "epoch": 1.9420289855072463,
      "grad_norm": 0.8366807699203491,
      "learning_rate": 1.0455061442548597e-08,
      "loss": 0.9771,
      "step": 676
    },
    {
      "epoch": 1.944927536231884,
      "grad_norm": 0.8115973472595215,
      "learning_rate": 9.015256359161118e-09,
      "loss": 1.0364,
      "step": 677
    },
    {
      "epoch": 1.9478260869565216,
      "grad_norm": 0.925413191318512,
      "learning_rate": 7.681980162830283e-09,
      "loss": 1.0026,
      "step": 678
    },
    {
      "epoch": 1.9507246376811596,
      "grad_norm": 0.8799839615821838,
      "learning_rate": 6.455261311352767e-09,
      "loss": 1.0164,
      "step": 679
    },
    {
      "epoch": 1.953623188405797,
      "grad_norm": 0.8579555153846741,
      "learning_rate": 5.3351259881379016e-09,
      "loss": 0.9775,
      "step": 680
    },
    {
      "epoch": 1.9565217391304348,
      "grad_norm": 0.8572901487350464,
      "learning_rate": 4.321598101647007e-09,
      "loss": 0.9926,
      "step": 681
    },
    {
      "epoch": 1.9594202898550726,
      "grad_norm": 0.7731289863586426,
      "learning_rate": 3.41469928488547e-09,
      "loss": 1.0126,
      "step": 682
    },
    {
      "epoch": 1.96231884057971,
      "grad_norm": 0.937656581401825,
      "learning_rate": 2.6144488949392253e-09,
      "loss": 0.9443,
      "step": 683
    },
    {
      "epoch": 1.9652173913043478,
      "grad_norm": 0.8993798494338989,
      "learning_rate": 1.9208640125628618e-09,
      "loss": 0.946,
      "step": 684
    },
    {
      "epoch": 1.9681159420289855,
      "grad_norm": 0.9831903576850891,
      "learning_rate": 1.3339594418138036e-09,
      "loss": 0.9799,
      "step": 685
    },
    {
      "epoch": 1.971014492753623,
      "grad_norm": 0.9224021434783936,
      "learning_rate": 8.537477097364522e-10,
      "loss": 0.9299,
      "step": 686
    },
    {
      "epoch": 1.973913043478261,
      "grad_norm": 0.8220890760421753,
      "learning_rate": 4.802390660968437e-10,
      "loss": 1.0307,
      "step": 687
    },
    {
      "epoch": 1.9768115942028985,
      "grad_norm": 1.0893397331237793,
      "learning_rate": 2.1344148316060352e-10,
      "loss": 0.9523,
      "step": 688
    },
    {
      "epoch": 1.9797101449275363,
      "grad_norm": 0.8536267280578613,
      "learning_rate": 5.336065552641323e-11,
      "loss": 0.9675,
      "step": 689
    },
    {
      "epoch": 1.982608695652174,
      "grad_norm": 0.8123190999031067,
      "learning_rate": 0.0,
      "loss": 0.9576,
      "step": 690
    }
  ],
  "logging_steps": 1,
  "max_steps": 690,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 173,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.816855525560156e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}