{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.119799451652303,
  "eval_steps": 100,
  "global_step": 1024,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010932788971549125,
      "grad_norm": 167.07713317871094,
      "learning_rate": 0.0005999985601583006,
      "loss": 9.6698,
      "step": 10
    },
    {
      "epoch": 0.02186557794309825,
      "grad_norm": 83.92709350585938,
      "learning_rate": 0.0005998257958771109,
      "loss": 8.2484,
      "step": 20
    },
    {
      "epoch": 0.032798366914647374,
      "grad_norm": 127.91200256347656,
      "learning_rate": 0.0005993652532642609,
      "loss": 7.6452,
      "step": 30
    },
    {
      "epoch": 0.0437311558861965,
      "grad_norm": 97.33670043945312,
      "learning_rate": 0.0005986173743570491,
      "loss": 7.4548,
      "step": 40
    },
    {
      "epoch": 0.054663944857745624,
      "grad_norm": 127.0005874633789,
      "learning_rate": 0.0005975828769834513,
      "loss": 7.3226,
      "step": 50
    },
    {
      "epoch": 0.06559673382929475,
      "grad_norm": 104.47633361816406,
      "learning_rate": 0.0005962627540731365,
      "loss": 7.204,
      "step": 60
    },
    {
      "epoch": 0.07652952280084388,
      "grad_norm": 164.4477081298828,
      "learning_rate": 0.0005946582727044349,
      "loss": 7.1105,
      "step": 70
    },
    {
      "epoch": 0.087462311772393,
      "grad_norm": 126.8350601196289,
      "learning_rate": 0.0005927709728881719,
      "loss": 7.0511,
      "step": 80
    },
    {
      "epoch": 0.09839510074394213,
      "grad_norm": 158.55856323242188,
      "learning_rate": 0.0005906026660895383,
      "loss": 7.0642,
      "step": 90
    },
    {
      "epoch": 0.10932788971549125,
      "grad_norm": 126.1555404663086,
      "learning_rate": 0.0005881554334894116,
      "loss": 7.031,
      "step": 100
    },
    {
      "epoch": 0.10932788971549125,
      "eval_loss": 7.01555061340332,
      "eval_runtime": 79.0984,
      "eval_samples_per_second": 118.409,
      "eval_steps_per_second": 14.804,
      "step": 100
    },
    {
      "epoch": 0.12026067868704038,
      "grad_norm": 108.58393096923828,
      "learning_rate": 0.0005854316239868012,
      "loss": 7.0123,
      "step": 110
    },
    {
      "epoch": 0.1311934676585895,
      "grad_norm": 178.0326690673828,
      "learning_rate": 0.0005824338519443309,
      "loss": 6.9897,
      "step": 120
    },
    {
      "epoch": 0.14212625663013861,
      "grad_norm": 192.8655242919922,
      "learning_rate": 0.0005791649946789259,
      "loss": 7.0117,
      "step": 130
    },
    {
      "epoch": 0.15305904560168776,
      "grad_norm": 143.3759002685547,
      "learning_rate": 0.0005756281897001107,
      "loss": 7.0073,
      "step": 140
    },
    {
      "epoch": 0.16399183457323688,
      "grad_norm": 171.0679168701172,
      "learning_rate": 0.0005718268316985698,
      "loss": 6.9843,
      "step": 150
    },
    {
      "epoch": 0.174924623544786,
      "grad_norm": 164.86534118652344,
      "learning_rate": 0.0005677645692878606,
      "loss": 7.0083,
      "step": 160
    },
    {
      "epoch": 0.1858574125163351,
      "grad_norm": 125.85225677490234,
      "learning_rate": 0.000563445301502407,
      "loss": 7.02,
      "step": 170
    },
    {
      "epoch": 0.19679020148788426,
      "grad_norm": 144.15589904785156,
      "learning_rate": 0.0005588731740551344,
      "loss": 6.9773,
      "step": 180
    },
    {
      "epoch": 0.20772299045943338,
      "grad_norm": 108.05564880371094,
      "learning_rate": 0.0005540525753583378,
      "loss": 6.9632,
      "step": 190
    },
    {
      "epoch": 0.2186557794309825,
      "grad_norm": 146.53924560546875,
      "learning_rate": 0.0005489881323116018,
      "loss": 6.929,
      "step": 200
    },
    {
      "epoch": 0.2186557794309825,
      "eval_loss": 6.925621509552002,
      "eval_runtime": 78.9467,
      "eval_samples_per_second": 118.637,
      "eval_steps_per_second": 14.833,
      "step": 200
    },
    {
      "epoch": 0.2295885684025316,
      "grad_norm": 204.57968139648438,
      "learning_rate": 0.0005436847058608189,
      "loss": 6.9631,
      "step": 210
    },
    {
      "epoch": 0.24052135737408076,
      "grad_norm": 171.31556701660156,
      "learning_rate": 0.0005381473863325621,
      "loss": 7.0389,
      "step": 220
    },
    {
      "epoch": 0.25145414634562985,
      "grad_norm": 142.57449340820312,
      "learning_rate": 0.0005323814885482963,
      "loss": 6.967,
      "step": 230
    },
    {
      "epoch": 0.262386935317179,
      "grad_norm": 119.19646453857422,
      "learning_rate": 0.000526392546723115,
      "loss": 6.9456,
      "step": 240
    },
    {
      "epoch": 0.27331972428872814,
      "grad_norm": 153.62359619140625,
      "learning_rate": 0.0005201863091538979,
      "loss": 6.9686,
      "step": 250
    },
    {
      "epoch": 0.28425251326027723,
      "grad_norm": 150.35699462890625,
      "learning_rate": 0.000513768732701989,
      "loss": 6.9846,
      "step": 260
    },
    {
      "epoch": 0.2951853022318264,
      "grad_norm": 215.55368041992188,
      "learning_rate": 0.0005071459770756929,
      "loss": 6.9968,
      "step": 270
    },
    {
      "epoch": 0.3061180912033755,
      "grad_norm": 107.55154418945312,
      "learning_rate": 0.0005003243989180711,
      "loss": 7.0033,
      "step": 280
    },
    {
      "epoch": 0.3170508801749246,
      "grad_norm": 190.4154052734375,
      "learning_rate": 0.0004933105457057203,
      "loss": 6.9816,
      "step": 290
    },
    {
      "epoch": 0.32798366914647376,
      "grad_norm": 159.7703094482422,
      "learning_rate": 0.0004861111494643821,
      "loss": 7.0486,
      "step": 300
    },
    {
      "epoch": 0.32798366914647376,
      "eval_loss": 7.4869384765625,
      "eval_runtime": 79.1717,
      "eval_samples_per_second": 118.3,
      "eval_steps_per_second": 14.791,
      "step": 300
    },
    {
      "epoch": 0.3389164581180229,
      "grad_norm": 218.22604370117188,
      "learning_rate": 0.0004794787611927562,
      "loss": 7.2679,
      "step": 310
    },
    {
      "epoch": 0.349849247089572,
      "grad_norm": 182.51431274414062,
      "learning_rate": 0.0004719460124060748,
      "loss": 7.1809,
      "step": 320
    },
    {
      "epoch": 0.36078203606112114,
      "grad_norm": 137.0953826904297,
      "learning_rate": 0.0004642482266637136,
      "loss": 7.0417,
      "step": 330
    },
    {
      "epoch": 0.3717148250326702,
      "grad_norm": 92.07840728759766,
      "learning_rate": 0.0004563927924424775,
      "loss": 6.9309,
      "step": 340
    },
    {
      "epoch": 0.38264761400421937,
      "grad_norm": 147.35975646972656,
      "learning_rate": 0.00044838724953309093,
      "loss": 6.8844,
      "step": 350
    },
    {
      "epoch": 0.3935804029757685,
      "grad_norm": 262.996337890625,
      "learning_rate": 0.0004402392818033671,
      "loss": 6.966,
      "step": 360
    },
    {
      "epoch": 0.4045131919473176,
      "grad_norm": 155.3452606201172,
      "learning_rate": 0.00043195670982308984,
      "loss": 7.0715,
      "step": 370
    },
    {
      "epoch": 0.41544598091886675,
      "grad_norm": 129.5069580078125,
      "learning_rate": 0.00042354748335768664,
      "loss": 7.0806,
      "step": 380
    },
    {
      "epoch": 0.4263787698904159,
      "grad_norm": 92.96502685546875,
      "learning_rate": 0.0004150196737378971,
      "loss": 6.9999,
      "step": 390
    },
    {
      "epoch": 0.437311558861965,
      "grad_norm": 120.41193389892578,
      "learning_rate": 0.0004063814661127606,
      "loss": 6.9339,
      "step": 400
    },
    {
      "epoch": 0.437311558861965,
      "eval_loss": 6.931961536407471,
      "eval_runtime": 78.8373,
      "eval_samples_per_second": 118.802,
      "eval_steps_per_second": 14.853,
      "step": 400
    },
    {
      "epoch": 0.44824434783351413,
      "grad_norm": 188.7049560546875,
      "learning_rate": 0.00039764115159335935,
      "loss": 6.9242,
      "step": 410
    },
    {
      "epoch": 0.4591771368050632,
      "grad_norm": 131.7518768310547,
      "learning_rate": 0.0003888071192948565,
      "loss": 6.9815,
      "step": 420
    },
    {
      "epoch": 0.47010992577661237,
      "grad_norm": 247.91549682617188,
      "learning_rate": 0.0003798878482844695,
      "loss": 7.0838,
      "step": 430
    },
    {
      "epoch": 0.4810427147481615,
      "grad_norm": 135.4517364501953,
      "learning_rate": 0.000370891899443104,
      "loss": 7.1813,
      "step": 440
    },
    {
      "epoch": 0.4919755037197106,
      "grad_norm": 99.5172119140625,
      "learning_rate": 0.00036182790724846315,
      "loss": 7.1557,
      "step": 450
    },
    {
      "epoch": 0.5029082926912597,
      "grad_norm": 165.1914825439453,
      "learning_rate": 0.00035270457148751575,
      "loss": 7.0382,
      "step": 460
    },
    {
      "epoch": 0.5138410816628088,
      "grad_norm": 128.59959411621094,
      "learning_rate": 0.00034353064890628107,
      "loss": 7.0597,
      "step": 470
    },
    {
      "epoch": 0.524773870634358,
      "grad_norm": 142.37147521972656,
      "learning_rate": 0.00033431494480494175,
      "loss": 7.092,
      "step": 480
    },
    {
      "epoch": 0.5357066596059071,
      "grad_norm": 217.4059295654297,
      "learning_rate": 0.0003250663045863544,
      "loss": 7.0457,
      "step": 490
    },
    {
      "epoch": 0.5466394485774563,
      "grad_norm": 125.81988525390625,
      "learning_rate": 0.0003157936052660688,
      "loss": 7.0112,
      "step": 500
    },
    {
      "epoch": 0.5466394485774563,
      "eval_loss": 7.004736423492432,
      "eval_runtime": 78.8041,
      "eval_samples_per_second": 118.852,
      "eval_steps_per_second": 14.86,
      "step": 500
    },
    {
      "epoch": 0.5575722375490054,
      "grad_norm": 170.00523376464844,
      "learning_rate": 0.0003065057469520046,
      "loss": 7.0162,
      "step": 510
    },
    {
      "epoch": 0.5685050265205545,
      "grad_norm": 216.81466674804688,
      "learning_rate": 0.0002972116443019633,
      "loss": 7.0584,
      "step": 520
    },
    {
      "epoch": 0.5794378154921036,
      "grad_norm": 239.21087646484375,
      "learning_rate": 0.0002879202179671755,
      "loss": 7.1254,
      "step": 530
    },
    {
      "epoch": 0.5903706044636527,
      "grad_norm": 190.0070343017578,
      "learning_rate": 0.00027864038603009453,
      "loss": 7.1717,
      "step": 540
    },
    {
      "epoch": 0.6013033934352019,
      "grad_norm": 179.18785095214844,
      "learning_rate": 0.00026938105544465745,
      "loss": 7.1185,
      "step": 550
    },
    {
      "epoch": 0.612236182406751,
      "grad_norm": 279.44781494140625,
      "learning_rate": 0.0002601511134872255,
      "loss": 7.0727,
      "step": 560
    },
    {
      "epoch": 0.6231689713783001,
      "grad_norm": 227.90072631835938,
      "learning_rate": 0.0002509594192264121,
      "loss": 7.1088,
      "step": 570
    },
    {
      "epoch": 0.6341017603498492,
      "grad_norm": 173.11819458007812,
      "learning_rate": 0.0002418147950199862,
      "loss": 7.0927,
      "step": 580
    },
    {
      "epoch": 0.6450345493213984,
      "grad_norm": 164.40736389160156,
      "learning_rate": 0.00023272601804700946,
      "loss": 7.0701,
      "step": 590
    },
    {
      "epoch": 0.6559673382929475,
      "grad_norm": 123.35533142089844,
      "learning_rate": 0.0002237018118833387,
      "loss": 7.0496,
      "step": 600
    },
    {
      "epoch": 0.6559673382929475,
      "eval_loss": 7.052866458892822,
      "eval_runtime": 78.8887,
      "eval_samples_per_second": 118.724,
      "eval_steps_per_second": 14.844,
      "step": 600
    },
    {
      "epoch": 0.6669001272644967,
      "grad_norm": 225.67015075683594,
      "learning_rate": 0.0002147508381285762,
      "loss": 7.04,
      "step": 610
    },
    {
      "epoch": 0.6778329162360458,
      "grad_norm": 140.2364501953125,
      "learning_rate": 0.00020588168809250687,
      "loss": 7.0902,
      "step": 620
    },
    {
      "epoch": 0.6887657052075948,
      "grad_norm": 262.8550720214844,
      "learning_rate": 0.00019710287454900033,
      "loss": 7.1224,
      "step": 630
    },
    {
      "epoch": 0.699698494179144,
      "grad_norm": 150.97813415527344,
      "learning_rate": 0.00018842282356529402,
      "loss": 7.1802,
      "step": 640
    },
    {
      "epoch": 0.7106312831506931,
      "grad_norm": 452.73431396484375,
      "learning_rate": 0.00017984986641449754,
      "loss": 7.1497,
      "step": 650
    },
    {
      "epoch": 0.7215640721222423,
      "grad_norm": 138.37220764160156,
      "learning_rate": 0.00017139223157908368,
      "loss": 7.1715,
      "step": 660
    },
    {
      "epoch": 0.7324968610937914,
      "grad_norm": 144.21133422851562,
      "learning_rate": 0.00016305803685303906,
      "loss": 7.1458,
      "step": 670
    },
    {
      "epoch": 0.7434296500653405,
      "grad_norm": 142.4859161376953,
      "learning_rate": 0.00015485528155025473,
      "loss": 7.1041,
      "step": 680
    },
    {
      "epoch": 0.7543624390368896,
      "grad_norm": 190.189208984375,
      "learning_rate": 0.00014679183882663872,
      "loss": 7.0798,
      "step": 690
    },
    {
      "epoch": 0.7652952280084387,
      "grad_norm": 160.14442443847656,
      "learning_rate": 0.0001388754481233139,
      "loss": 7.074,
      "step": 700
    },
    {
      "epoch": 0.7652952280084387,
      "eval_loss": 7.0790934562683105,
      "eval_runtime": 79.0053,
      "eval_samples_per_second": 118.549,
      "eval_steps_per_second": 14.822,
      "step": 700
    },
    {
      "epoch": 0.7762280169799879,
      "grad_norm": 173.01499938964844,
      "learning_rate": 0.0001311137077381614,
      "loss": 7.0821,
      "step": 710
    },
    {
      "epoch": 0.787160805951537,
      "grad_norm": 156.1138458251953,
      "learning_rate": 0.00012351406753283216,
      "loss": 7.0838,
      "step": 720
    },
    {
      "epoch": 0.7980935949230861,
      "grad_norm": 161.9981689453125,
      "learning_rate": 0.00011681901904809884,
      "loss": 7.0639,
      "step": 730
    },
    {
      "epoch": 0.8090263838946352,
      "grad_norm": 174.0237579345703,
      "learning_rate": 0.00010954733067505213,
      "loss": 7.0604,
      "step": 740
    },
    {
      "epoch": 0.8199591728661844,
      "grad_norm": 141.823974609375,
      "learning_rate": 0.0001024584422885053,
      "loss": 7.0508,
      "step": 750
    },
    {
      "epoch": 0.8308919618377335,
      "grad_norm": 121.39106750488281,
      "learning_rate": 9.555915793434476e-05,
      "loss": 7.0568,
      "step": 760
    },
    {
      "epoch": 0.8418247508092827,
      "grad_norm": 178.37924194335938,
      "learning_rate": 8.885609967300851e-05,
      "loss": 7.0589,
      "step": 770
    },
    {
      "epoch": 0.8527575397808318,
      "grad_norm": 304.8969421386719,
      "learning_rate": 8.235570122350937e-05,
      "loss": 7.0582,
      "step": 780
    },
    {
      "epoch": 0.8636903287523808,
      "grad_norm": 128.75843811035156,
      "learning_rate": 7.606420178823293e-05,
      "loss": 7.0622,
      "step": 790
    },
    {
      "epoch": 0.87462311772393,
      "grad_norm": 88.88775634765625,
      "learning_rate": 6.998764006443615e-05,
      "loss": 7.0664,
      "step": 800
    },
    {
      "epoch": 0.87462311772393,
      "eval_loss": 7.048069477081299,
      "eval_runtime": 78.7086,
      "eval_samples_per_second": 118.996,
      "eval_steps_per_second": 14.878,
      "step": 800
    },
    {
      "epoch": 0.8855559066954791,
      "grad_norm": 131.33584594726562,
      "learning_rate": 6.413184844819423e-05,
      "loss": 7.0381,
      "step": 810
    },
    {
      "epoch": 0.8964886956670283,
      "grad_norm": 176.8515625,
      "learning_rate": 6e-05,
      "loss": 7.0461,
      "step": 820
    },
    {
      "epoch": 0.9074214846385774,
      "grad_norm": 128.32069396972656,
      "learning_rate": 6e-05,
      "loss": 7.0597,
      "step": 830
    },
    {
      "epoch": 0.9183542736101264,
      "grad_norm": 150.107421875,
      "learning_rate": 6e-05,
      "loss": 7.0582,
      "step": 840
    },
    {
      "epoch": 0.9292870625816756,
      "grad_norm": 174.95352172851562,
      "learning_rate": 6e-05,
      "loss": 7.0729,
      "step": 850
    },
    {
      "epoch": 0.9402198515532247,
      "grad_norm": 209.878173828125,
      "learning_rate": 6e-05,
      "loss": 7.0949,
      "step": 860
    },
    {
      "epoch": 0.9511526405247739,
      "grad_norm": 181.1326904296875,
      "learning_rate": 6e-05,
      "loss": 7.109,
      "step": 870
    },
    {
      "epoch": 0.962085429496323,
      "grad_norm": 197.11639404296875,
      "learning_rate": 6e-05,
      "loss": 7.1132,
      "step": 880
    },
    {
      "epoch": 0.9730182184678722,
      "grad_norm": 197.16473388671875,
      "learning_rate": 6e-05,
      "loss": 7.1008,
      "step": 890
    },
    {
      "epoch": 0.9839510074394212,
      "grad_norm": 224.1211395263672,
      "learning_rate": 6e-05,
      "loss": 7.1024,
      "step": 900
    },
    {
      "epoch": 0.9839510074394212,
      "eval_loss": 7.119234561920166,
      "eval_runtime": 78.4745,
      "eval_samples_per_second": 119.351,
      "eval_steps_per_second": 14.922,
      "step": 900
    },
    {
      "epoch": 0.9948837964109704,
      "grad_norm": 161.86753845214844,
      "learning_rate": 6e-05,
      "loss": 7.1127,
      "step": 910
    },
    {
      "epoch": 1.0060984463481923,
      "grad_norm": 247.6467742919922,
      "learning_rate": 6e-05,
      "loss": 7.1115,
      "step": 920
    },
    {
      "epoch": 1.0170312353197413,
      "grad_norm": 228.1467742919922,
      "learning_rate": 6e-05,
      "loss": 7.1172,
      "step": 930
    },
    {
      "epoch": 1.0279640242912904,
      "grad_norm": 400.675537109375,
      "learning_rate": 6e-05,
      "loss": 7.1351,
      "step": 940
    },
    {
      "epoch": 1.0388968132628396,
      "grad_norm": 293.3075866699219,
      "learning_rate": 6e-05,
      "loss": 7.1747,
      "step": 950
    },
    {
      "epoch": 1.0498296022343887,
      "grad_norm": 439.60760498046875,
      "learning_rate": 6e-05,
      "loss": 7.1955,
      "step": 960
    },
    {
      "epoch": 1.0607623912059378,
      "grad_norm": 336.15521240234375,
      "learning_rate": 6e-05,
      "loss": 7.2134,
      "step": 970
    },
    {
      "epoch": 1.071695180177487,
      "grad_norm": 232.90606689453125,
      "learning_rate": 6e-05,
      "loss": 7.2589,
      "step": 980
    },
    {
      "epoch": 1.0826279691490361,
      "grad_norm": 453.7010803222656,
      "learning_rate": 6e-05,
      "loss": 7.2537,
      "step": 990
    },
    {
      "epoch": 1.0935607581205853,
      "grad_norm": 156.7413330078125,
      "learning_rate": 6e-05,
      "loss": 7.2678,
      "step": 1000
    },
    {
      "epoch": 1.0935607581205853,
      "eval_loss": 7.271553993225098,
      "eval_runtime": 78.5452,
      "eval_samples_per_second": 119.243,
      "eval_steps_per_second": 14.909,
      "step": 1000
    },
    {
      "epoch": 1.1044935470921344,
      "grad_norm": 225.9600067138672,
      "learning_rate": 6e-05,
      "loss": 7.2489,
      "step": 1010
    },
    {
      "epoch": 1.1154263360636836,
      "grad_norm": 258.6958312988281,
      "learning_rate": 6e-05,
      "loss": 7.2224,
      "step": 1020
    }
  ],
  "logging_steps": 10,
  "max_steps": 1024,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 1024,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.383804151351214e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}