{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.119799451652303, "eval_steps": 100, "global_step": 1024, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010932788971549125, "grad_norm": 167.07713317871094, "learning_rate": 0.0005999985601583006, "loss": 9.6698, "step": 10 }, { "epoch": 0.02186557794309825, "grad_norm": 83.92709350585938, "learning_rate": 0.0005998257958771109, "loss": 8.2484, "step": 20 }, { "epoch": 0.032798366914647374, "grad_norm": 127.91200256347656, "learning_rate": 0.0005993652532642609, "loss": 7.6452, "step": 30 }, { "epoch": 0.0437311558861965, "grad_norm": 97.33670043945312, "learning_rate": 0.0005986173743570491, "loss": 7.4548, "step": 40 }, { "epoch": 0.054663944857745624, "grad_norm": 127.0005874633789, "learning_rate": 0.0005975828769834513, "loss": 7.3226, "step": 50 }, { "epoch": 0.06559673382929475, "grad_norm": 104.47633361816406, "learning_rate": 0.0005962627540731365, "loss": 7.204, "step": 60 }, { "epoch": 0.07652952280084388, "grad_norm": 164.4477081298828, "learning_rate": 0.0005946582727044349, "loss": 7.1105, "step": 70 }, { "epoch": 0.087462311772393, "grad_norm": 126.8350601196289, "learning_rate": 0.0005927709728881719, "loss": 7.0511, "step": 80 }, { "epoch": 0.09839510074394213, "grad_norm": 158.55856323242188, "learning_rate": 0.0005906026660895383, "loss": 7.0642, "step": 90 }, { "epoch": 0.10932788971549125, "grad_norm": 126.1555404663086, "learning_rate": 0.0005881554334894116, "loss": 7.031, "step": 100 }, { "epoch": 0.10932788971549125, "eval_loss": 7.01555061340332, "eval_runtime": 79.0984, "eval_samples_per_second": 118.409, "eval_steps_per_second": 14.804, "step": 100 }, { "epoch": 0.12026067868704038, "grad_norm": 108.58393096923828, "learning_rate": 0.0005854316239868012, "loss": 7.0123, "step": 110 }, { "epoch": 0.1311934676585895, "grad_norm": 178.0326690673828, "learning_rate": 0.0005824338519443309, "loss": 6.9897, "step": 120 }, { "epoch": 0.14212625663013861, "grad_norm": 192.8655242919922, "learning_rate": 0.0005791649946789259, "loss": 7.0117, "step": 130 }, { "epoch": 0.15305904560168776, "grad_norm": 143.3759002685547, "learning_rate": 0.0005756281897001107, "loss": 7.0073, "step": 140 }, { "epoch": 0.16399183457323688, "grad_norm": 171.0679168701172, "learning_rate": 0.0005718268316985698, "loss": 6.9843, "step": 150 }, { "epoch": 0.174924623544786, "grad_norm": 164.86534118652344, "learning_rate": 0.0005677645692878606, "loss": 7.0083, "step": 160 }, { "epoch": 0.1858574125163351, "grad_norm": 125.85225677490234, "learning_rate": 0.000563445301502407, "loss": 7.02, "step": 170 }, { "epoch": 0.19679020148788426, "grad_norm": 144.15589904785156, "learning_rate": 0.0005588731740551344, "loss": 6.9773, "step": 180 }, { "epoch": 0.20772299045943338, "grad_norm": 108.05564880371094, "learning_rate": 0.0005540525753583378, "loss": 6.9632, "step": 190 }, { "epoch": 0.2186557794309825, "grad_norm": 146.53924560546875, "learning_rate": 0.0005489881323116018, "loss": 6.929, "step": 200 }, { "epoch": 0.2186557794309825, "eval_loss": 6.925621509552002, "eval_runtime": 78.9467, "eval_samples_per_second": 118.637, "eval_steps_per_second": 14.833, "step": 200 }, { "epoch": 0.2295885684025316, "grad_norm": 204.57968139648438, "learning_rate": 0.0005436847058608189, "loss": 6.9631, "step": 210 }, { "epoch": 0.24052135737408076, "grad_norm": 171.31556701660156, "learning_rate": 0.0005381473863325621, "loss": 7.0389, "step": 220 }, { "epoch": 0.25145414634562985, "grad_norm": 142.57449340820312, "learning_rate": 0.0005323814885482963, "loss": 6.967, "step": 230 }, { "epoch": 0.262386935317179, "grad_norm": 119.19646453857422, "learning_rate": 0.000526392546723115, "loss": 6.9456, "step": 240 }, { "epoch": 0.27331972428872814, "grad_norm": 153.62359619140625, "learning_rate": 0.0005201863091538979, "loss": 6.9686, "step": 250 }, { "epoch": 0.28425251326027723, "grad_norm": 150.35699462890625, "learning_rate": 0.000513768732701989, "loss": 6.9846, "step": 260 }, { "epoch": 0.2951853022318264, "grad_norm": 215.55368041992188, "learning_rate": 0.0005071459770756929, "loss": 6.9968, "step": 270 }, { "epoch": 0.3061180912033755, "grad_norm": 107.55154418945312, "learning_rate": 0.0005003243989180711, "loss": 7.0033, "step": 280 }, { "epoch": 0.3170508801749246, "grad_norm": 190.4154052734375, "learning_rate": 0.0004933105457057203, "loss": 6.9816, "step": 290 }, { "epoch": 0.32798366914647376, "grad_norm": 159.7703094482422, "learning_rate": 0.0004861111494643821, "loss": 7.0486, "step": 300 }, { "epoch": 0.32798366914647376, "eval_loss": 7.4869384765625, "eval_runtime": 79.1717, "eval_samples_per_second": 118.3, "eval_steps_per_second": 14.791, "step": 300 }, { "epoch": 0.3389164581180229, "grad_norm": 218.22604370117188, "learning_rate": 0.0004794787611927562, "loss": 7.2679, "step": 310 }, { "epoch": 0.349849247089572, "grad_norm": 182.51431274414062, "learning_rate": 0.0004719460124060748, "loss": 7.1809, "step": 320 }, { "epoch": 0.36078203606112114, "grad_norm": 137.0953826904297, "learning_rate": 0.0004642482266637136, "loss": 7.0417, "step": 330 }, { "epoch": 0.3717148250326702, "grad_norm": 92.07840728759766, "learning_rate": 0.0004563927924424775, "loss": 6.9309, "step": 340 }, { "epoch": 0.38264761400421937, "grad_norm": 147.35975646972656, "learning_rate": 0.00044838724953309093, "loss": 6.8844, "step": 350 }, { "epoch": 0.3935804029757685, "grad_norm": 262.996337890625, "learning_rate": 0.0004402392818033671, "loss": 6.966, "step": 360 }, { "epoch": 0.4045131919473176, "grad_norm": 155.3452606201172, "learning_rate": 0.00043195670982308984, "loss": 7.0715, "step": 370 }, { "epoch": 0.41544598091886675, "grad_norm": 129.5069580078125, "learning_rate": 0.00042354748335768664, "loss": 7.0806, "step": 380 }, { "epoch": 0.4263787698904159, "grad_norm": 92.96502685546875, "learning_rate": 0.0004150196737378971, "loss": 6.9999, "step": 390 }, { "epoch": 0.437311558861965, "grad_norm": 120.41193389892578, "learning_rate": 0.0004063814661127606, "loss": 6.9339, "step": 400 }, { "epoch": 0.437311558861965, "eval_loss": 6.931961536407471, "eval_runtime": 78.8373, "eval_samples_per_second": 118.802, "eval_steps_per_second": 14.853, "step": 400 }, { "epoch": 0.44824434783351413, "grad_norm": 188.7049560546875, "learning_rate": 0.00039764115159335935, "loss": 6.9242, "step": 410 }, { "epoch": 0.4591771368050632, "grad_norm": 131.7518768310547, "learning_rate": 0.0003888071192948565, "loss": 6.9815, "step": 420 }, { "epoch": 0.47010992577661237, "grad_norm": 247.91549682617188, "learning_rate": 0.0003798878482844695, "loss": 7.0838, "step": 430 }, { "epoch": 0.4810427147481615, "grad_norm": 135.4517364501953, "learning_rate": 0.000370891899443104, "loss": 7.1813, "step": 440 }, { "epoch": 0.4919755037197106, "grad_norm": 99.5172119140625, "learning_rate": 0.00036182790724846315, "loss": 7.1557, "step": 450 }, { "epoch": 0.5029082926912597, "grad_norm": 165.1914825439453, "learning_rate": 0.00035270457148751575, "loss": 7.0382, "step": 460 }, { "epoch": 0.5138410816628088, "grad_norm": 128.59959411621094, "learning_rate": 0.00034353064890628107, "loss": 7.0597, "step": 470 }, { "epoch": 0.524773870634358, "grad_norm": 142.37147521972656, "learning_rate": 0.00033431494480494175, "loss": 7.092, "step": 480 }, { "epoch": 0.5357066596059071, "grad_norm": 217.4059295654297, "learning_rate": 0.0003250663045863544, "loss": 7.0457, "step": 490 }, { "epoch": 0.5466394485774563, "grad_norm": 125.81988525390625, "learning_rate": 0.0003157936052660688, "loss": 7.0112, "step": 500 }, { "epoch": 0.5466394485774563, "eval_loss": 7.004736423492432, "eval_runtime": 78.8041, "eval_samples_per_second": 118.852, "eval_steps_per_second": 14.86, "step": 500 }, { "epoch": 0.5575722375490054, "grad_norm": 170.00523376464844, "learning_rate": 0.0003065057469520046, "loss": 7.0162, "step": 510 }, { "epoch": 0.5685050265205545, "grad_norm": 216.81466674804688, "learning_rate": 0.0002972116443019633, "loss": 7.0584, "step": 520 }, { "epoch": 0.5794378154921036, "grad_norm": 239.21087646484375, "learning_rate": 0.0002879202179671755, "loss": 7.1254, "step": 530 }, { "epoch": 0.5903706044636527, "grad_norm": 190.0070343017578, "learning_rate": 0.00027864038603009453, "loss": 7.1717, "step": 540 }, { "epoch": 0.6013033934352019, "grad_norm": 179.18785095214844, "learning_rate": 0.00026938105544465745, "loss": 7.1185, "step": 550 }, { "epoch": 0.612236182406751, "grad_norm": 279.44781494140625, "learning_rate": 0.0002601511134872255, "loss": 7.0727, "step": 560 }, { "epoch": 0.6231689713783001, "grad_norm": 227.90072631835938, "learning_rate": 0.0002509594192264121, "loss": 7.1088, "step": 570 }, { "epoch": 0.6341017603498492, "grad_norm": 173.11819458007812, "learning_rate": 0.0002418147950199862, "loss": 7.0927, "step": 580 }, { "epoch": 0.6450345493213984, "grad_norm": 164.40736389160156, "learning_rate": 0.00023272601804700946, "loss": 7.0701, "step": 590 }, { "epoch": 0.6559673382929475, "grad_norm": 123.35533142089844, "learning_rate": 0.0002237018118833387, "loss": 7.0496, "step": 600 }, { "epoch": 0.6559673382929475, "eval_loss": 7.052866458892822, "eval_runtime": 78.8887, "eval_samples_per_second": 118.724, "eval_steps_per_second": 14.844, "step": 600 }, { "epoch": 0.6669001272644967, "grad_norm": 225.67015075683594, "learning_rate": 0.0002147508381285762, "loss": 7.04, "step": 610 }, { "epoch": 0.6778329162360458, "grad_norm": 140.2364501953125, "learning_rate": 0.00020588168809250687, "loss": 7.0902, "step": 620 }, { "epoch": 0.6887657052075948, "grad_norm": 262.8550720214844, "learning_rate": 0.00019710287454900033, "loss": 7.1224, "step": 630 }, { "epoch": 0.699698494179144, "grad_norm": 150.97813415527344, "learning_rate": 0.00018842282356529402, "loss": 7.1802, "step": 640 }, { "epoch": 0.7106312831506931, "grad_norm": 452.73431396484375, "learning_rate": 0.00017984986641449754, "loss": 7.1497, "step": 650 }, { "epoch": 0.7215640721222423, "grad_norm": 138.37220764160156, "learning_rate": 0.00017139223157908368, "loss": 7.1715, "step": 660 }, { "epoch": 0.7324968610937914, "grad_norm": 144.21133422851562, "learning_rate": 0.00016305803685303906, "loss": 7.1458, "step": 670 }, { "epoch": 0.7434296500653405, "grad_norm": 142.4859161376953, "learning_rate": 0.00015485528155025473, "loss": 7.1041, "step": 680 }, { "epoch": 0.7543624390368896, "grad_norm": 190.189208984375, "learning_rate": 0.00014679183882663872, "loss": 7.0798, "step": 690 }, { "epoch": 0.7652952280084387, "grad_norm": 160.14442443847656, "learning_rate": 0.0001388754481233139, "loss": 7.074, "step": 700 }, { "epoch": 0.7652952280084387, "eval_loss": 7.0790934562683105, "eval_runtime": 79.0053, "eval_samples_per_second": 118.549, "eval_steps_per_second": 14.822, "step": 700 }, { "epoch": 0.7762280169799879, "grad_norm": 173.01499938964844, "learning_rate": 0.0001311137077381614, "loss": 7.0821, "step": 710 }, { "epoch": 0.787160805951537, "grad_norm": 156.1138458251953, "learning_rate": 0.00012351406753283216, "loss": 7.0838, "step": 720 }, { "epoch": 0.7980935949230861, "grad_norm": 161.9981689453125, "learning_rate": 0.00011681901904809884, "loss": 7.0639, "step": 730 }, { "epoch": 0.8090263838946352, "grad_norm": 174.0237579345703, "learning_rate": 0.00010954733067505213, "loss": 7.0604, "step": 740 }, { "epoch": 0.8199591728661844, "grad_norm": 141.823974609375, "learning_rate": 0.0001024584422885053, "loss": 7.0508, "step": 750 }, { "epoch": 0.8308919618377335, "grad_norm": 121.39106750488281, "learning_rate": 9.555915793434476e-05, "loss": 7.0568, "step": 760 }, { "epoch": 0.8418247508092827, "grad_norm": 178.37924194335938, "learning_rate": 8.885609967300851e-05, "loss": 7.0589, "step": 770 }, { "epoch": 0.8527575397808318, "grad_norm": 304.8969421386719, "learning_rate": 8.235570122350937e-05, "loss": 7.0582, "step": 780 }, { "epoch": 0.8636903287523808, "grad_norm": 128.75843811035156, "learning_rate": 7.606420178823293e-05, "loss": 7.0622, "step": 790 }, { "epoch": 0.87462311772393, "grad_norm": 88.88775634765625, "learning_rate": 6.998764006443615e-05, "loss": 7.0664, "step": 800 }, { "epoch": 0.87462311772393, "eval_loss": 7.048069477081299, "eval_runtime": 78.7086, "eval_samples_per_second": 118.996, "eval_steps_per_second": 14.878, "step": 800 }, { "epoch": 0.8855559066954791, "grad_norm": 131.33584594726562, "learning_rate": 6.413184844819423e-05, "loss": 7.0381, "step": 810 }, { "epoch": 0.8964886956670283, "grad_norm": 176.8515625, "learning_rate": 6e-05, "loss": 7.0461, "step": 820 }, { "epoch": 0.9074214846385774, "grad_norm": 128.32069396972656, "learning_rate": 6e-05, "loss": 7.0597, "step": 830 }, { "epoch": 0.9183542736101264, "grad_norm": 150.107421875, "learning_rate": 6e-05, "loss": 7.0582, "step": 840 }, { "epoch": 0.9292870625816756, "grad_norm": 174.95352172851562, "learning_rate": 6e-05, "loss": 7.0729, "step": 850 }, { "epoch": 0.9402198515532247, "grad_norm": 209.878173828125, "learning_rate": 6e-05, "loss": 7.0949, "step": 860 }, { "epoch": 0.9511526405247739, "grad_norm": 181.1326904296875, "learning_rate": 6e-05, "loss": 7.109, "step": 870 }, { "epoch": 0.962085429496323, "grad_norm": 197.11639404296875, "learning_rate": 6e-05, "loss": 7.1132, "step": 880 }, { "epoch": 0.9730182184678722, "grad_norm": 197.16473388671875, "learning_rate": 6e-05, "loss": 7.1008, "step": 890 }, { "epoch": 0.9839510074394212, "grad_norm": 224.1211395263672, "learning_rate": 6e-05, "loss": 7.1024, "step": 900 }, { "epoch": 0.9839510074394212, "eval_loss": 7.119234561920166, "eval_runtime": 78.4745, "eval_samples_per_second": 119.351, "eval_steps_per_second": 14.922, "step": 900 }, { "epoch": 0.9948837964109704, "grad_norm": 161.86753845214844, "learning_rate": 6e-05, "loss": 7.1127, "step": 910 }, { "epoch": 1.0060984463481923, "grad_norm": 247.6467742919922, "learning_rate": 6e-05, "loss": 7.1115, "step": 920 }, { "epoch": 1.0170312353197413, "grad_norm": 228.1467742919922, "learning_rate": 6e-05, "loss": 7.1172, "step": 930 }, { "epoch": 1.0279640242912904, "grad_norm": 400.675537109375, "learning_rate": 6e-05, "loss": 7.1351, "step": 940 }, { "epoch": 1.0388968132628396, "grad_norm": 293.3075866699219, "learning_rate": 6e-05, "loss": 7.1747, "step": 950 }, { "epoch": 1.0498296022343887, "grad_norm": 439.60760498046875, "learning_rate": 6e-05, "loss": 7.1955, "step": 960 }, { "epoch": 1.0607623912059378, "grad_norm": 336.15521240234375, "learning_rate": 6e-05, "loss": 7.2134, "step": 970 }, { "epoch": 1.071695180177487, "grad_norm": 232.90606689453125, "learning_rate": 6e-05, "loss": 7.2589, "step": 980 }, { "epoch": 1.0826279691490361, "grad_norm": 453.7010803222656, "learning_rate": 6e-05, "loss": 7.2537, "step": 990 }, { "epoch": 1.0935607581205853, "grad_norm": 156.7413330078125, "learning_rate": 6e-05, "loss": 7.2678, "step": 1000 }, { "epoch": 1.0935607581205853, "eval_loss": 7.271553993225098, "eval_runtime": 78.5452, "eval_samples_per_second": 119.243, "eval_steps_per_second": 14.909, "step": 1000 }, { "epoch": 1.1044935470921344, "grad_norm": 225.9600067138672, "learning_rate": 6e-05, "loss": 7.2489, "step": 1010 }, { "epoch": 1.1154263360636836, "grad_norm": 258.6958312988281, "learning_rate": 6e-05, "loss": 7.2224, "step": 1020 } ], "logging_steps": 10, "max_steps": 1024, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.383804151351214e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }