{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.994667614646285, "eval_steps": 1000, "global_step": 1053, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028439388553146108, "grad_norm": 2.7864015102386475, "learning_rate": 9.433962264150944e-06, "loss": 3.2455, "step": 10 }, { "epoch": 0.056878777106292217, "grad_norm": 0.8451234698295593, "learning_rate": 1.8867924528301888e-05, "loss": 2.8076, "step": 20 }, { "epoch": 0.08531816565943832, "grad_norm": 0.4598900079727173, "learning_rate": 2.830188679245283e-05, "loss": 2.5631, "step": 30 }, { "epoch": 0.11375755421258443, "grad_norm": 0.3573364317417145, "learning_rate": 3.7735849056603776e-05, "loss": 2.4723, "step": 40 }, { "epoch": 0.14219694276573053, "grad_norm": 0.3167368173599243, "learning_rate": 4.716981132075472e-05, "loss": 2.3806, "step": 50 }, { "epoch": 0.17063633131887665, "grad_norm": 0.3523406982421875, "learning_rate": 4.999395511092461e-05, "loss": 2.3182, "step": 60 }, { "epoch": 0.19907571987202274, "grad_norm": 0.3779957592487335, "learning_rate": 4.996435452798774e-05, "loss": 2.2473, "step": 70 }, { "epoch": 0.22751510842516887, "grad_norm": 0.4943179786205292, "learning_rate": 4.991011714111481e-05, "loss": 2.199, "step": 80 }, { "epoch": 0.25595449697831496, "grad_norm": 0.42343392968177795, "learning_rate": 4.9831296476058484e-05, "loss": 2.187, "step": 90 }, { "epoch": 0.28439388553146105, "grad_norm": 0.4268713593482971, "learning_rate": 4.9727970319299044e-05, "loss": 2.1194, "step": 100 }, { "epoch": 0.3128332740846072, "grad_norm": 0.48644664883613586, "learning_rate": 4.9600240641278496e-05, "loss": 2.1086, "step": 110 }, { "epoch": 0.3412726626377533, "grad_norm": 0.48994767665863037, "learning_rate": 4.944823349576805e-05, "loss": 2.0861, "step": 120 }, { "epoch": 0.3697120511908994, "grad_norm": 0.5868293642997742, "learning_rate": 4.9272098895468277e-05, "loss": 2.0809, "step": 130 }, { "epoch": 0.3981514397440455, "grad_norm": 0.5047050714492798, "learning_rate": 4.907201066396469e-05, "loss": 2.0588, "step": 140 }, { "epoch": 0.42659082829719164, "grad_norm": 0.4889000654220581, "learning_rate": 4.8848166264184844e-05, "loss": 2.019, "step": 150 }, { "epoch": 0.45503021685033773, "grad_norm": 0.5262014269828796, "learning_rate": 4.860078660352625e-05, "loss": 2.0294, "step": 160 }, { "epoch": 0.4834696054034838, "grad_norm": 0.4898081123828888, "learning_rate": 4.8330115815847465e-05, "loss": 1.9942, "step": 170 }, { "epoch": 0.5119089939566299, "grad_norm": 0.5089215040206909, "learning_rate": 4.803642102053746e-05, "loss": 1.9974, "step": 180 }, { "epoch": 0.540348382509776, "grad_norm": 0.5131760239601135, "learning_rate": 4.7719992058901006e-05, "loss": 1.9876, "step": 190 }, { "epoch": 0.5687877710629221, "grad_norm": 0.4940701723098755, "learning_rate": 4.7381141208120296e-05, "loss": 1.9438, "step": 200 }, { "epoch": 0.5972271596160682, "grad_norm": 0.5069682002067566, "learning_rate": 4.702020287307509e-05, "loss": 1.9668, "step": 210 }, { "epoch": 0.6256665481692144, "grad_norm": 0.5188295245170593, "learning_rate": 4.663753325632548e-05, "loss": 1.961, "step": 220 }, { "epoch": 0.6541059367223605, "grad_norm": 0.5184710025787354, "learning_rate": 4.6233510006582914e-05, "loss": 1.9327, "step": 230 }, { "epoch": 0.6825453252755066, "grad_norm": 0.5142987370491028, "learning_rate": 4.580853184601659e-05, "loss": 1.9415, "step": 240 }, { "epoch": 0.7109847138286527, "grad_norm": 0.5494813919067383, 
"learning_rate": 4.536301817676274e-05, "loss": 1.918, "step": 250 }, { "epoch": 0.7394241023817988, "grad_norm": 0.49111875891685486, "learning_rate": 4.48974086670254e-05, "loss": 1.9406, "step": 260 }, { "epoch": 0.7678634909349449, "grad_norm": 0.513344407081604, "learning_rate": 4.4412162817176965e-05, "loss": 1.9019, "step": 270 }, { "epoch": 0.796302879488091, "grad_norm": 0.5305178761482239, "learning_rate": 4.39077595062868e-05, "loss": 1.8887, "step": 280 }, { "epoch": 0.8247422680412371, "grad_norm": 0.5225845575332642, "learning_rate": 4.33846965195254e-05, "loss": 1.8942, "step": 290 }, { "epoch": 0.8531816565943833, "grad_norm": 0.5039849281311035, "learning_rate": 4.2843490056910534e-05, "loss": 1.8961, "step": 300 }, { "epoch": 0.8816210451475294, "grad_norm": 0.5015091896057129, "learning_rate": 4.228467422388016e-05, "loss": 1.9114, "step": 310 }, { "epoch": 0.9100604337006755, "grad_norm": 0.4691685140132904, "learning_rate": 4.1708800504194827e-05, "loss": 1.9162, "step": 320 }, { "epoch": 0.9384998222538216, "grad_norm": 0.48344337940216064, "learning_rate": 4.1116437215689784e-05, "loss": 1.8799, "step": 330 }, { "epoch": 0.9669392108069677, "grad_norm": 0.5020110011100769, "learning_rate": 4.0508168949413906e-05, "loss": 1.8565, "step": 340 }, { "epoch": 0.9953785993601137, "grad_norm": 0.514559268951416, "learning_rate": 3.988459599270888e-05, "loss": 1.9027, "step": 350 }, { "epoch": 1.0238179879132598, "grad_norm": 0.5024710893630981, "learning_rate": 3.9246333736798095e-05, "loss": 1.9138, "step": 360 }, { "epoch": 1.052257376466406, "grad_norm": 0.515842854976654, "learning_rate": 3.859401206946982e-05, "loss": 1.813, "step": 370 }, { "epoch": 1.080696765019552, "grad_norm": 0.5406286716461182, "learning_rate": 3.792827475345393e-05, "loss": 1.8395, "step": 380 }, { "epoch": 1.1091361535726982, "grad_norm": 0.5171347260475159, "learning_rate": 3.724977879110591e-05, "loss": 1.8314, "step": 390 }, { "epoch": 1.1375755421258442, "grad_norm": 0.5132310390472412, "learning_rate": 3.6559193776024794e-05, "loss": 1.8241, "step": 400 }, { "epoch": 1.1660149306789904, "grad_norm": 0.5178537368774414, "learning_rate": 3.585720123224512e-05, "loss": 1.8178, "step": 410 }, { "epoch": 1.1944543192321366, "grad_norm": 0.5596044063568115, "learning_rate": 3.5144493941655e-05, "loss": 1.8174, "step": 420 }, { "epoch": 1.2228937077852826, "grad_norm": 0.5178967714309692, "learning_rate": 3.442177526030407e-05, "loss": 1.7867, "step": 430 }, { "epoch": 1.2513330963384286, "grad_norm": 0.5135601758956909, "learning_rate": 3.3689758424275926e-05, "loss": 1.791, "step": 440 }, { "epoch": 1.2797724848915748, "grad_norm": 0.5257358551025391, "learning_rate": 3.294916584581027e-05, "loss": 1.8153, "step": 450 }, { "epoch": 1.308211873444721, "grad_norm": 0.5272982716560364, "learning_rate": 3.220072840036923e-05, "loss": 1.7952, "step": 460 }, { "epoch": 1.336651261997867, "grad_norm": 0.5305171012878418, "learning_rate": 3.14451847053515e-05, "loss": 1.7962, "step": 470 }, { "epoch": 1.3650906505510132, "grad_norm": 0.5360648036003113, "learning_rate": 3.068328039116616e-05, "loss": 1.8002, "step": 480 }, { "epoch": 1.3935300391041592, "grad_norm": 0.5958048105239868, "learning_rate": 2.99157673653855e-05, "loss": 1.8017, "step": 490 }, { "epoch": 1.4219694276573054, "grad_norm": 0.5489828586578369, "learning_rate": 2.9143403070702997e-05, "loss": 1.7931, "step": 500 }, { "epoch": 1.4504088162104516, "grad_norm": 0.5472132563591003, "learning_rate": 2.8366949737428817e-05, "loss": 
1.8051, "step": 510 }, { "epoch": 1.4788482047635976, "grad_norm": 0.5372362732887268, "learning_rate": 2.7587173631260566e-05, "loss": 1.7962, "step": 520 }, { "epoch": 1.5072875933167436, "grad_norm": 0.5616655349731445, "learning_rate": 2.6804844297071526e-05, "loss": 1.7763, "step": 530 }, { "epoch": 1.5357269818698898, "grad_norm": 0.5398069620132446, "learning_rate": 2.6020733799462754e-05, "loss": 1.7808, "step": 540 }, { "epoch": 1.564166370423036, "grad_norm": 0.5471286773681641, "learning_rate": 2.5235615960828605e-05, "loss": 1.7836, "step": 550 }, { "epoch": 1.5926057589761822, "grad_norm": 0.5741537809371948, "learning_rate": 2.4450265597687376e-05, "loss": 1.8075, "step": 560 }, { "epoch": 1.6210451475293282, "grad_norm": 0.5587407946586609, "learning_rate": 2.3665457756030988e-05, "loss": 1.7669, "step": 570 }, { "epoch": 1.6494845360824741, "grad_norm": 0.5349502563476562, "learning_rate": 2.2881966946448167e-05, "loss": 1.7712, "step": 580 }, { "epoch": 1.6779239246356203, "grad_norm": 0.5548702478408813, "learning_rate": 2.2100566379775967e-05, "loss": 1.7669, "step": 590 }, { "epoch": 1.7063633131887666, "grad_norm": 0.5402134656906128, "learning_rate": 2.1322027204034066e-05, "loss": 1.7754, "step": 600 }, { "epoch": 1.7348027017419125, "grad_norm": 0.5542489290237427, "learning_rate": 2.0547117743394744e-05, "loss": 1.7959, "step": 610 }, { "epoch": 1.7632420902950585, "grad_norm": 0.6034718751907349, "learning_rate": 1.9776602739939714e-05, "loss": 1.7631, "step": 620 }, { "epoch": 1.7916814788482047, "grad_norm": 0.5417160987854004, "learning_rate": 1.9011242598951962e-05, "loss": 1.7681, "step": 630 }, { "epoch": 1.820120867401351, "grad_norm": 0.5751153826713562, "learning_rate": 1.8251792638487596e-05, "loss": 1.7717, "step": 640 }, { "epoch": 1.8485602559544971, "grad_norm": 0.517077624797821, "learning_rate": 1.7499002343968098e-05, "loss": 1.7571, "step": 650 }, { "epoch": 1.8769996445076431, "grad_norm": 0.5594078302383423, "learning_rate": 1.675361462852868e-05, "loss": 1.7528, "step": 660 }, { "epoch": 1.905439033060789, "grad_norm": 0.5302609801292419, "learning_rate": 1.6016365099852735e-05, "loss": 1.7454, "step": 670 }, { "epoch": 1.9338784216139353, "grad_norm": 0.5613446235656738, "learning_rate": 1.528798133421585e-05, "loss": 1.7555, "step": 680 }, { "epoch": 1.9623178101670815, "grad_norm": 0.5346989631652832, "learning_rate": 1.4569182158455875e-05, "loss": 1.7334, "step": 690 }, { "epoch": 1.9907571987202275, "grad_norm": 0.5548744201660156, "learning_rate": 1.3860676940577594e-05, "loss": 1.7764, "step": 700 }, { "epoch": 2.0191965872733735, "grad_norm": 0.6024349331855774, "learning_rate": 1.3163164889692197e-05, "loss": 1.7899, "step": 710 }, { "epoch": 2.0476359758265197, "grad_norm": 0.583003044128418, "learning_rate": 1.2477334365982248e-05, "loss": 1.7026, "step": 720 }, { "epoch": 2.076075364379666, "grad_norm": 0.572210967540741, "learning_rate": 1.1803862201373342e-05, "loss": 1.6817, "step": 730 }, { "epoch": 2.104514752932812, "grad_norm": 0.567309558391571, "learning_rate": 1.1143413031582645e-05, "loss": 1.702, "step": 740 }, { "epoch": 2.132954141485958, "grad_norm": 0.5702211856842041, "learning_rate": 1.0496638640203774e-05, "loss": 1.7001, "step": 750 }, { "epoch": 2.161393530039104, "grad_norm": 0.598107635974884, "learning_rate": 9.864177315474968e-06, "loss": 1.6904, "step": 760 }, { "epoch": 2.1898329185922503, "grad_norm": 0.5752361416816711, "learning_rate": 9.246653220365778e-06, "loss": 1.7187, "step": 770 }, { "epoch": 
2.2182723071453965, "grad_norm": 0.5880258679389954, "learning_rate": 8.644675776603476e-06, "loss": 1.6973, "step": 780 }, { "epoch": 2.2467116956985427, "grad_norm": 0.5714329481124878, "learning_rate": 8.058839063247447e-06, "loss": 1.706, "step": 790 }, { "epoch": 2.2751510842516884, "grad_norm": 0.6069587469100952, "learning_rate": 7.489721230404842e-06, "loss": 1.7323, "step": 800 }, { "epoch": 2.3035904728048346, "grad_norm": 0.5789757966995239, "learning_rate": 6.937883928666255e-06, "loss": 1.7076, "step": 810 }, { "epoch": 2.332029861357981, "grad_norm": 0.6063619256019592, "learning_rate": 6.403871754824373e-06, "loss": 1.6819, "step": 820 }, { "epoch": 2.360469249911127, "grad_norm": 0.5955121517181396, "learning_rate": 5.8882117144227115e-06, "loss": 1.6991, "step": 830 }, { "epoch": 2.3889086384642733, "grad_norm": 0.5998035073280334, "learning_rate": 5.391412701664744e-06, "loss": 1.6747, "step": 840 }, { "epoch": 2.417348027017419, "grad_norm": 0.6063375473022461, "learning_rate": 4.91396499719681e-06, "loss": 1.7041, "step": 850 }, { "epoch": 2.4457874155705652, "grad_norm": 0.6151806712150574, "learning_rate": 4.456339784260247e-06, "loss": 1.6868, "step": 860 }, { "epoch": 2.4742268041237114, "grad_norm": 0.5840434432029724, "learning_rate": 4.018988683690461e-06, "loss": 1.6757, "step": 870 }, { "epoch": 2.502666192676857, "grad_norm": 0.5928480625152588, "learning_rate": 3.6023433082216755e-06, "loss": 1.702, "step": 880 }, { "epoch": 2.5311055812300034, "grad_norm": 0.5977081656455994, "learning_rate": 3.2068148365372806e-06, "loss": 1.7001, "step": 890 }, { "epoch": 2.5595449697831496, "grad_norm": 0.6291081309318542, "learning_rate": 2.832793607486087e-06, "loss": 1.7146, "step": 900 }, { "epoch": 2.587984358336296, "grad_norm": 0.6002483367919922, "learning_rate": 2.4806487348650485e-06, "loss": 1.6753, "step": 910 }, { "epoch": 2.616423746889442, "grad_norm": 0.5828536748886108, "learning_rate": 2.150727743148473e-06, "loss": 1.6863, "step": 920 }, { "epoch": 2.6448631354425878, "grad_norm": 0.6066195964813232, "learning_rate": 1.8433562245233349e-06, "loss": 1.658, "step": 930 }, { "epoch": 2.673302523995734, "grad_norm": 0.6164836287498474, "learning_rate": 1.5588375175691117e-06, "loss": 1.6957, "step": 940 }, { "epoch": 2.70174191254888, "grad_norm": 0.6047420501708984, "learning_rate": 1.2974524078991995e-06, "loss": 1.677, "step": 950 }, { "epoch": 2.7301813011020264, "grad_norm": 0.5752055048942566, "learning_rate": 1.0594588510594445e-06, "loss": 1.6802, "step": 960 }, { "epoch": 2.7586206896551726, "grad_norm": 0.5959452986717224, "learning_rate": 8.450917179571305e-07, "loss": 1.6897, "step": 970 }, { "epoch": 2.7870600782083184, "grad_norm": 0.5909944176673889, "learning_rate": 6.545625630717783e-07, "loss": 1.6916, "step": 980 }, { "epoch": 2.8154994667614646, "grad_norm": 0.601028323173523, "learning_rate": 4.880594156763896e-07, "loss": 1.7078, "step": 990 }, { "epoch": 2.8439388553146108, "grad_norm": 0.5776922702789307, "learning_rate": 3.4574659427528133e-07, "loss": 1.6961, "step": 1000 }, { "epoch": 2.8439388553146108, "eval_loss": 1.7827636003494263, "eval_runtime": 74.8387, "eval_samples_per_second": 133.621, "eval_steps_per_second": 4.182, "step": 1000 }, { "epoch": 2.872378243867757, "grad_norm": 0.6100145578384399, "learning_rate": 2.2776454444153328e-07, "loss": 1.6978, "step": 1010 }, { "epoch": 2.900817632420903, "grad_norm": 0.6016635894775391, "learning_rate": 1.342297002141918e-07, "loss": 1.6623, "step": 1020 }, { "epoch": 
2.929257020974049, "grad_norm": 0.5942381620407104, "learning_rate": 6.523436919190773e-08, "loss": 1.6999, "step": 1030 }, { "epoch": 2.957696409527195, "grad_norm": 0.6014872789382935, "learning_rate": 2.0846641436497726e-08, "loss": 1.6816, "step": 1040 }, { "epoch": 2.9861357980803414, "grad_norm": 0.6061838269233704, "learning_rate": 1.1103222762542941e-09, "loss": 1.7004, "step": 1050 }, { "epoch": 2.994667614646285, "step": 1053, "total_flos": 4.229776342129836e+18, "train_loss": 1.8626822329427895, "train_runtime": 5321.5336, "train_samples_per_second": 50.737, "train_steps_per_second": 0.198 } ], "logging_steps": 10, "max_steps": 1053, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.229776342129836e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }