{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996331621423331, "eval_steps": 500, "global_step": 1020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029347028613352897, "grad_norm": 0.46461818543604894, "learning_rate": 5e-06, "loss": 0.7953, "step": 10 }, { "epoch": 0.058694057226705794, "grad_norm": 0.3239446159431779, "learning_rate": 5e-06, "loss": 0.7149, "step": 20 }, { "epoch": 0.0880410858400587, "grad_norm": 0.32216574679765225, "learning_rate": 5e-06, "loss": 0.6921, "step": 30 }, { "epoch": 0.11738811445341159, "grad_norm": 0.2480001809753486, "learning_rate": 5e-06, "loss": 0.6745, "step": 40 }, { "epoch": 0.1467351430667645, "grad_norm": 0.24507343085621888, "learning_rate": 5e-06, "loss": 0.6665, "step": 50 }, { "epoch": 0.1760821716801174, "grad_norm": 0.23701505316070723, "learning_rate": 5e-06, "loss": 0.6641, "step": 60 }, { "epoch": 0.20542920029347028, "grad_norm": 0.2577847358912369, "learning_rate": 5e-06, "loss": 0.6518, "step": 70 }, { "epoch": 0.23477622890682318, "grad_norm": 0.22368687865780756, "learning_rate": 5e-06, "loss": 0.6571, "step": 80 }, { "epoch": 0.26412325752017607, "grad_norm": 0.24083896568036336, "learning_rate": 5e-06, "loss": 0.6508, "step": 90 }, { "epoch": 0.293470286133529, "grad_norm": 0.2372190759345622, "learning_rate": 5e-06, "loss": 0.6359, "step": 100 }, { "epoch": 0.32281731474688186, "grad_norm": 0.2667819914563663, "learning_rate": 5e-06, "loss": 0.6406, "step": 110 }, { "epoch": 0.3521643433602348, "grad_norm": 0.23826118875032948, "learning_rate": 5e-06, "loss": 0.6322, "step": 120 }, { "epoch": 0.3815113719735877, "grad_norm": 0.23786844993350983, "learning_rate": 5e-06, "loss": 0.6293, "step": 130 }, { "epoch": 0.41085840058694056, "grad_norm": 0.2470169393593649, "learning_rate": 5e-06, "loss": 0.6311, "step": 140 }, { "epoch": 0.4402054292002935, "grad_norm": 0.2569758849608478, "learning_rate": 5e-06, "loss": 0.6284, "step": 150 }, { "epoch": 0.46955245781364635, "grad_norm": 0.23082882078276606, "learning_rate": 5e-06, "loss": 0.6191, "step": 160 }, { "epoch": 0.4988994864269993, "grad_norm": 0.2328305538120993, "learning_rate": 5e-06, "loss": 0.6284, "step": 170 }, { "epoch": 0.5282465150403521, "grad_norm": 0.23519449677164506, "learning_rate": 5e-06, "loss": 0.6154, "step": 180 }, { "epoch": 0.5575935436537051, "grad_norm": 0.23095515428388813, "learning_rate": 5e-06, "loss": 0.6227, "step": 190 }, { "epoch": 0.586940572267058, "grad_norm": 0.24860467094098176, "learning_rate": 5e-06, "loss": 0.6111, "step": 200 }, { "epoch": 0.6162876008804109, "grad_norm": 0.26087524684118224, "learning_rate": 5e-06, "loss": 0.6202, "step": 210 }, { "epoch": 0.6456346294937637, "grad_norm": 0.25517651029234445, "learning_rate": 5e-06, "loss": 0.6157, "step": 220 }, { "epoch": 0.6749816581071166, "grad_norm": 0.248351738072603, "learning_rate": 5e-06, "loss": 0.6138, "step": 230 }, { "epoch": 0.7043286867204696, "grad_norm": 0.23661011747989424, "learning_rate": 5e-06, "loss": 0.6147, "step": 240 }, { "epoch": 0.7336757153338225, "grad_norm": 0.2419884779989508, "learning_rate": 5e-06, "loss": 0.6121, "step": 250 }, { "epoch": 0.7630227439471754, "grad_norm": 0.2457790717128415, "learning_rate": 5e-06, "loss": 0.6088, "step": 260 }, { "epoch": 0.7923697725605282, "grad_norm": 0.23336514475272613, "learning_rate": 5e-06, "loss": 0.6002, "step": 270 }, { "epoch": 0.8217168011738811, "grad_norm": 0.25601042114743383, "learning_rate": 5e-06, "loss": 0.5998, "step": 280 }, { "epoch": 0.851063829787234, "grad_norm": 0.2387536730727916, "learning_rate": 5e-06, "loss": 0.5948, "step": 290 }, { "epoch": 0.880410858400587, "grad_norm": 0.2681109875779905, "learning_rate": 5e-06, "loss": 0.6009, "step": 300 }, { "epoch": 0.9097578870139399, "grad_norm": 0.23828059578929434, "learning_rate": 5e-06, "loss": 0.6047, "step": 310 }, { "epoch": 0.9391049156272927, "grad_norm": 0.28064198511079097, "learning_rate": 5e-06, "loss": 0.6011, "step": 320 }, { "epoch": 0.9684519442406456, "grad_norm": 0.2520680807814555, "learning_rate": 5e-06, "loss": 0.6065, "step": 330 }, { "epoch": 0.9977989728539985, "grad_norm": 0.23763302761129573, "learning_rate": 5e-06, "loss": 0.6004, "step": 340 }, { "epoch": 0.9977989728539985, "eval_loss": 0.6064473986625671, "eval_runtime": 348.1424, "eval_samples_per_second": 26.369, "eval_steps_per_second": 0.414, "step": 340 }, { "epoch": 1.028613352898019, "grad_norm": 0.2642965633252716, "learning_rate": 5e-06, "loss": 0.6221, "step": 350 }, { "epoch": 1.057960381511372, "grad_norm": 0.2511624129850785, "learning_rate": 5e-06, "loss": 0.5821, "step": 360 }, { "epoch": 1.0873074101247249, "grad_norm": 0.2756630079029959, "learning_rate": 5e-06, "loss": 0.5748, "step": 370 }, { "epoch": 1.1166544387380777, "grad_norm": 0.2728604387258577, "learning_rate": 5e-06, "loss": 0.571, "step": 380 }, { "epoch": 1.1460014673514307, "grad_norm": 0.2582013869017799, "learning_rate": 5e-06, "loss": 0.567, "step": 390 }, { "epoch": 1.1753484959647835, "grad_norm": 0.2480611253625622, "learning_rate": 5e-06, "loss": 0.573, "step": 400 }, { "epoch": 1.2046955245781366, "grad_norm": 0.2521929121222439, "learning_rate": 5e-06, "loss": 0.5762, "step": 410 }, { "epoch": 1.2340425531914894, "grad_norm": 0.2814284594714186, "learning_rate": 5e-06, "loss": 0.568, "step": 420 }, { "epoch": 1.2633895818048422, "grad_norm": 0.25575166795056736, "learning_rate": 5e-06, "loss": 0.5686, "step": 430 }, { "epoch": 1.2927366104181952, "grad_norm": 0.24195651370916318, "learning_rate": 5e-06, "loss": 0.5727, "step": 440 }, { "epoch": 1.322083639031548, "grad_norm": 0.23896798124266624, "learning_rate": 5e-06, "loss": 0.5726, "step": 450 }, { "epoch": 1.3514306676449008, "grad_norm": 0.23438597303418254, "learning_rate": 5e-06, "loss": 0.5662, "step": 460 }, { "epoch": 1.3807776962582539, "grad_norm": 0.23903191209794197, "learning_rate": 5e-06, "loss": 0.57, "step": 470 }, { "epoch": 1.4101247248716067, "grad_norm": 0.24760836378868828, "learning_rate": 5e-06, "loss": 0.5664, "step": 480 }, { "epoch": 1.4394717534849597, "grad_norm": 0.24447383016020638, "learning_rate": 5e-06, "loss": 0.5669, "step": 490 }, { "epoch": 1.4688187820983125, "grad_norm": 0.23217070641294132, "learning_rate": 5e-06, "loss": 0.5651, "step": 500 }, { "epoch": 1.4981658107116655, "grad_norm": 0.25531143471265705, "learning_rate": 5e-06, "loss": 0.5647, "step": 510 }, { "epoch": 1.5275128393250184, "grad_norm": 0.2550776733367678, "learning_rate": 5e-06, "loss": 0.5649, "step": 520 }, { "epoch": 1.5568598679383712, "grad_norm": 0.24221433795161712, "learning_rate": 5e-06, "loss": 0.5599, "step": 530 }, { "epoch": 1.5862068965517242, "grad_norm": 0.24677124272392145, "learning_rate": 5e-06, "loss": 0.5639, "step": 540 }, { "epoch": 1.615553925165077, "grad_norm": 0.241759687889569, "learning_rate": 5e-06, "loss": 0.5712, "step": 550 }, { "epoch": 1.6449009537784298, "grad_norm": 0.26497413976662465, "learning_rate": 5e-06, "loss": 0.5662, "step": 560 }, { "epoch": 1.6742479823917829, "grad_norm": 0.24480424837386783, "learning_rate": 5e-06, "loss": 0.569, "step": 570 }, { "epoch": 1.7035950110051359, "grad_norm": 0.23220187500115563, "learning_rate": 5e-06, "loss": 0.56, "step": 580 }, { "epoch": 1.7329420396184885, "grad_norm": 0.2455262935747061, "learning_rate": 5e-06, "loss": 0.5591, "step": 590 }, { "epoch": 1.7622890682318415, "grad_norm": 0.2496512190757695, "learning_rate": 5e-06, "loss": 0.5604, "step": 600 }, { "epoch": 1.7916360968451945, "grad_norm": 0.27689699650049476, "learning_rate": 5e-06, "loss": 0.5593, "step": 610 }, { "epoch": 1.8209831254585473, "grad_norm": 0.24775088458978922, "learning_rate": 5e-06, "loss": 0.5645, "step": 620 }, { "epoch": 1.8503301540719002, "grad_norm": 0.24978668558925504, "learning_rate": 5e-06, "loss": 0.5623, "step": 630 }, { "epoch": 1.8796771826852532, "grad_norm": 0.241089177891685, "learning_rate": 5e-06, "loss": 0.5582, "step": 640 }, { "epoch": 1.909024211298606, "grad_norm": 0.2518369233494824, "learning_rate": 5e-06, "loss": 0.564, "step": 650 }, { "epoch": 1.9383712399119588, "grad_norm": 0.25451686197096457, "learning_rate": 5e-06, "loss": 0.5701, "step": 660 }, { "epoch": 1.9677182685253118, "grad_norm": 0.2612349932502818, "learning_rate": 5e-06, "loss": 0.566, "step": 670 }, { "epoch": 1.9970652971386649, "grad_norm": 0.2530446066915159, "learning_rate": 5e-06, "loss": 0.5587, "step": 680 }, { "epoch": 1.9970652971386649, "eval_loss": 0.5877389907836914, "eval_runtime": 346.3038, "eval_samples_per_second": 26.509, "eval_steps_per_second": 0.416, "step": 680 }, { "epoch": 2.0278796771826855, "grad_norm": 0.25962780717757183, "learning_rate": 5e-06, "loss": 0.578, "step": 690 }, { "epoch": 2.057226705796038, "grad_norm": 0.27546943407418556, "learning_rate": 5e-06, "loss": 0.5283, "step": 700 }, { "epoch": 2.086573734409391, "grad_norm": 0.25584185592727543, "learning_rate": 5e-06, "loss": 0.531, "step": 710 }, { "epoch": 2.115920763022744, "grad_norm": 0.24576591948298593, "learning_rate": 5e-06, "loss": 0.5334, "step": 720 }, { "epoch": 2.1452677916360967, "grad_norm": 0.2562225828290363, "learning_rate": 5e-06, "loss": 0.5297, "step": 730 }, { "epoch": 2.1746148202494497, "grad_norm": 0.2539883488886924, "learning_rate": 5e-06, "loss": 0.5337, "step": 740 }, { "epoch": 2.2039618488628028, "grad_norm": 0.26631200434900615, "learning_rate": 5e-06, "loss": 0.5312, "step": 750 }, { "epoch": 2.2333088774761554, "grad_norm": 0.2577078388681737, "learning_rate": 5e-06, "loss": 0.5356, "step": 760 }, { "epoch": 2.2626559060895084, "grad_norm": 0.26194326400000306, "learning_rate": 5e-06, "loss": 0.5306, "step": 770 }, { "epoch": 2.2920029347028614, "grad_norm": 0.2630790831341597, "learning_rate": 5e-06, "loss": 0.529, "step": 780 }, { "epoch": 2.321349963316214, "grad_norm": 0.25077231022672064, "learning_rate": 5e-06, "loss": 0.5269, "step": 790 }, { "epoch": 2.350696991929567, "grad_norm": 0.25733729482296797, "learning_rate": 5e-06, "loss": 0.5293, "step": 800 }, { "epoch": 2.38004402054292, "grad_norm": 0.2866375135699729, "learning_rate": 5e-06, "loss": 0.5327, "step": 810 }, { "epoch": 2.409391049156273, "grad_norm": 0.2824137068468073, "learning_rate": 5e-06, "loss": 0.5355, "step": 820 }, { "epoch": 2.4387380777696257, "grad_norm": 0.31690935779225915, "learning_rate": 5e-06, "loss": 0.5293, "step": 830 }, { "epoch": 2.4680851063829787, "grad_norm": 0.2917856087014453, "learning_rate": 5e-06, "loss": 0.5352, "step": 840 }, { "epoch": 2.4974321349963318, "grad_norm": 0.279178870034809, "learning_rate": 5e-06, "loss": 0.5291, "step": 850 }, { "epoch": 2.5267791636096844, "grad_norm": 0.2562478404147616, "learning_rate": 5e-06, "loss": 0.5354, "step": 860 }, { "epoch": 2.5561261922230374, "grad_norm": 0.2656824229381719, "learning_rate": 5e-06, "loss": 0.5212, "step": 870 }, { "epoch": 2.5854732208363904, "grad_norm": 0.25736097875461483, "learning_rate": 5e-06, "loss": 0.5347, "step": 880 }, { "epoch": 2.6148202494497435, "grad_norm": 0.24456464436179887, "learning_rate": 5e-06, "loss": 0.5349, "step": 890 }, { "epoch": 2.644167278063096, "grad_norm": 0.27105999586294116, "learning_rate": 5e-06, "loss": 0.5234, "step": 900 }, { "epoch": 2.673514306676449, "grad_norm": 0.24761038402720206, "learning_rate": 5e-06, "loss": 0.5309, "step": 910 }, { "epoch": 2.7028613352898017, "grad_norm": 0.2501742858079035, "learning_rate": 5e-06, "loss": 0.5293, "step": 920 }, { "epoch": 2.7322083639031547, "grad_norm": 0.2566389597955405, "learning_rate": 5e-06, "loss": 0.5267, "step": 930 }, { "epoch": 2.7615553925165077, "grad_norm": 0.23782682320479148, "learning_rate": 5e-06, "loss": 0.5284, "step": 940 }, { "epoch": 2.7909024211298608, "grad_norm": 0.25945577387081686, "learning_rate": 5e-06, "loss": 0.5341, "step": 950 }, { "epoch": 2.8202494497432133, "grad_norm": 0.24530513842418022, "learning_rate": 5e-06, "loss": 0.5323, "step": 960 }, { "epoch": 2.8495964783565664, "grad_norm": 0.2565886786240202, "learning_rate": 5e-06, "loss": 0.5333, "step": 970 }, { "epoch": 2.8789435069699194, "grad_norm": 0.2665669048067188, "learning_rate": 5e-06, "loss": 0.5221, "step": 980 }, { "epoch": 2.908290535583272, "grad_norm": 0.2541129512298126, "learning_rate": 5e-06, "loss": 0.5283, "step": 990 }, { "epoch": 2.937637564196625, "grad_norm": 0.23723367595765862, "learning_rate": 5e-06, "loss": 0.5357, "step": 1000 }, { "epoch": 2.966984592809978, "grad_norm": 0.26430024675231284, "learning_rate": 5e-06, "loss": 0.5289, "step": 1010 }, { "epoch": 2.996331621423331, "grad_norm": 0.2483828307708907, "learning_rate": 5e-06, "loss": 0.528, "step": 1020 }, { "epoch": 2.996331621423331, "eval_loss": 0.5799095034599304, "eval_runtime": 348.2665, "eval_samples_per_second": 26.359, "eval_steps_per_second": 0.413, "step": 1020 }, { "epoch": 2.996331621423331, "step": 1020, "total_flos": 1708328610693120.0, "train_loss": 0.5780990016226675, "train_runtime": 57059.3633, "train_samples_per_second": 9.17, "train_steps_per_second": 0.018 } ], "logging_steps": 10, "max_steps": 1020, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1708328610693120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }