{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.015958507879513265, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015958507879513265, "eval_loss": 2.050989866256714, "eval_runtime": 454.429, "eval_samples_per_second": 23.225, "eval_steps_per_second": 2.905, "step": 1 }, { "epoch": 0.00047875523638539794, "grad_norm": 0.9697971343994141, "learning_rate": 1.5e-05, "loss": 1.9725, "step": 3 }, { "epoch": 0.0009575104727707959, "grad_norm": 0.901890754699707, "learning_rate": 3e-05, "loss": 2.0065, "step": 6 }, { "epoch": 0.001436265709156194, "grad_norm": 0.9856873154640198, "learning_rate": 4.5e-05, "loss": 1.931, "step": 9 }, { "epoch": 0.001436265709156194, "eval_loss": 1.9257014989852905, "eval_runtime": 457.2135, "eval_samples_per_second": 23.083, "eval_steps_per_second": 2.887, "step": 9 }, { "epoch": 0.0019150209455415918, "grad_norm": 0.7096590399742126, "learning_rate": 4.993910125649561e-05, "loss": 1.7845, "step": 12 }, { "epoch": 0.0023937761819269898, "grad_norm": 0.726519763469696, "learning_rate": 4.962019382530521e-05, "loss": 1.8313, "step": 15 }, { "epoch": 0.002872531418312388, "grad_norm": 0.6210381388664246, "learning_rate": 4.9031542398457974e-05, "loss": 1.8015, "step": 18 }, { "epoch": 0.002872531418312388, "eval_loss": 1.7303786277770996, "eval_runtime": 456.9299, "eval_samples_per_second": 23.098, "eval_steps_per_second": 2.889, "step": 18 }, { "epoch": 0.0033512866546977858, "grad_norm": 0.6193668246269226, "learning_rate": 4.817959636416969e-05, "loss": 1.7996, "step": 21 }, { "epoch": 0.0038300418910831835, "grad_norm": 0.6268954873085022, "learning_rate": 4.707368982147318e-05, "loss": 1.6603, "step": 24 }, { "epoch": 0.004308797127468581, "grad_norm": 0.5985816121101379, "learning_rate": 4.572593931387604e-05, "loss": 1.716, "step": 27 }, { "epoch": 0.004308797127468581, "eval_loss": 1.707382321357727, "eval_runtime": 456.9479, "eval_samples_per_second": 23.097, "eval_steps_per_second": 2.889, "step": 27 }, { "epoch": 0.0047875523638539795, "grad_norm": 0.5452908873558044, "learning_rate": 4.415111107797445e-05, "loss": 1.7306, "step": 30 }, { "epoch": 0.005266307600239378, "grad_norm": 0.5148115754127502, "learning_rate": 4.2366459261474933e-05, "loss": 1.658, "step": 33 }, { "epoch": 0.005745062836624776, "grad_norm": 0.5095010995864868, "learning_rate": 4.039153688314145e-05, "loss": 1.7653, "step": 36 }, { "epoch": 0.005745062836624776, "eval_loss": 1.6911619901657104, "eval_runtime": 457.3928, "eval_samples_per_second": 23.074, "eval_steps_per_second": 2.886, "step": 36 }, { "epoch": 0.006223818073010173, "grad_norm": 0.480747789144516, "learning_rate": 3.824798160583012e-05, "loss": 1.6363, "step": 39 }, { "epoch": 0.0067025733093955715, "grad_norm": 0.4569284915924072, "learning_rate": 3.5959278669726935e-05, "loss": 1.6942, "step": 42 }, { "epoch": 0.00718132854578097, "grad_norm": 0.5147221684455872, "learning_rate": 3.355050358314172e-05, "loss": 1.7732, "step": 45 }, { "epoch": 0.00718132854578097, "eval_loss": 1.6826040744781494, "eval_runtime": 457.1352, "eval_samples_per_second": 23.087, "eval_steps_per_second": 2.888, "step": 45 }, { "epoch": 0.007660083782166367, "grad_norm": 0.46845507621765137, "learning_rate": 3.104804738999169e-05, "loss": 1.6482, "step": 48 }, { "epoch": 0.008138839018551766, "grad_norm": 0.5122058391571045, "learning_rate": 2.8479327524001636e-05, "loss": 1.6855, "step": 51 }, { "epoch": 0.008617594254937163, "grad_norm": 0.46340513229370117, "learning_rate": 2.587248741756253e-05, "loss": 1.5771, "step": 54 }, { "epoch": 0.008617594254937163, "eval_loss": 1.678298830986023, "eval_runtime": 457.7537, "eval_samples_per_second": 23.056, "eval_steps_per_second": 2.884, "step": 54 }, { "epoch": 0.00909634949132256, "grad_norm": 0.4864901602268219, "learning_rate": 2.3256088156396868e-05, "loss": 1.6974, "step": 57 }, { "epoch": 0.009575104727707959, "grad_norm": 0.47456270456314087, "learning_rate": 2.0658795558326743e-05, "loss": 1.7266, "step": 60 }, { "epoch": 0.010053859964093357, "grad_norm": 0.44960471987724304, "learning_rate": 1.8109066104575023e-05, "loss": 1.6703, "step": 63 }, { "epoch": 0.010053859964093357, "eval_loss": 1.6748311519622803, "eval_runtime": 457.1158, "eval_samples_per_second": 23.088, "eval_steps_per_second": 2.888, "step": 63 }, { "epoch": 0.010532615200478756, "grad_norm": 0.5045154690742493, "learning_rate": 1.56348351646022e-05, "loss": 1.6842, "step": 66 }, { "epoch": 0.011011370436864154, "grad_norm": 0.49012699723243713, "learning_rate": 1.3263210930352737e-05, "loss": 1.6247, "step": 69 }, { "epoch": 0.011490125673249552, "grad_norm": 0.5065352320671082, "learning_rate": 1.1020177413231334e-05, "loss": 1.7605, "step": 72 }, { "epoch": 0.011490125673249552, "eval_loss": 1.6728583574295044, "eval_runtime": 457.0738, "eval_samples_per_second": 23.09, "eval_steps_per_second": 2.888, "step": 72 }, { "epoch": 0.011968880909634948, "grad_norm": 0.5055304169654846, "learning_rate": 8.930309757836517e-06, "loss": 1.5792, "step": 75 }, { "epoch": 0.012447636146020347, "grad_norm": 0.46882903575897217, "learning_rate": 7.016504991533726e-06, "loss": 1.714, "step": 78 }, { "epoch": 0.012926391382405745, "grad_norm": 0.5146296620368958, "learning_rate": 5.299731159831953e-06, "loss": 1.5839, "step": 81 }, { "epoch": 0.012926391382405745, "eval_loss": 1.6719353199005127, "eval_runtime": 456.9575, "eval_samples_per_second": 23.096, "eval_steps_per_second": 2.889, "step": 81 }, { "epoch": 0.013405146618791143, "grad_norm": 0.4731264114379883, "learning_rate": 3.798797596089351e-06, "loss": 1.7055, "step": 84 }, { "epoch": 0.013883901855176541, "grad_norm": 0.5054563879966736, "learning_rate": 2.5301488425208296e-06, "loss": 1.5845, "step": 87 }, { "epoch": 0.01436265709156194, "grad_norm": 0.46672162413597107, "learning_rate": 1.5076844803522922e-06, "loss": 1.7207, "step": 90 }, { "epoch": 0.01436265709156194, "eval_loss": 1.671474575996399, "eval_runtime": 456.9871, "eval_samples_per_second": 23.095, "eval_steps_per_second": 2.888, "step": 90 }, { "epoch": 0.014841412327947338, "grad_norm": 0.5002412796020508, "learning_rate": 7.426068431000882e-07, "loss": 1.6392, "step": 93 }, { "epoch": 0.015320167564332734, "grad_norm": 0.47195887565612793, "learning_rate": 2.4329828146074095e-07, "loss": 1.6559, "step": 96 }, { "epoch": 0.015798922800718134, "grad_norm": 0.513578474521637, "learning_rate": 1.522932452260595e-08, "loss": 1.7077, "step": 99 }, { "epoch": 0.015798922800718134, "eval_loss": 1.6713879108428955, "eval_runtime": 457.4203, "eval_samples_per_second": 23.073, "eval_steps_per_second": 2.886, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.75001861668864e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }