{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03690675278242316, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003690675278242316, "eval_loss": 0.9037973880767822, "eval_runtime": 1016.6858, "eval_samples_per_second": 8.977, "eval_steps_per_second": 4.489, "step": 1 }, { "epoch": 0.0011072025834726947, "grad_norm": 0.7249619364738464, "learning_rate": 3e-05, "loss": 0.8873, "step": 3 }, { "epoch": 0.0022144051669453894, "grad_norm": 0.6593801975250244, "learning_rate": 6e-05, "loss": 0.8522, "step": 6 }, { "epoch": 0.0033216077504180843, "grad_norm": 0.630030632019043, "learning_rate": 9e-05, "loss": 0.6819, "step": 9 }, { "epoch": 0.0033216077504180843, "eval_loss": 0.5947146415710449, "eval_runtime": 1016.5015, "eval_samples_per_second": 8.979, "eval_steps_per_second": 4.49, "step": 9 }, { "epoch": 0.004428810333890779, "grad_norm": 0.4713630676269531, "learning_rate": 0.00012, "loss": 0.5746, "step": 12 }, { "epoch": 0.005536012917363474, "grad_norm": 0.3384400010108948, "learning_rate": 0.00015000000000000001, "loss": 0.511, "step": 15 }, { "epoch": 0.006643215500836169, "grad_norm": 0.2903579771518707, "learning_rate": 0.00018, "loss": 0.4973, "step": 18 }, { "epoch": 0.006643215500836169, "eval_loss": 0.4754287600517273, "eval_runtime": 1016.919, "eval_samples_per_second": 8.975, "eval_steps_per_second": 4.488, "step": 18 }, { "epoch": 0.007750418084308863, "grad_norm": 0.26274538040161133, "learning_rate": 0.0001999229036240723, "loss": 0.4692, "step": 21 }, { "epoch": 0.008857620667781558, "grad_norm": 0.24625541269779205, "learning_rate": 0.00019876883405951377, "loss": 0.4577, "step": 24 }, { "epoch": 0.009964823251254253, "grad_norm": 0.25311341881752014, "learning_rate": 0.00019624552364536473, "loss": 0.4304, "step": 27 }, { "epoch": 0.009964823251254253, "eval_loss": 0.4487799108028412, "eval_runtime": 1016.8066, "eval_samples_per_second": 8.976, "eval_steps_per_second": 4.489, "step": 27 }, { "epoch": 0.011072025834726948, "grad_norm": 0.23702092468738556, "learning_rate": 0.0001923879532511287, "loss": 0.446, "step": 30 }, { "epoch": 0.012179228418199642, "grad_norm": 0.23901499807834625, "learning_rate": 0.00018724960070727972, "loss": 0.4481, "step": 33 }, { "epoch": 0.013286431001672337, "grad_norm": 0.22688159346580505, "learning_rate": 0.00018090169943749476, "loss": 0.4413, "step": 36 }, { "epoch": 0.013286431001672337, "eval_loss": 0.435557097196579, "eval_runtime": 1017.2332, "eval_samples_per_second": 8.972, "eval_steps_per_second": 4.487, "step": 36 }, { "epoch": 0.014393633585145033, "grad_norm": 0.24019119143486023, "learning_rate": 0.00017343225094356855, "loss": 0.4348, "step": 39 }, { "epoch": 0.015500836168617726, "grad_norm": 0.2193296253681183, "learning_rate": 0.00016494480483301836, "loss": 0.4292, "step": 42 }, { "epoch": 0.016608038752090423, "grad_norm": 0.2635402977466583, "learning_rate": 0.00015555702330196023, "loss": 0.4426, "step": 45 }, { "epoch": 0.016608038752090423, "eval_loss": 0.42657363414764404, "eval_runtime": 1016.8284, "eval_samples_per_second": 8.976, "eval_steps_per_second": 4.488, "step": 45 }, { "epoch": 0.017715241335563115, "grad_norm": 0.22344684600830078, "learning_rate": 0.00014539904997395468, "loss": 0.4211, "step": 48 }, { "epoch": 0.01882244391903581, "grad_norm": 0.20363473892211914, "learning_rate": 0.0001346117057077493, "loss": 0.4162, "step": 51 }, { "epoch": 0.019929646502508506, "grad_norm": 0.2130439579486847, "learning_rate": 0.00012334453638559057, "loss": 0.4388, "step": 54 }, { "epoch": 0.019929646502508506, "eval_loss": 0.4211220443248749, "eval_runtime": 1016.8833, "eval_samples_per_second": 8.975, "eval_steps_per_second": 4.488, "step": 54 }, { "epoch": 0.0210368490859812, "grad_norm": 0.22026380896568298, "learning_rate": 0.00011175373974578378, "loss": 0.418, "step": 57 }, { "epoch": 0.022144051669453896, "grad_norm": 0.21481454372406006, "learning_rate": 0.0001, "loss": 0.4231, "step": 60 }, { "epoch": 0.023251254252926592, "grad_norm": 0.22130335867404938, "learning_rate": 8.824626025421626e-05, "loss": 0.4012, "step": 63 }, { "epoch": 0.023251254252926592, "eval_loss": 0.41657453775405884, "eval_runtime": 1016.9203, "eval_samples_per_second": 8.975, "eval_steps_per_second": 4.488, "step": 63 }, { "epoch": 0.024358456836399284, "grad_norm": 0.21064302325248718, "learning_rate": 7.66554636144095e-05, "loss": 0.4212, "step": 66 }, { "epoch": 0.02546565941987198, "grad_norm": 0.19731996953487396, "learning_rate": 6.538829429225069e-05, "loss": 0.4105, "step": 69 }, { "epoch": 0.026572862003344674, "grad_norm": 0.20379899442195892, "learning_rate": 5.4600950026045326e-05, "loss": 0.4188, "step": 72 }, { "epoch": 0.026572862003344674, "eval_loss": 0.413289874792099, "eval_runtime": 1017.5359, "eval_samples_per_second": 8.97, "eval_steps_per_second": 4.485, "step": 72 }, { "epoch": 0.02768006458681737, "grad_norm": 0.2074277549982071, "learning_rate": 4.444297669803981e-05, "loss": 0.4211, "step": 75 }, { "epoch": 0.028787267170290065, "grad_norm": 0.2055700421333313, "learning_rate": 3.5055195166981645e-05, "loss": 0.404, "step": 78 }, { "epoch": 0.02989446975376276, "grad_norm": 0.2100721150636673, "learning_rate": 2.6567749056431467e-05, "loss": 0.4367, "step": 81 }, { "epoch": 0.02989446975376276, "eval_loss": 0.4112100899219513, "eval_runtime": 1017.2852, "eval_samples_per_second": 8.972, "eval_steps_per_second": 4.486, "step": 81 }, { "epoch": 0.031001672337235452, "grad_norm": 0.20384648442268372, "learning_rate": 1.9098300562505266e-05, "loss": 0.414, "step": 84 }, { "epoch": 0.03210887492070815, "grad_norm": 0.2008475959300995, "learning_rate": 1.2750399292720283e-05, "loss": 0.406, "step": 87 }, { "epoch": 0.033216077504180846, "grad_norm": 0.21010953187942505, "learning_rate": 7.612046748871327e-06, "loss": 0.4213, "step": 90 }, { "epoch": 0.033216077504180846, "eval_loss": 0.41008633375167847, "eval_runtime": 1016.9534, "eval_samples_per_second": 8.975, "eval_steps_per_second": 4.488, "step": 90 }, { "epoch": 0.034323280087653535, "grad_norm": 0.2094004601240158, "learning_rate": 3.7544763546352834e-06, "loss": 0.4245, "step": 93 }, { "epoch": 0.03543048267112623, "grad_norm": 0.20318011939525604, "learning_rate": 1.231165940486234e-06, "loss": 0.4194, "step": 96 }, { "epoch": 0.036537685254598926, "grad_norm": 0.20332947373390198, "learning_rate": 7.709637592770991e-08, "loss": 0.4019, "step": 99 }, { "epoch": 0.036537685254598926, "eval_loss": 0.4097938537597656, "eval_runtime": 1018.18, "eval_samples_per_second": 8.964, "eval_steps_per_second": 4.483, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.968186696531968e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }