{ "best_metric": 1.3268142938613892, "best_model_checkpoint": "miner_id_24/checkpoint-150", "epoch": 0.5586592178770949, "eval_steps": 50, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00186219739292365, "eval_loss": 3.168727159500122, "eval_runtime": 7.0702, "eval_samples_per_second": 67.467, "eval_steps_per_second": 8.486, "step": 1 }, { "epoch": 0.0186219739292365, "grad_norm": 12.249234199523926, "learning_rate": 5.970149253731343e-06, "loss": 3.2555, "step": 10 }, { "epoch": 0.037243947858473, "grad_norm": 8.033016204833984, "learning_rate": 1.1940298507462686e-05, "loss": 2.6629, "step": 20 }, { "epoch": 0.055865921787709494, "grad_norm": 9.513952255249023, "learning_rate": 1.791044776119403e-05, "loss": 1.9234, "step": 30 }, { "epoch": 0.074487895716946, "grad_norm": 9.283692359924316, "learning_rate": 2.3880597014925373e-05, "loss": 1.6316, "step": 40 }, { "epoch": 0.0931098696461825, "grad_norm": 10.807787895202637, "learning_rate": 2.9850746268656714e-05, "loss": 1.4763, "step": 50 }, { "epoch": 0.0931098696461825, "eval_loss": 1.4449926614761353, "eval_runtime": 6.9569, "eval_samples_per_second": 68.565, "eval_steps_per_second": 8.624, "step": 50 }, { "epoch": 0.11173184357541899, "grad_norm": 7.413595199584961, "learning_rate": 3.582089552238806e-05, "loss": 1.3803, "step": 60 }, { "epoch": 0.1303538175046555, "grad_norm": 4.4787139892578125, "learning_rate": 4.1791044776119404e-05, "loss": 1.4074, "step": 70 }, { "epoch": 0.148975791433892, "grad_norm": 5.90806770324707, "learning_rate": 4.7761194029850745e-05, "loss": 1.5316, "step": 80 }, { "epoch": 0.16759776536312848, "grad_norm": 4.434422969818115, "learning_rate": 5.373134328358209e-05, "loss": 1.3362, "step": 90 }, { "epoch": 0.186219739292365, "grad_norm": 6.3269195556640625, "learning_rate": 5.970149253731343e-05, "loss": 1.3149, "step": 100 }, { "epoch": 0.186219739292365, "eval_loss": 1.3312469720840454, "eval_runtime": 6.9255, "eval_samples_per_second": 68.876, "eval_steps_per_second": 8.664, "step": 100 }, { "epoch": 0.2048417132216015, "grad_norm": 5.652113437652588, "learning_rate": 6.567164179104478e-05, "loss": 1.2403, "step": 110 }, { "epoch": 0.22346368715083798, "grad_norm": 5.22323751449585, "learning_rate": 7.164179104477612e-05, "loss": 1.3804, "step": 120 }, { "epoch": 0.24208566108007448, "grad_norm": 5.073678493499756, "learning_rate": 7.761194029850747e-05, "loss": 1.5211, "step": 130 }, { "epoch": 0.260707635009311, "grad_norm": 4.01777982711792, "learning_rate": 8.358208955223881e-05, "loss": 1.3246, "step": 140 }, { "epoch": 0.27932960893854747, "grad_norm": 8.206710815429688, "learning_rate": 8.955223880597016e-05, "loss": 1.3618, "step": 150 }, { "epoch": 0.27932960893854747, "eval_loss": 1.3268142938613892, "eval_runtime": 7.022, "eval_samples_per_second": 67.93, "eval_steps_per_second": 8.545, "step": 150 }, { "epoch": 0.297951582867784, "grad_norm": 5.138635158538818, "learning_rate": 9.552238805970149e-05, "loss": 1.3627, "step": 160 }, { "epoch": 0.3165735567970205, "grad_norm": 3.9790635108947754, "learning_rate": 0.00010149253731343284, "loss": 1.3778, "step": 170 }, { "epoch": 0.33519553072625696, "grad_norm": 4.547658920288086, "learning_rate": 0.00010746268656716419, "loss": 1.4092, "step": 180 }, { "epoch": 0.3538175046554935, "grad_norm": 3.9823379516601562, "learning_rate": 0.00011343283582089552, "loss": 1.3419, "step": 190 }, { "epoch": 0.37243947858473, "grad_norm": 5.346208572387695, "learning_rate": 0.00011940298507462686, "loss": 1.396, "step": 200 }, { "epoch": 0.37243947858473, "eval_loss": 1.3320053815841675, "eval_runtime": 7.0466, "eval_samples_per_second": 67.692, "eval_steps_per_second": 8.515, "step": 200 }, { "epoch": 0.39106145251396646, "grad_norm": 3.4127047061920166, "learning_rate": 0.00012537313432835822, "loss": 1.3778, "step": 210 }, { "epoch": 0.409683426443203, "grad_norm": 4.872797012329102, "learning_rate": 0.00013134328358208955, "loss": 1.3956, "step": 220 }, { "epoch": 0.42830540037243947, "grad_norm": 3.7931699752807617, "learning_rate": 0.0001373134328358209, "loss": 1.3185, "step": 230 }, { "epoch": 0.44692737430167595, "grad_norm": 4.517390727996826, "learning_rate": 0.00014328358208955225, "loss": 1.3737, "step": 240 }, { "epoch": 0.4655493482309125, "grad_norm": 5.615145206451416, "learning_rate": 0.0001492537313432836, "loss": 1.4143, "step": 250 }, { "epoch": 0.4655493482309125, "eval_loss": 1.3690265417099, "eval_runtime": 7.1427, "eval_samples_per_second": 66.781, "eval_steps_per_second": 8.4, "step": 250 }, { "epoch": 0.48417132216014896, "grad_norm": 3.249690055847168, "learning_rate": 0.00015522388059701495, "loss": 1.4629, "step": 260 }, { "epoch": 0.5027932960893855, "grad_norm": 3.2629432678222656, "learning_rate": 0.00016119402985074628, "loss": 1.4387, "step": 270 }, { "epoch": 0.521415270018622, "grad_norm": 4.808574676513672, "learning_rate": 0.00016716417910447761, "loss": 1.4205, "step": 280 }, { "epoch": 0.5400372439478585, "grad_norm": 3.8548362255096436, "learning_rate": 0.00017313432835820895, "loss": 1.4286, "step": 290 }, { "epoch": 0.5586592178770949, "grad_norm": 5.292648792266846, "learning_rate": 0.0001791044776119403, "loss": 1.3715, "step": 300 }, { "epoch": 0.5586592178770949, "eval_loss": 1.346940517425537, "eval_runtime": 7.1626, "eval_samples_per_second": 66.596, "eval_steps_per_second": 8.377, "step": 300 } ], "logging_steps": 10, "max_steps": 53700, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.351332862610637e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }