{ "best_metric": 1.0223731994628906, "best_model_checkpoint": "mgh6/TCS_MLM_All/checkpoint-700", "epoch": 10.0, "eval_steps": 100, "global_step": 1050, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9523809523809523, "grad_norm": 0.18039299547672272, "learning_rate": 0.0009047619047619047, "loss": 1.5254, "step": 100 }, { "epoch": 0.9523809523809523, "eval_loss": 1.250386118888855, "eval_runtime": 0.218, "eval_samples_per_second": 87.168, "eval_steps_per_second": 4.588, "step": 100 }, { "epoch": 1.9047619047619047, "grad_norm": 0.17264288663864136, "learning_rate": 0.0008095238095238096, "loss": 1.3281, "step": 200 }, { "epoch": 1.9047619047619047, "eval_loss": 1.1765815019607544, "eval_runtime": 0.2181, "eval_samples_per_second": 87.116, "eval_steps_per_second": 4.585, "step": 200 }, { "epoch": 2.857142857142857, "grad_norm": 0.16163380444049835, "learning_rate": 0.0007142857142857143, "loss": 1.2854, "step": 300 }, { "epoch": 2.857142857142857, "eval_loss": 1.1724227666854858, "eval_runtime": 0.2192, "eval_samples_per_second": 86.662, "eval_steps_per_second": 4.561, "step": 300 }, { "epoch": 3.8095238095238093, "grad_norm": 0.17432411015033722, "learning_rate": 0.0006190476190476191, "loss": 1.2553, "step": 400 }, { "epoch": 3.8095238095238093, "eval_loss": 1.1775765419006348, "eval_runtime": 0.2186, "eval_samples_per_second": 86.925, "eval_steps_per_second": 4.575, "step": 400 }, { "epoch": 4.761904761904762, "grad_norm": 0.18052713572978973, "learning_rate": 0.0005238095238095238, "loss": 1.235, "step": 500 }, { "epoch": 4.761904761904762, "eval_loss": 1.0535513162612915, "eval_runtime": 0.2161, "eval_samples_per_second": 87.904, "eval_steps_per_second": 4.627, "step": 500 }, { "epoch": 5.714285714285714, "grad_norm": 0.1662505716085434, "learning_rate": 0.00042857142857142855, "loss": 1.2114, "step": 600 }, { "epoch": 5.714285714285714, "eval_loss": 1.035285234451294, "eval_runtime": 0.2191, "eval_samples_per_second": 86.713, "eval_steps_per_second": 4.564, "step": 600 }, { "epoch": 6.666666666666667, "grad_norm": 0.16807609796524048, "learning_rate": 0.0003333333333333333, "loss": 1.1944, "step": 700 }, { "epoch": 6.666666666666667, "eval_loss": 1.0223731994628906, "eval_runtime": 0.2174, "eval_samples_per_second": 87.385, "eval_steps_per_second": 4.599, "step": 700 }, { "epoch": 7.619047619047619, "grad_norm": 0.20244956016540527, "learning_rate": 0.0002380952380952381, "loss": 1.1797, "step": 800 }, { "epoch": 7.619047619047619, "eval_loss": 1.049731731414795, "eval_runtime": 0.2231, "eval_samples_per_second": 85.156, "eval_steps_per_second": 4.482, "step": 800 }, { "epoch": 8.571428571428571, "grad_norm": 0.182796448469162, "learning_rate": 0.00014285714285714284, "loss": 1.1605, "step": 900 }, { "epoch": 8.571428571428571, "eval_loss": 1.0446473360061646, "eval_runtime": 0.2185, "eval_samples_per_second": 86.964, "eval_steps_per_second": 4.577, "step": 900 }, { "epoch": 9.523809523809524, "grad_norm": 0.17410966753959656, "learning_rate": 4.761904761904762e-05, "loss": 1.1496, "step": 1000 }, { "epoch": 9.523809523809524, "eval_loss": 1.027449131011963, "eval_runtime": 0.217, "eval_samples_per_second": 87.54, "eval_steps_per_second": 4.607, "step": 1000 } ], "logging_steps": 100, "max_steps": 1050, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.078313589342208e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }