|
{ |
|
"best_metric": 1.3539438247680664, |
|
"best_model_checkpoint": "longt5_xl_gov_bp_20/checkpoint-34", |
|
"epoch": 4.983967017865323, |
|
"eval_steps": 500, |
|
"global_step": 170, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.001, |
|
"loss": 1.0271, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.001, |
|
"loss": 1.2091, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.001, |
|
"loss": 0.8029, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.001, |
|
"loss": 0.8061, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 0.001, |
|
"loss": 0.8036, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 0.001, |
|
"loss": 0.7449, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 0.001, |
|
"loss": 0.776, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 0.001, |
|
"loss": 0.8082, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 0.001, |
|
"loss": 0.8653, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.001, |
|
"loss": 0.7104, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 0.001, |
|
"loss": 0.8144, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.001, |
|
"loss": 0.7039, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 0.001, |
|
"loss": 0.6056, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 0.001, |
|
"loss": 0.6466, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 0.001, |
|
"loss": 0.6258, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 0.001, |
|
"loss": 0.7159, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 0.001, |
|
"loss": 0.6879, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.3539438247680664, |
|
"eval_runtime": 343.3833, |
|
"eval_samples_per_second": 2.831, |
|
"eval_steps_per_second": 0.355, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"learning_rate": 0.001, |
|
"loss": 0.591, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"learning_rate": 0.001, |
|
"loss": 0.5727, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"learning_rate": 0.001, |
|
"loss": 0.5399, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"learning_rate": 0.001, |
|
"loss": 0.5328, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"learning_rate": 0.001, |
|
"loss": 0.5348, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"learning_rate": 0.001, |
|
"loss": 0.5264, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"learning_rate": 0.001, |
|
"loss": 0.5156, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"learning_rate": 0.001, |
|
"loss": 0.5209, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"learning_rate": 0.001, |
|
"loss": 0.5102, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"learning_rate": 0.001, |
|
"loss": 0.4935, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"learning_rate": 0.001, |
|
"loss": 0.5006, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"learning_rate": 0.001, |
|
"loss": 0.5027, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"learning_rate": 0.001, |
|
"loss": 0.4907, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"learning_rate": 0.001, |
|
"loss": 0.5059, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 0.001, |
|
"loss": 0.5053, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"learning_rate": 0.001, |
|
"loss": 0.4862, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 0.001, |
|
"loss": 0.4865, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"eval_loss": 1.3666499853134155, |
|
"eval_runtime": 344.5818, |
|
"eval_samples_per_second": 2.821, |
|
"eval_steps_per_second": 0.354, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"learning_rate": 0.001, |
|
"loss": 0.4426, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"learning_rate": 0.001, |
|
"loss": 0.4408, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"learning_rate": 0.001, |
|
"loss": 0.4258, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"learning_rate": 0.001, |
|
"loss": 0.4243, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"learning_rate": 0.001, |
|
"loss": 0.4235, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"learning_rate": 0.001, |
|
"loss": 0.4426, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"learning_rate": 0.001, |
|
"loss": 0.4366, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"learning_rate": 0.001, |
|
"loss": 0.4253, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"learning_rate": 0.001, |
|
"loss": 0.4443, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"learning_rate": 0.001, |
|
"loss": 0.4301, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"learning_rate": 0.001, |
|
"loss": 0.4368, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"learning_rate": 0.001, |
|
"loss": 0.4403, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"learning_rate": 0.001, |
|
"loss": 0.3899, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"learning_rate": 0.001, |
|
"loss": 0.406, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"learning_rate": 0.001, |
|
"loss": 0.4295, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"learning_rate": 0.001, |
|
"loss": 0.4463, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"learning_rate": 0.001, |
|
"loss": 0.436, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"eval_loss": 1.5058249235153198, |
|
"eval_runtime": 343.8598, |
|
"eval_samples_per_second": 2.827, |
|
"eval_steps_per_second": 0.355, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"learning_rate": 0.001, |
|
"loss": 0.3902, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"learning_rate": 0.001, |
|
"loss": 0.3745, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 0.001, |
|
"loss": 0.375, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"learning_rate": 0.001, |
|
"loss": 0.382, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"learning_rate": 0.001, |
|
"loss": 0.3798, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"learning_rate": 0.001, |
|
"loss": 0.3777, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"learning_rate": 0.001, |
|
"loss": 0.3884, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"learning_rate": 0.001, |
|
"loss": 0.3915, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"learning_rate": 0.001, |
|
"loss": 0.3898, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"learning_rate": 0.001, |
|
"loss": 0.3833, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"learning_rate": 0.001, |
|
"loss": 0.3831, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"learning_rate": 0.001, |
|
"loss": 0.3778, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"learning_rate": 0.001, |
|
"loss": 0.3774, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"learning_rate": 0.001, |
|
"loss": 0.3883, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"learning_rate": 0.001, |
|
"loss": 0.3943, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"learning_rate": 0.001, |
|
"loss": 0.395, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"learning_rate": 0.001, |
|
"loss": 0.3893, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"eval_loss": 1.4799894094467163, |
|
"eval_runtime": 343.1889, |
|
"eval_samples_per_second": 2.832, |
|
"eval_steps_per_second": 0.355, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"learning_rate": 0.001, |
|
"loss": 0.3537, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"learning_rate": 0.001, |
|
"loss": 0.3376, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"learning_rate": 0.001, |
|
"loss": 0.334, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"learning_rate": 0.001, |
|
"loss": 0.3429, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"learning_rate": 0.001, |
|
"loss": 0.3464, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"learning_rate": 0.001, |
|
"loss": 0.3483, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"learning_rate": 0.001, |
|
"loss": 0.3407, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"learning_rate": 0.001, |
|
"loss": 0.3501, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"learning_rate": 0.001, |
|
"loss": 0.3409, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"learning_rate": 0.001, |
|
"loss": 0.3493, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"learning_rate": 0.001, |
|
"loss": 0.3402, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"learning_rate": 0.001, |
|
"loss": 0.3446, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"learning_rate": 0.001, |
|
"loss": 0.3483, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"learning_rate": 0.001, |
|
"loss": 0.3511, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"learning_rate": 0.001, |
|
"loss": 0.3515, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"learning_rate": 0.001, |
|
"loss": 0.3471, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"learning_rate": 0.001, |
|
"loss": 0.3517, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"eval_loss": 1.5910658836364746, |
|
"eval_runtime": 343.3017, |
|
"eval_samples_per_second": 2.831, |
|
"eval_steps_per_second": 0.355, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"step": 170, |
|
"total_flos": 5.953337981380067e+18, |
|
"train_loss": 0.493012877071605, |
|
"train_runtime": 118230.4451, |
|
"train_samples_per_second": 0.738, |
|
"train_steps_per_second": 0.001 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 170, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 5.953337981380067e+18, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|