{
  "best_metric": 6.808236122131348,
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
  "epoch": 0.08294453084499741,
  "eval_steps": 50,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00041472265422498703,
      "eval_loss": 10.22995376586914,
      "eval_runtime": 26.0494,
      "eval_samples_per_second": 155.896,
      "eval_steps_per_second": 39.003,
      "step": 1
    },
    {
      "epoch": 0.001244167962674961,
      "grad_norm": 8.375945091247559,
      "learning_rate": 3e-05,
      "loss": 10.1464,
      "step": 3
    },
    {
      "epoch": 0.002488335925349922,
      "grad_norm": 8.747570991516113,
      "learning_rate": 6e-05,
      "loss": 9.8523,
      "step": 6
    },
    {
      "epoch": 0.0037325038880248835,
      "grad_norm": 8.138151168823242,
      "learning_rate": 9e-05,
      "loss": 9.5672,
      "step": 9
    },
    {
      "epoch": 0.004976671850699844,
      "grad_norm": 6.871833324432373,
      "learning_rate": 9.997266286704631e-05,
      "loss": 9.2243,
      "step": 12
    },
    {
      "epoch": 0.006220839813374806,
      "grad_norm": 6.836878776550293,
      "learning_rate": 9.98292246503335e-05,
      "loss": 9.0827,
      "step": 15
    },
    {
      "epoch": 0.007465007776049767,
      "grad_norm": 6.312921047210693,
      "learning_rate": 9.956320346634876e-05,
      "loss": 8.8257,
      "step": 18
    },
    {
      "epoch": 0.008709175738724729,
      "grad_norm": 5.9714837074279785,
      "learning_rate": 9.917525374361912e-05,
      "loss": 8.521,
      "step": 21
    },
    {
      "epoch": 0.009953343701399688,
      "grad_norm": 4.645086288452148,
      "learning_rate": 9.86663298624003e-05,
      "loss": 8.1377,
      "step": 24
    },
    {
      "epoch": 0.01119751166407465,
      "grad_norm": 4.165164470672607,
      "learning_rate": 9.803768380684242e-05,
      "loss": 7.9853,
      "step": 27
    },
    {
      "epoch": 0.012441679626749611,
      "grad_norm": 5.4236555099487305,
      "learning_rate": 9.729086208503174e-05,
      "loss": 7.9031,
      "step": 30
    },
    {
      "epoch": 0.013685847589424573,
      "grad_norm": 4.232346057891846,
      "learning_rate": 9.642770192448536e-05,
      "loss": 7.7511,
      "step": 33
    },
    {
      "epoch": 0.014930015552099534,
      "grad_norm": 3.674351215362549,
      "learning_rate": 9.545032675245813e-05,
      "loss": 7.4404,
      "step": 36
    },
    {
      "epoch": 0.016174183514774496,
      "grad_norm": 4.119446277618408,
      "learning_rate": 9.43611409721806e-05,
      "loss": 7.2037,
      "step": 39
    },
    {
      "epoch": 0.017418351477449457,
      "grad_norm": 4.06898307800293,
      "learning_rate": 9.316282404787871e-05,
      "loss": 7.3253,
      "step": 42
    },
    {
      "epoch": 0.01866251944012442,
      "grad_norm": 4.10125207901001,
      "learning_rate": 9.185832391312644e-05,
      "loss": 7.2997,
      "step": 45
    },
    {
      "epoch": 0.019906687402799376,
      "grad_norm": 4.548858642578125,
      "learning_rate": 9.045084971874738e-05,
      "loss": 7.4761,
      "step": 48
    },
    {
      "epoch": 0.020736132711249352,
      "eval_loss": 7.311596870422363,
      "eval_runtime": 26.1105,
      "eval_samples_per_second": 155.531,
      "eval_steps_per_second": 38.912,
      "step": 50
    },
    {
      "epoch": 0.021150855365474338,
      "grad_norm": 5.925373554229736,
      "learning_rate": 8.894386393810563e-05,
      "loss": 7.2564,
      "step": 51
    },
    {
      "epoch": 0.0223950233281493,
      "grad_norm": 5.66447114944458,
      "learning_rate": 8.73410738492077e-05,
      "loss": 7.397,
      "step": 54
    },
    {
      "epoch": 0.02363919129082426,
      "grad_norm": 3.098438024520874,
      "learning_rate": 8.564642241456986e-05,
      "loss": 7.2778,
      "step": 57
    },
    {
      "epoch": 0.024883359253499222,
      "grad_norm": 4.301871299743652,
      "learning_rate": 8.386407858128706e-05,
      "loss": 7.1488,
      "step": 60
    },
    {
      "epoch": 0.026127527216174184,
      "grad_norm": 3.441345453262329,
      "learning_rate": 8.199842702516583e-05,
      "loss": 7.6062,
      "step": 63
    },
    {
      "epoch": 0.027371695178849145,
      "grad_norm": 3.739030361175537,
      "learning_rate": 8.005405736415126e-05,
      "loss": 7.151,
      "step": 66
    },
    {
      "epoch": 0.028615863141524107,
      "grad_norm": 3.039222478866577,
      "learning_rate": 7.803575286758364e-05,
      "loss": 7.0776,
      "step": 69
    },
    {
      "epoch": 0.029860031104199068,
      "grad_norm": 3.4319775104522705,
      "learning_rate": 7.594847868906076e-05,
      "loss": 6.8775,
      "step": 72
    },
    {
      "epoch": 0.03110419906687403,
      "grad_norm": 3.82979416847229,
      "learning_rate": 7.379736965185368e-05,
      "loss": 7.171,
      "step": 75
    },
    {
      "epoch": 0.03234836702954899,
      "grad_norm": 3.212101697921753,
      "learning_rate": 7.158771761692464e-05,
      "loss": 7.223,
      "step": 78
    },
    {
      "epoch": 0.03359253499222395,
      "grad_norm": 2.7414333820343018,
      "learning_rate": 6.932495846462261e-05,
      "loss": 6.8763,
      "step": 81
    },
    {
      "epoch": 0.034836702954898914,
      "grad_norm": 3.106311082839966,
      "learning_rate": 6.701465872208216e-05,
      "loss": 7.24,
      "step": 84
    },
    {
      "epoch": 0.03608087091757387,
      "grad_norm": 3.515357494354248,
      "learning_rate": 6.466250186922325e-05,
      "loss": 6.9001,
      "step": 87
    },
    {
      "epoch": 0.03732503888024884,
      "grad_norm": 3.5490942001342773,
      "learning_rate": 6.227427435703997e-05,
      "loss": 6.8924,
      "step": 90
    },
    {
      "epoch": 0.038569206842923795,
      "grad_norm": 3.3202061653137207,
      "learning_rate": 5.985585137257401e-05,
      "loss": 6.8308,
      "step": 93
    },
    {
      "epoch": 0.03981337480559875,
      "grad_norm": 4.054980754852295,
      "learning_rate": 5.74131823855921e-05,
      "loss": 6.9539,
      "step": 96
    },
    {
      "epoch": 0.04105754276827372,
      "grad_norm": 3.8746256828308105,
      "learning_rate": 5.495227651252315e-05,
      "loss": 6.7836,
      "step": 99
    },
    {
      "epoch": 0.041472265422498704,
      "eval_loss": 6.94667911529541,
      "eval_runtime": 26.1302,
      "eval_samples_per_second": 155.414,
      "eval_steps_per_second": 38.882,
      "step": 100
    },
    {
      "epoch": 0.042301710730948676,
      "grad_norm": 6.138733863830566,
      "learning_rate": 5.247918773366112e-05,
      "loss": 7.08,
      "step": 102
    },
    {
      "epoch": 0.04354587869362364,
      "grad_norm": 3.955928087234497,
      "learning_rate": 5e-05,
      "loss": 7.2784,
      "step": 105
    },
    {
      "epoch": 0.0447900466562986,
      "grad_norm": 2.847581624984741,
      "learning_rate": 4.7520812266338885e-05,
      "loss": 7.0751,
      "step": 108
    },
    {
      "epoch": 0.046034214618973564,
      "grad_norm": 3.7220466136932373,
      "learning_rate": 4.504772348747687e-05,
      "loss": 6.8847,
      "step": 111
    },
    {
      "epoch": 0.04727838258164852,
      "grad_norm": 3.8197450637817383,
      "learning_rate": 4.2586817614407895e-05,
      "loss": 7.1759,
      "step": 114
    },
    {
      "epoch": 0.04852255054432349,
      "grad_norm": 3.3311004638671875,
      "learning_rate": 4.0144148627425993e-05,
      "loss": 6.7715,
      "step": 117
    },
    {
      "epoch": 0.049766718506998445,
      "grad_norm": 3.1715171337127686,
      "learning_rate": 3.772572564296005e-05,
      "loss": 7.196,
      "step": 120
    },
    {
      "epoch": 0.0510108864696734,
      "grad_norm": 3.3263697624206543,
      "learning_rate": 3.533749813077677e-05,
      "loss": 6.9274,
      "step": 123
    },
    {
      "epoch": 0.05225505443234837,
      "grad_norm": 2.904386281967163,
      "learning_rate": 3.298534127791785e-05,
      "loss": 6.757,
      "step": 126
    },
    {
      "epoch": 0.053499222395023326,
      "grad_norm": 3.113102436065674,
      "learning_rate": 3.0675041535377405e-05,
      "loss": 6.619,
      "step": 129
    },
    {
      "epoch": 0.05474339035769829,
      "grad_norm": 3.046929359436035,
      "learning_rate": 2.8412282383075363e-05,
      "loss": 6.7691,
      "step": 132
    },
    {
      "epoch": 0.05598755832037325,
      "grad_norm": 3.922736406326294,
      "learning_rate": 2.6202630348146324e-05,
      "loss": 6.6507,
      "step": 135
    },
    {
      "epoch": 0.05723172628304821,
      "grad_norm": 3.2588577270507812,
      "learning_rate": 2.405152131093926e-05,
      "loss": 6.6191,
      "step": 138
    },
    {
      "epoch": 0.05847589424572317,
      "grad_norm": 3.445479154586792,
      "learning_rate": 2.196424713241637e-05,
      "loss": 6.5481,
      "step": 141
    },
    {
      "epoch": 0.059720062208398136,
      "grad_norm": 4.105072021484375,
      "learning_rate": 1.9945942635848748e-05,
      "loss": 6.6649,
      "step": 144
    },
    {
      "epoch": 0.060964230171073094,
      "grad_norm": 4.807459831237793,
      "learning_rate": 1.800157297483417e-05,
      "loss": 6.343,
      "step": 147
    },
    {
      "epoch": 0.06220839813374806,
      "grad_norm": 6.678823471069336,
      "learning_rate": 1.6135921418712956e-05,
      "loss": 6.5526,
      "step": 150
    },
    {
      "epoch": 0.06220839813374806,
      "eval_loss": 6.823774814605713,
      "eval_runtime": 26.1162,
      "eval_samples_per_second": 155.498,
      "eval_steps_per_second": 38.903,
      "step": 150
    },
    {
      "epoch": 0.06345256609642301,
      "grad_norm": 3.967849016189575,
      "learning_rate": 1.435357758543015e-05,
      "loss": 6.9811,
      "step": 153
    },
    {
      "epoch": 0.06469673405909798,
      "grad_norm": 4.2464470863342285,
      "learning_rate": 1.2658926150792322e-05,
      "loss": 7.043,
      "step": 156
    },
    {
      "epoch": 0.06594090202177294,
      "grad_norm": 4.486046314239502,
      "learning_rate": 1.1056136061894384e-05,
      "loss": 6.8117,
      "step": 159
    },
    {
      "epoch": 0.0671850699844479,
      "grad_norm": 3.788048505783081,
      "learning_rate": 9.549150281252633e-06,
      "loss": 7.0248,
      "step": 162
    },
    {
      "epoch": 0.06842923794712286,
      "grad_norm": 3.6193251609802246,
      "learning_rate": 8.141676086873572e-06,
      "loss": 6.9322,
      "step": 165
    },
    {
      "epoch": 0.06967340590979783,
      "grad_norm": 3.7306830883026123,
      "learning_rate": 6.837175952121306e-06,
      "loss": 7.1483,
      "step": 168
    },
    {
      "epoch": 0.07091757387247279,
      "grad_norm": 3.2535622119903564,
      "learning_rate": 5.6388590278194096e-06,
      "loss": 6.8575,
      "step": 171
    },
    {
      "epoch": 0.07216174183514774,
      "grad_norm": 2.966529607772827,
      "learning_rate": 4.549673247541875e-06,
      "loss": 6.8145,
      "step": 174
    },
    {
      "epoch": 0.0734059097978227,
      "grad_norm": 2.99233078956604,
      "learning_rate": 3.5722980755146517e-06,
      "loss": 6.9243,
      "step": 177
    },
    {
      "epoch": 0.07465007776049767,
      "grad_norm": 3.4011435508728027,
      "learning_rate": 2.7091379149682685e-06,
      "loss": 6.7688,
      "step": 180
    },
    {
      "epoch": 0.07589424572317263,
      "grad_norm": 3.149674892425537,
      "learning_rate": 1.962316193157593e-06,
      "loss": 6.7597,
      "step": 183
    },
    {
      "epoch": 0.07713841368584759,
      "grad_norm": 3.01259446144104,
      "learning_rate": 1.333670137599713e-06,
      "loss": 6.6439,
      "step": 186
    },
    {
      "epoch": 0.07838258164852255,
      "grad_norm": 3.5751147270202637,
      "learning_rate": 8.247462563808817e-07,
      "loss": 6.6211,
      "step": 189
    },
    {
      "epoch": 0.0796267496111975,
      "grad_norm": 3.9128057956695557,
      "learning_rate": 4.367965336512403e-07,
      "loss": 6.7342,
      "step": 192
    },
    {
      "epoch": 0.08087091757387248,
      "grad_norm": 3.7299938201904297,
      "learning_rate": 1.7077534966650766e-07,
      "loss": 6.49,
      "step": 195
    },
    {
      "epoch": 0.08211508553654744,
      "grad_norm": 5.241379261016846,
      "learning_rate": 2.7337132953697554e-08,
      "loss": 6.8148,
      "step": 198
    },
    {
      "epoch": 0.08294453084499741,
      "eval_loss": 6.808236122131348,
      "eval_runtime": 26.1262,
      "eval_samples_per_second": 155.438,
      "eval_steps_per_second": 38.888,
      "step": 200
    }
  ],
  "logging_steps": 3,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 130988066734080.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}