{
"best_metric": 6.808236122131348,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.08294453084499741,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00041472265422498703,
"eval_loss": 10.22995376586914,
"eval_runtime": 26.0494,
"eval_samples_per_second": 155.896,
"eval_steps_per_second": 39.003,
"step": 1
},
{
"epoch": 0.001244167962674961,
"grad_norm": 8.375945091247559,
"learning_rate": 3e-05,
"loss": 10.1464,
"step": 3
},
{
"epoch": 0.002488335925349922,
"grad_norm": 8.747570991516113,
"learning_rate": 6e-05,
"loss": 9.8523,
"step": 6
},
{
"epoch": 0.0037325038880248835,
"grad_norm": 8.138151168823242,
"learning_rate": 9e-05,
"loss": 9.5672,
"step": 9
},
{
"epoch": 0.004976671850699844,
"grad_norm": 6.871833324432373,
"learning_rate": 9.997266286704631e-05,
"loss": 9.2243,
"step": 12
},
{
"epoch": 0.006220839813374806,
"grad_norm": 6.836878776550293,
"learning_rate": 9.98292246503335e-05,
"loss": 9.0827,
"step": 15
},
{
"epoch": 0.007465007776049767,
"grad_norm": 6.312921047210693,
"learning_rate": 9.956320346634876e-05,
"loss": 8.8257,
"step": 18
},
{
"epoch": 0.008709175738724729,
"grad_norm": 5.9714837074279785,
"learning_rate": 9.917525374361912e-05,
"loss": 8.521,
"step": 21
},
{
"epoch": 0.009953343701399688,
"grad_norm": 4.645086288452148,
"learning_rate": 9.86663298624003e-05,
"loss": 8.1377,
"step": 24
},
{
"epoch": 0.01119751166407465,
"grad_norm": 4.165164470672607,
"learning_rate": 9.803768380684242e-05,
"loss": 7.9853,
"step": 27
},
{
"epoch": 0.012441679626749611,
"grad_norm": 5.4236555099487305,
"learning_rate": 9.729086208503174e-05,
"loss": 7.9031,
"step": 30
},
{
"epoch": 0.013685847589424573,
"grad_norm": 4.232346057891846,
"learning_rate": 9.642770192448536e-05,
"loss": 7.7511,
"step": 33
},
{
"epoch": 0.014930015552099534,
"grad_norm": 3.674351215362549,
"learning_rate": 9.545032675245813e-05,
"loss": 7.4404,
"step": 36
},
{
"epoch": 0.016174183514774496,
"grad_norm": 4.119446277618408,
"learning_rate": 9.43611409721806e-05,
"loss": 7.2037,
"step": 39
},
{
"epoch": 0.017418351477449457,
"grad_norm": 4.06898307800293,
"learning_rate": 9.316282404787871e-05,
"loss": 7.3253,
"step": 42
},
{
"epoch": 0.01866251944012442,
"grad_norm": 4.10125207901001,
"learning_rate": 9.185832391312644e-05,
"loss": 7.2997,
"step": 45
},
{
"epoch": 0.019906687402799376,
"grad_norm": 4.548858642578125,
"learning_rate": 9.045084971874738e-05,
"loss": 7.4761,
"step": 48
},
{
"epoch": 0.020736132711249352,
"eval_loss": 7.311596870422363,
"eval_runtime": 26.1105,
"eval_samples_per_second": 155.531,
"eval_steps_per_second": 38.912,
"step": 50
},
{
"epoch": 0.021150855365474338,
"grad_norm": 5.925373554229736,
"learning_rate": 8.894386393810563e-05,
"loss": 7.2564,
"step": 51
},
{
"epoch": 0.0223950233281493,
"grad_norm": 5.66447114944458,
"learning_rate": 8.73410738492077e-05,
"loss": 7.397,
"step": 54
},
{
"epoch": 0.02363919129082426,
"grad_norm": 3.098438024520874,
"learning_rate": 8.564642241456986e-05,
"loss": 7.2778,
"step": 57
},
{
"epoch": 0.024883359253499222,
"grad_norm": 4.301871299743652,
"learning_rate": 8.386407858128706e-05,
"loss": 7.1488,
"step": 60
},
{
"epoch": 0.026127527216174184,
"grad_norm": 3.441345453262329,
"learning_rate": 8.199842702516583e-05,
"loss": 7.6062,
"step": 63
},
{
"epoch": 0.027371695178849145,
"grad_norm": 3.739030361175537,
"learning_rate": 8.005405736415126e-05,
"loss": 7.151,
"step": 66
},
{
"epoch": 0.028615863141524107,
"grad_norm": 3.039222478866577,
"learning_rate": 7.803575286758364e-05,
"loss": 7.0776,
"step": 69
},
{
"epoch": 0.029860031104199068,
"grad_norm": 3.4319775104522705,
"learning_rate": 7.594847868906076e-05,
"loss": 6.8775,
"step": 72
},
{
"epoch": 0.03110419906687403,
"grad_norm": 3.82979416847229,
"learning_rate": 7.379736965185368e-05,
"loss": 7.171,
"step": 75
},
{
"epoch": 0.03234836702954899,
"grad_norm": 3.212101697921753,
"learning_rate": 7.158771761692464e-05,
"loss": 7.223,
"step": 78
},
{
"epoch": 0.03359253499222395,
"grad_norm": 2.7414333820343018,
"learning_rate": 6.932495846462261e-05,
"loss": 6.8763,
"step": 81
},
{
"epoch": 0.034836702954898914,
"grad_norm": 3.106311082839966,
"learning_rate": 6.701465872208216e-05,
"loss": 7.24,
"step": 84
},
{
"epoch": 0.03608087091757387,
"grad_norm": 3.515357494354248,
"learning_rate": 6.466250186922325e-05,
"loss": 6.9001,
"step": 87
},
{
"epoch": 0.03732503888024884,
"grad_norm": 3.5490942001342773,
"learning_rate": 6.227427435703997e-05,
"loss": 6.8924,
"step": 90
},
{
"epoch": 0.038569206842923795,
"grad_norm": 3.3202061653137207,
"learning_rate": 5.985585137257401e-05,
"loss": 6.8308,
"step": 93
},
{
"epoch": 0.03981337480559875,
"grad_norm": 4.054980754852295,
"learning_rate": 5.74131823855921e-05,
"loss": 6.9539,
"step": 96
},
{
"epoch": 0.04105754276827372,
"grad_norm": 3.8746256828308105,
"learning_rate": 5.495227651252315e-05,
"loss": 6.7836,
"step": 99
},
{
"epoch": 0.041472265422498704,
"eval_loss": 6.94667911529541,
"eval_runtime": 26.1302,
"eval_samples_per_second": 155.414,
"eval_steps_per_second": 38.882,
"step": 100
},
{
"epoch": 0.042301710730948676,
"grad_norm": 6.138733863830566,
"learning_rate": 5.247918773366112e-05,
"loss": 7.08,
"step": 102
},
{
"epoch": 0.04354587869362364,
"grad_norm": 3.955928087234497,
"learning_rate": 5e-05,
"loss": 7.2784,
"step": 105
},
{
"epoch": 0.0447900466562986,
"grad_norm": 2.847581624984741,
"learning_rate": 4.7520812266338885e-05,
"loss": 7.0751,
"step": 108
},
{
"epoch": 0.046034214618973564,
"grad_norm": 3.7220466136932373,
"learning_rate": 4.504772348747687e-05,
"loss": 6.8847,
"step": 111
},
{
"epoch": 0.04727838258164852,
"grad_norm": 3.8197450637817383,
"learning_rate": 4.2586817614407895e-05,
"loss": 7.1759,
"step": 114
},
{
"epoch": 0.04852255054432349,
"grad_norm": 3.3311004638671875,
"learning_rate": 4.0144148627425993e-05,
"loss": 6.7715,
"step": 117
},
{
"epoch": 0.049766718506998445,
"grad_norm": 3.1715171337127686,
"learning_rate": 3.772572564296005e-05,
"loss": 7.196,
"step": 120
},
{
"epoch": 0.0510108864696734,
"grad_norm": 3.3263697624206543,
"learning_rate": 3.533749813077677e-05,
"loss": 6.9274,
"step": 123
},
{
"epoch": 0.05225505443234837,
"grad_norm": 2.904386281967163,
"learning_rate": 3.298534127791785e-05,
"loss": 6.757,
"step": 126
},
{
"epoch": 0.053499222395023326,
"grad_norm": 3.113102436065674,
"learning_rate": 3.0675041535377405e-05,
"loss": 6.619,
"step": 129
},
{
"epoch": 0.05474339035769829,
"grad_norm": 3.046929359436035,
"learning_rate": 2.8412282383075363e-05,
"loss": 6.7691,
"step": 132
},
{
"epoch": 0.05598755832037325,
"grad_norm": 3.922736406326294,
"learning_rate": 2.6202630348146324e-05,
"loss": 6.6507,
"step": 135
},
{
"epoch": 0.05723172628304821,
"grad_norm": 3.2588577270507812,
"learning_rate": 2.405152131093926e-05,
"loss": 6.6191,
"step": 138
},
{
"epoch": 0.05847589424572317,
"grad_norm": 3.445479154586792,
"learning_rate": 2.196424713241637e-05,
"loss": 6.5481,
"step": 141
},
{
"epoch": 0.059720062208398136,
"grad_norm": 4.105072021484375,
"learning_rate": 1.9945942635848748e-05,
"loss": 6.6649,
"step": 144
},
{
"epoch": 0.060964230171073094,
"grad_norm": 4.807459831237793,
"learning_rate": 1.800157297483417e-05,
"loss": 6.343,
"step": 147
},
{
"epoch": 0.06220839813374806,
"grad_norm": 6.678823471069336,
"learning_rate": 1.6135921418712956e-05,
"loss": 6.5526,
"step": 150
},
{
"epoch": 0.06220839813374806,
"eval_loss": 6.823774814605713,
"eval_runtime": 26.1162,
"eval_samples_per_second": 155.498,
"eval_steps_per_second": 38.903,
"step": 150
},
{
"epoch": 0.06345256609642301,
"grad_norm": 3.967849016189575,
"learning_rate": 1.435357758543015e-05,
"loss": 6.9811,
"step": 153
},
{
"epoch": 0.06469673405909798,
"grad_norm": 4.2464470863342285,
"learning_rate": 1.2658926150792322e-05,
"loss": 7.043,
"step": 156
},
{
"epoch": 0.06594090202177294,
"grad_norm": 4.486046314239502,
"learning_rate": 1.1056136061894384e-05,
"loss": 6.8117,
"step": 159
},
{
"epoch": 0.0671850699844479,
"grad_norm": 3.788048505783081,
"learning_rate": 9.549150281252633e-06,
"loss": 7.0248,
"step": 162
},
{
"epoch": 0.06842923794712286,
"grad_norm": 3.6193251609802246,
"learning_rate": 8.141676086873572e-06,
"loss": 6.9322,
"step": 165
},
{
"epoch": 0.06967340590979783,
"grad_norm": 3.7306830883026123,
"learning_rate": 6.837175952121306e-06,
"loss": 7.1483,
"step": 168
},
{
"epoch": 0.07091757387247279,
"grad_norm": 3.2535622119903564,
"learning_rate": 5.6388590278194096e-06,
"loss": 6.8575,
"step": 171
},
{
"epoch": 0.07216174183514774,
"grad_norm": 2.966529607772827,
"learning_rate": 4.549673247541875e-06,
"loss": 6.8145,
"step": 174
},
{
"epoch": 0.0734059097978227,
"grad_norm": 2.99233078956604,
"learning_rate": 3.5722980755146517e-06,
"loss": 6.9243,
"step": 177
},
{
"epoch": 0.07465007776049767,
"grad_norm": 3.4011435508728027,
"learning_rate": 2.7091379149682685e-06,
"loss": 6.7688,
"step": 180
},
{
"epoch": 0.07589424572317263,
"grad_norm": 3.149674892425537,
"learning_rate": 1.962316193157593e-06,
"loss": 6.7597,
"step": 183
},
{
"epoch": 0.07713841368584759,
"grad_norm": 3.01259446144104,
"learning_rate": 1.333670137599713e-06,
"loss": 6.6439,
"step": 186
},
{
"epoch": 0.07838258164852255,
"grad_norm": 3.5751147270202637,
"learning_rate": 8.247462563808817e-07,
"loss": 6.6211,
"step": 189
},
{
"epoch": 0.0796267496111975,
"grad_norm": 3.9128057956695557,
"learning_rate": 4.367965336512403e-07,
"loss": 6.7342,
"step": 192
},
{
"epoch": 0.08087091757387248,
"grad_norm": 3.7299938201904297,
"learning_rate": 1.7077534966650766e-07,
"loss": 6.49,
"step": 195
},
{
"epoch": 0.08211508553654744,
"grad_norm": 5.241379261016846,
"learning_rate": 2.7337132953697554e-08,
"loss": 6.8148,
"step": 198
},
{
"epoch": 0.08294453084499741,
"eval_loss": 6.808236122131348,
"eval_runtime": 26.1262,
"eval_samples_per_second": 155.438,
"eval_steps_per_second": 38.888,
"step": 200
}
],
"logging_steps": 3,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 130988066734080.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}