dada22231's picture
Training in progress, step 95, checkpoint
568a2cc verified
{
"best_metric": 0.26534104347229004,
"best_model_checkpoint": "miner_id_24/checkpoint-75",
"epoch": 0.27012617735916117,
"eval_steps": 25,
"global_step": 95,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0028434334458859074,
"grad_norm": 22.599037170410156,
"learning_rate": 3.3333333333333335e-05,
"loss": 5.3378,
"step": 1
},
{
"epoch": 0.0028434334458859074,
"eval_loss": 5.232264995574951,
"eval_runtime": 3.9625,
"eval_samples_per_second": 12.618,
"eval_steps_per_second": 3.281,
"step": 1
},
{
"epoch": 0.005686866891771815,
"grad_norm": 23.47909164428711,
"learning_rate": 6.666666666666667e-05,
"loss": 5.2597,
"step": 2
},
{
"epoch": 0.008530300337657722,
"grad_norm": 18.171615600585938,
"learning_rate": 0.0001,
"loss": 4.3128,
"step": 3
},
{
"epoch": 0.01137373378354363,
"grad_norm": 14.74112606048584,
"learning_rate": 9.997376600647783e-05,
"loss": 2.5084,
"step": 4
},
{
"epoch": 0.014217167229429535,
"grad_norm": 11.828715324401855,
"learning_rate": 9.989509461357426e-05,
"loss": 1.0016,
"step": 5
},
{
"epoch": 0.017060600675315445,
"grad_norm": 2.351505756378174,
"learning_rate": 9.976407754861426e-05,
"loss": 0.4414,
"step": 6
},
{
"epoch": 0.019904034121201352,
"grad_norm": 5.641177177429199,
"learning_rate": 9.958086757163489e-05,
"loss": 0.4406,
"step": 7
},
{
"epoch": 0.02274746756708726,
"grad_norm": 2.2351808547973633,
"learning_rate": 9.934567829727386e-05,
"loss": 0.3914,
"step": 8
},
{
"epoch": 0.025590901012973163,
"grad_norm": 2.748798131942749,
"learning_rate": 9.905878394570453e-05,
"loss": 0.3943,
"step": 9
},
{
"epoch": 0.02843433445885907,
"grad_norm": 2.5455727577209473,
"learning_rate": 9.872051902290737e-05,
"loss": 0.378,
"step": 10
},
{
"epoch": 0.03127776790474498,
"grad_norm": 3.8824398517608643,
"learning_rate": 9.833127793065098e-05,
"loss": 0.4721,
"step": 11
},
{
"epoch": 0.03412120135063089,
"grad_norm": 3.543416976928711,
"learning_rate": 9.789151450663723e-05,
"loss": 0.3677,
"step": 12
},
{
"epoch": 0.03696463479651679,
"grad_norm": 1.8024444580078125,
"learning_rate": 9.740174149534693e-05,
"loss": 0.3316,
"step": 13
},
{
"epoch": 0.039808068242402704,
"grad_norm": 1.9301329851150513,
"learning_rate": 9.686252995020249e-05,
"loss": 0.3689,
"step": 14
},
{
"epoch": 0.04265150168828861,
"grad_norm": 2.8067333698272705,
"learning_rate": 9.627450856774539e-05,
"loss": 0.3827,
"step": 15
},
{
"epoch": 0.04549493513417452,
"grad_norm": 1.4591389894485474,
"learning_rate": 9.563836295460398e-05,
"loss": 0.3528,
"step": 16
},
{
"epoch": 0.04833836858006042,
"grad_norm": 1.4871190786361694,
"learning_rate": 9.495483482810688e-05,
"loss": 0.3235,
"step": 17
},
{
"epoch": 0.05118180202594633,
"grad_norm": 1.626112937927246,
"learning_rate": 9.422472115147382e-05,
"loss": 0.3317,
"step": 18
},
{
"epoch": 0.05402523547183224,
"grad_norm": 1.4325222969055176,
"learning_rate": 9.3448873204592e-05,
"loss": 0.3437,
"step": 19
},
{
"epoch": 0.05686866891771814,
"grad_norm": 2.8382790088653564,
"learning_rate": 9.2628195591462e-05,
"loss": 0.3757,
"step": 20
},
{
"epoch": 0.05971210236360405,
"grad_norm": 1.625089406967163,
"learning_rate": 9.176364518546989e-05,
"loss": 0.3304,
"step": 21
},
{
"epoch": 0.06255553580948996,
"grad_norm": 1.772882342338562,
"learning_rate": 9.08562300137157e-05,
"loss": 0.3802,
"step": 22
},
{
"epoch": 0.06539896925537586,
"grad_norm": 1.3592987060546875,
"learning_rate": 8.990700808169889e-05,
"loss": 0.3369,
"step": 23
},
{
"epoch": 0.06824240270126178,
"grad_norm": 1.762557864189148,
"learning_rate": 8.891708613973126e-05,
"loss": 0.3376,
"step": 24
},
{
"epoch": 0.07108583614714768,
"grad_norm": 1.9905650615692139,
"learning_rate": 8.788761839251559e-05,
"loss": 0.2961,
"step": 25
},
{
"epoch": 0.07108583614714768,
"eval_loss": 0.3258455991744995,
"eval_runtime": 4.0271,
"eval_samples_per_second": 12.416,
"eval_steps_per_second": 3.228,
"step": 25
},
{
"epoch": 0.07392926959303359,
"grad_norm": 0.7239434719085693,
"learning_rate": 8.681980515339464e-05,
"loss": 0.3381,
"step": 26
},
{
"epoch": 0.07677270303891949,
"grad_norm": 0.8661226034164429,
"learning_rate": 8.571489144483944e-05,
"loss": 0.3448,
"step": 27
},
{
"epoch": 0.07961613648480541,
"grad_norm": 1.6016333103179932,
"learning_rate": 8.457416554680877e-05,
"loss": 0.3474,
"step": 28
},
{
"epoch": 0.08245956993069131,
"grad_norm": 1.2299487590789795,
"learning_rate": 8.339895749467238e-05,
"loss": 0.3493,
"step": 29
},
{
"epoch": 0.08530300337657722,
"grad_norm": 1.4147472381591797,
"learning_rate": 8.219063752844926e-05,
"loss": 0.3162,
"step": 30
},
{
"epoch": 0.08814643682246312,
"grad_norm": 1.3127275705337524,
"learning_rate": 8.095061449516903e-05,
"loss": 0.3306,
"step": 31
},
{
"epoch": 0.09098987026834904,
"grad_norm": 1.7408323287963867,
"learning_rate": 7.968033420621935e-05,
"loss": 0.3163,
"step": 32
},
{
"epoch": 0.09383330371423494,
"grad_norm": 1.461795687675476,
"learning_rate": 7.838127775159452e-05,
"loss": 0.2727,
"step": 33
},
{
"epoch": 0.09667673716012085,
"grad_norm": 1.4612610340118408,
"learning_rate": 7.705495977301078e-05,
"loss": 0.3574,
"step": 34
},
{
"epoch": 0.09952017060600675,
"grad_norm": 1.4245153665542603,
"learning_rate": 7.570292669790186e-05,
"loss": 0.3115,
"step": 35
},
{
"epoch": 0.10236360405189265,
"grad_norm": 1.1729720830917358,
"learning_rate": 7.43267549363537e-05,
"loss": 0.3435,
"step": 36
},
{
"epoch": 0.10520703749777857,
"grad_norm": 0.9695225954055786,
"learning_rate": 7.292804904308087e-05,
"loss": 0.2869,
"step": 37
},
{
"epoch": 0.10805047094366448,
"grad_norm": 0.7456521391868591,
"learning_rate": 7.150843984658754e-05,
"loss": 0.2947,
"step": 38
},
{
"epoch": 0.11089390438955038,
"grad_norm": 0.8560568690299988,
"learning_rate": 7.006958254769438e-05,
"loss": 0.3231,
"step": 39
},
{
"epoch": 0.11373733783543628,
"grad_norm": 0.9865386486053467,
"learning_rate": 6.861315478964841e-05,
"loss": 0.302,
"step": 40
},
{
"epoch": 0.1165807712813222,
"grad_norm": 0.9534958004951477,
"learning_rate": 6.714085470206609e-05,
"loss": 0.3014,
"step": 41
},
{
"epoch": 0.1194242047272081,
"grad_norm": 0.8210452198982239,
"learning_rate": 6.56543989209901e-05,
"loss": 0.2968,
"step": 42
},
{
"epoch": 0.12226763817309401,
"grad_norm": 1.0526001453399658,
"learning_rate": 6.415552058736854e-05,
"loss": 0.3187,
"step": 43
},
{
"epoch": 0.1251110716189799,
"grad_norm": 1.1468490362167358,
"learning_rate": 6.264596732629e-05,
"loss": 0.2978,
"step": 44
},
{
"epoch": 0.12795450506486583,
"grad_norm": 1.0789867639541626,
"learning_rate": 6.112749920933111e-05,
"loss": 0.2907,
"step": 45
},
{
"epoch": 0.13079793851075172,
"grad_norm": 1.4448879957199097,
"learning_rate": 5.960188670239154e-05,
"loss": 0.3332,
"step": 46
},
{
"epoch": 0.13364137195663764,
"grad_norm": 1.1925578117370605,
"learning_rate": 5.80709086014102e-05,
"loss": 0.2723,
"step": 47
},
{
"epoch": 0.13648480540252356,
"grad_norm": 1.3483730554580688,
"learning_rate": 5.653634995836856e-05,
"loss": 0.3051,
"step": 48
},
{
"epoch": 0.13932823884840945,
"grad_norm": 1.0803078413009644,
"learning_rate": 5.500000000000001e-05,
"loss": 0.2854,
"step": 49
},
{
"epoch": 0.14217167229429536,
"grad_norm": 1.0243525505065918,
"learning_rate": 5.346365004163145e-05,
"loss": 0.2797,
"step": 50
},
{
"epoch": 0.14217167229429536,
"eval_loss": 0.27564024925231934,
"eval_runtime": 4.0615,
"eval_samples_per_second": 12.311,
"eval_steps_per_second": 3.201,
"step": 50
},
{
"epoch": 0.14501510574018128,
"grad_norm": 1.0136526823043823,
"learning_rate": 5.192909139858981e-05,
"loss": 0.3151,
"step": 51
},
{
"epoch": 0.14785853918606717,
"grad_norm": 1.1708089113235474,
"learning_rate": 5.0398113297608465e-05,
"loss": 0.2924,
"step": 52
},
{
"epoch": 0.1507019726319531,
"grad_norm": 0.8153492212295532,
"learning_rate": 4.887250079066892e-05,
"loss": 0.2776,
"step": 53
},
{
"epoch": 0.15354540607783898,
"grad_norm": 1.1344809532165527,
"learning_rate": 4.7354032673710005e-05,
"loss": 0.2746,
"step": 54
},
{
"epoch": 0.1563888395237249,
"grad_norm": 1.1401007175445557,
"learning_rate": 4.584447941263149e-05,
"loss": 0.309,
"step": 55
},
{
"epoch": 0.15923227296961082,
"grad_norm": 1.3479905128479004,
"learning_rate": 4.43456010790099e-05,
"loss": 0.2876,
"step": 56
},
{
"epoch": 0.1620757064154967,
"grad_norm": 1.0521594285964966,
"learning_rate": 4.285914529793391e-05,
"loss": 0.2562,
"step": 57
},
{
"epoch": 0.16491913986138262,
"grad_norm": 0.9862216711044312,
"learning_rate": 4.13868452103516e-05,
"loss": 0.2832,
"step": 58
},
{
"epoch": 0.16776257330726851,
"grad_norm": 1.2180746793746948,
"learning_rate": 3.9930417452305626e-05,
"loss": 0.3047,
"step": 59
},
{
"epoch": 0.17060600675315443,
"grad_norm": 1.5846315622329712,
"learning_rate": 3.8491560153412466e-05,
"loss": 0.2768,
"step": 60
},
{
"epoch": 0.17344944019904035,
"grad_norm": 1.3017767667770386,
"learning_rate": 3.707195095691913e-05,
"loss": 0.2877,
"step": 61
},
{
"epoch": 0.17629287364492624,
"grad_norm": 1.473238468170166,
"learning_rate": 3.567324506364632e-05,
"loss": 0.2664,
"step": 62
},
{
"epoch": 0.17913630709081216,
"grad_norm": 1.2154439687728882,
"learning_rate": 3.4297073302098156e-05,
"loss": 0.2768,
"step": 63
},
{
"epoch": 0.18197974053669808,
"grad_norm": 1.683435082435608,
"learning_rate": 3.2945040226989244e-05,
"loss": 0.2896,
"step": 64
},
{
"epoch": 0.18482317398258397,
"grad_norm": 1.3778927326202393,
"learning_rate": 3.16187222484055e-05,
"loss": 0.295,
"step": 65
},
{
"epoch": 0.18766660742846988,
"grad_norm": 1.4608365297317505,
"learning_rate": 3.0319665793780648e-05,
"loss": 0.2611,
"step": 66
},
{
"epoch": 0.19051004087435577,
"grad_norm": 1.2854911088943481,
"learning_rate": 2.9049385504830985e-05,
"loss": 0.2541,
"step": 67
},
{
"epoch": 0.1933534743202417,
"grad_norm": 1.3991892337799072,
"learning_rate": 2.7809362471550748e-05,
"loss": 0.2624,
"step": 68
},
{
"epoch": 0.1961969077661276,
"grad_norm": 1.1210330724716187,
"learning_rate": 2.660104250532764e-05,
"loss": 0.2684,
"step": 69
},
{
"epoch": 0.1990403412120135,
"grad_norm": 1.145573616027832,
"learning_rate": 2.5425834453191232e-05,
"loss": 0.2504,
"step": 70
},
{
"epoch": 0.20188377465789942,
"grad_norm": 0.9745646715164185,
"learning_rate": 2.4285108555160577e-05,
"loss": 0.249,
"step": 71
},
{
"epoch": 0.2047272081037853,
"grad_norm": 1.0583624839782715,
"learning_rate": 2.3180194846605367e-05,
"loss": 0.2482,
"step": 72
},
{
"epoch": 0.20757064154967123,
"grad_norm": 1.106729507446289,
"learning_rate": 2.2112381607484417e-05,
"loss": 0.2899,
"step": 73
},
{
"epoch": 0.21041407499555714,
"grad_norm": 1.0823482275009155,
"learning_rate": 2.1082913860268765e-05,
"loss": 0.2543,
"step": 74
},
{
"epoch": 0.21325750844144303,
"grad_norm": 0.9167066216468811,
"learning_rate": 2.0092991918301108e-05,
"loss": 0.2498,
"step": 75
},
{
"epoch": 0.21325750844144303,
"eval_loss": 0.26534104347229004,
"eval_runtime": 4.0551,
"eval_samples_per_second": 12.33,
"eval_steps_per_second": 3.206,
"step": 75
},
{
"epoch": 0.21610094188732895,
"grad_norm": 1.3717762231826782,
"learning_rate": 1.91437699862843e-05,
"loss": 0.2886,
"step": 76
},
{
"epoch": 0.21894437533321487,
"grad_norm": 1.5089960098266602,
"learning_rate": 1.8236354814530112e-05,
"loss": 0.3046,
"step": 77
},
{
"epoch": 0.22178780877910076,
"grad_norm": 1.2876425981521606,
"learning_rate": 1.7371804408538024e-05,
"loss": 0.2682,
"step": 78
},
{
"epoch": 0.22463124222498668,
"grad_norm": 1.4826658964157104,
"learning_rate": 1.6551126795408016e-05,
"loss": 0.2612,
"step": 79
},
{
"epoch": 0.22747467567087257,
"grad_norm": 1.2903155088424683,
"learning_rate": 1.577527884852619e-05,
"loss": 0.2599,
"step": 80
},
{
"epoch": 0.23031810911675848,
"grad_norm": 1.2806668281555176,
"learning_rate": 1.5045165171893116e-05,
"loss": 0.2419,
"step": 81
},
{
"epoch": 0.2331615425626444,
"grad_norm": 1.1345335245132446,
"learning_rate": 1.4361637045396029e-05,
"loss": 0.2579,
"step": 82
},
{
"epoch": 0.2360049760085303,
"grad_norm": 1.1329256296157837,
"learning_rate": 1.3725491432254624e-05,
"loss": 0.2659,
"step": 83
},
{
"epoch": 0.2388484094544162,
"grad_norm": 1.2562272548675537,
"learning_rate": 1.313747004979751e-05,
"loss": 0.2676,
"step": 84
},
{
"epoch": 0.24169184290030213,
"grad_norm": 0.992720901966095,
"learning_rate": 1.2598258504653081e-05,
"loss": 0.2283,
"step": 85
},
{
"epoch": 0.24453527634618802,
"grad_norm": 1.2588672637939453,
"learning_rate": 1.2108485493362765e-05,
"loss": 0.2456,
"step": 86
},
{
"epoch": 0.24737870979207394,
"grad_norm": 0.9704244136810303,
"learning_rate": 1.1668722069349041e-05,
"loss": 0.2605,
"step": 87
},
{
"epoch": 0.2502221432379598,
"grad_norm": 0.993431031703949,
"learning_rate": 1.1279480977092635e-05,
"loss": 0.2861,
"step": 88
},
{
"epoch": 0.2530655766838457,
"grad_norm": 1.2297654151916504,
"learning_rate": 1.094121605429547e-05,
"loss": 0.2832,
"step": 89
},
{
"epoch": 0.25590901012973166,
"grad_norm": 1.498766303062439,
"learning_rate": 1.0654321702726141e-05,
"loss": 0.271,
"step": 90
},
{
"epoch": 0.25875244357561755,
"grad_norm": 1.326579213142395,
"learning_rate": 1.0419132428365116e-05,
"loss": 0.2704,
"step": 91
},
{
"epoch": 0.26159587702150344,
"grad_norm": 1.417798399925232,
"learning_rate": 1.0235922451385733e-05,
"loss": 0.2509,
"step": 92
},
{
"epoch": 0.2644393104673894,
"grad_norm": 1.0304580926895142,
"learning_rate": 1.0104905386425733e-05,
"loss": 0.2516,
"step": 93
},
{
"epoch": 0.2672827439132753,
"grad_norm": 1.153357982635498,
"learning_rate": 1.002623399352217e-05,
"loss": 0.2606,
"step": 94
},
{
"epoch": 0.27012617735916117,
"grad_norm": 1.3540794849395752,
"learning_rate": 1e-05,
"loss": 0.267,
"step": 95
}
],
"logging_steps": 1,
"max_steps": 95,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.173151075448914e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}