{
"best_metric": 1.613356113433838,
"best_model_checkpoint": "output/checkpoint-6000",
"epoch": 3.0,
"eval_steps": 2000,
"global_step": 9984,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.030048076923076924,
"grad_norm": 2.3787589073181152,
"learning_rate": 2e-05,
"loss": 3.6173,
"step": 100
},
{
"epoch": 0.06009615384615385,
"grad_norm": 1.996222734451294,
"learning_rate": 4e-05,
"loss": 1.7498,
"step": 200
},
{
"epoch": 0.09014423076923077,
"grad_norm": 1.5797070264816284,
"learning_rate": 6e-05,
"loss": 1.7224,
"step": 300
},
{
"epoch": 0.1201923076923077,
"grad_norm": 1.4705523252487183,
"learning_rate": 8e-05,
"loss": 1.6887,
"step": 400
},
{
"epoch": 0.1502403846153846,
"grad_norm": 1.807113766670227,
"learning_rate": 0.0001,
"loss": 1.6883,
"step": 500
},
{
"epoch": 0.18028846153846154,
"grad_norm": 1.211004376411438,
"learning_rate": 9.99725705593595e-05,
"loss": 1.6908,
"step": 600
},
{
"epoch": 0.21033653846153846,
"grad_norm": 1.194908857345581,
"learning_rate": 9.989031233240653e-05,
"loss": 1.6865,
"step": 700
},
{
"epoch": 0.2403846153846154,
"grad_norm": 1.125349998474121,
"learning_rate": 9.975331557102723e-05,
"loss": 1.6796,
"step": 800
},
{
"epoch": 0.2704326923076923,
"grad_norm": 1.2960246801376343,
"learning_rate": 9.9561730585003e-05,
"loss": 1.6697,
"step": 900
},
{
"epoch": 0.3004807692307692,
"grad_norm": 1.1631627082824707,
"learning_rate": 9.931576757709384e-05,
"loss": 1.6675,
"step": 1000
},
{
"epoch": 0.33052884615384615,
"grad_norm": 1.0823206901550293,
"learning_rate": 9.901569641240883e-05,
"loss": 1.6532,
"step": 1100
},
{
"epoch": 0.3605769230769231,
"grad_norm": 1.1251226663589478,
"learning_rate": 9.866184632231592e-05,
"loss": 1.6625,
"step": 1200
},
{
"epoch": 0.390625,
"grad_norm": 1.27994704246521,
"learning_rate": 9.825460554321679e-05,
"loss": 1.6463,
"step": 1300
},
{
"epoch": 0.4206730769230769,
"grad_norm": 1.3841296434402466,
"learning_rate": 9.779442089058252e-05,
"loss": 1.6462,
"step": 1400
},
{
"epoch": 0.45072115384615385,
"grad_norm": 1.1315901279449463,
"learning_rate": 9.728179726871762e-05,
"loss": 1.6452,
"step": 1500
},
{
"epoch": 0.4807692307692308,
"grad_norm": 1.2269665002822876,
"learning_rate": 9.671729711679036e-05,
"loss": 1.6449,
"step": 1600
},
{
"epoch": 0.5108173076923077,
"grad_norm": 1.135764479637146,
"learning_rate": 9.610153979173711e-05,
"loss": 1.6367,
"step": 1700
},
{
"epoch": 0.5408653846153846,
"grad_norm": 1.2129323482513428,
"learning_rate": 9.543520088871773e-05,
"loss": 1.6271,
"step": 1800
},
{
"epoch": 0.5709134615384616,
"grad_norm": 1.2092684507369995,
"learning_rate": 9.471901149986767e-05,
"loss": 1.6353,
"step": 1900
},
{
"epoch": 0.6009615384615384,
"grad_norm": 1.2054263353347778,
"learning_rate": 9.39537574121601e-05,
"loss": 1.6386,
"step": 2000
},
{
"epoch": 0.6009615384615384,
"eval_loss": 1.627854585647583,
"eval_runtime": 242.6417,
"eval_samples_per_second": 73.141,
"eval_steps_per_second": 9.145,
"step": 2000
},
{
"epoch": 0.6310096153846154,
"grad_norm": 1.4155864715576172,
"learning_rate": 9.314027824525798e-05,
"loss": 1.6322,
"step": 2100
},
{
"epoch": 0.6610576923076923,
"grad_norm": 1.2875721454620361,
"learning_rate": 9.22794665303021e-05,
"loss": 1.6205,
"step": 2200
},
{
"epoch": 0.6911057692307693,
"grad_norm": 1.2036750316619873,
"learning_rate": 9.137226673064603e-05,
"loss": 1.6201,
"step": 2300
},
{
"epoch": 0.7211538461538461,
"grad_norm": 1.3741754293441772,
"learning_rate": 9.04196742056119e-05,
"loss": 1.6197,
"step": 2400
},
{
"epoch": 0.7512019230769231,
"grad_norm": 1.3001148700714111,
"learning_rate": 8.942273411840452e-05,
"loss": 1.6285,
"step": 2500
},
{
"epoch": 0.78125,
"grad_norm": 1.3223652839660645,
"learning_rate": 8.838254028938162e-05,
"loss": 1.6323,
"step": 2600
},
{
"epoch": 0.8112980769230769,
"grad_norm": 1.399418592453003,
"learning_rate": 8.730023399593876e-05,
"loss": 1.6184,
"step": 2700
},
{
"epoch": 0.8413461538461539,
"grad_norm": 1.2166377305984497,
"learning_rate": 8.617700272032516e-05,
"loss": 1.6165,
"step": 2800
},
{
"epoch": 0.8713942307692307,
"grad_norm": 1.208473563194275,
"learning_rate": 8.501407884676479e-05,
"loss": 1.616,
"step": 2900
},
{
"epoch": 0.9014423076923077,
"grad_norm": 1.2277066707611084,
"learning_rate": 8.381273830931207e-05,
"loss": 1.6122,
"step": 3000
},
{
"epoch": 0.9314903846153846,
"grad_norm": 1.204302191734314,
"learning_rate": 8.257429919192542e-05,
"loss": 1.6186,
"step": 3100
},
{
"epoch": 0.9615384615384616,
"grad_norm": 1.2529523372650146,
"learning_rate": 8.130012028229512e-05,
"loss": 1.6164,
"step": 3200
},
{
"epoch": 0.9915865384615384,
"grad_norm": 1.513980507850647,
"learning_rate": 7.999159958101186e-05,
"loss": 1.5971,
"step": 3300
},
{
"epoch": 1.0216346153846154,
"grad_norm": 1.420433521270752,
"learning_rate": 7.865017276771173e-05,
"loss": 1.4976,
"step": 3400
},
{
"epoch": 1.0516826923076923,
"grad_norm": 1.4616764783859253,
"learning_rate": 7.727731162588074e-05,
"loss": 1.4486,
"step": 3500
},
{
"epoch": 1.0817307692307692,
"grad_norm": 1.4156601428985596,
"learning_rate": 7.587452242804676e-05,
"loss": 1.4467,
"step": 3600
},
{
"epoch": 1.1117788461538463,
"grad_norm": 1.3352571725845337,
"learning_rate": 7.444334428313112e-05,
"loss": 1.4516,
"step": 3700
},
{
"epoch": 1.1418269230769231,
"grad_norm": 1.4259686470031738,
"learning_rate": 7.298534744777267e-05,
"loss": 1.4466,
"step": 3800
},
{
"epoch": 1.171875,
"grad_norm": 1.4755374193191528,
"learning_rate": 7.150213160347743e-05,
"loss": 1.446,
"step": 3900
},
{
"epoch": 1.2019230769230769,
"grad_norm": 1.4399892091751099,
"learning_rate": 6.999532410148371e-05,
"loss": 1.4331,
"step": 4000
},
{
"epoch": 1.2019230769230769,
"eval_loss": 1.6244958639144897,
"eval_runtime": 248.0586,
"eval_samples_per_second": 71.544,
"eval_steps_per_second": 8.945,
"step": 4000
},
{
"epoch": 1.2319711538461537,
"grad_norm": 1.629622220993042,
"learning_rate": 6.846657817726882e-05,
"loss": 1.4356,
"step": 4100
},
{
"epoch": 1.2620192307692308,
"grad_norm": 1.5240803956985474,
"learning_rate": 6.691757113665606e-05,
"loss": 1.4403,
"step": 4200
},
{
"epoch": 1.2920673076923077,
"grad_norm": 1.947218894958496,
"learning_rate": 6.535000251551231e-05,
"loss": 1.452,
"step": 4300
},
{
"epoch": 1.3221153846153846,
"grad_norm": 1.6493359804153442,
"learning_rate": 6.376559221505535e-05,
"loss": 1.4435,
"step": 4400
},
{
"epoch": 1.3521634615384617,
"grad_norm": 1.6366957426071167,
"learning_rate": 6.216607861481659e-05,
"loss": 1.4385,
"step": 4500
},
{
"epoch": 1.3822115384615383,
"grad_norm": 1.679699182510376,
"learning_rate": 6.055321666533013e-05,
"loss": 1.4509,
"step": 4600
},
{
"epoch": 1.4122596153846154,
"grad_norm": 1.5405994653701782,
"learning_rate": 5.8928775962640146e-05,
"loss": 1.4375,
"step": 4700
},
{
"epoch": 1.4423076923076923,
"grad_norm": 1.5734689235687256,
"learning_rate": 5.7294538806739775e-05,
"loss": 1.4315,
"step": 4800
},
{
"epoch": 1.4723557692307692,
"grad_norm": 1.6284011602401733,
"learning_rate": 5.565229824607143e-05,
"loss": 1.4457,
"step": 4900
},
{
"epoch": 1.5024038461538463,
"grad_norm": 1.5765283107757568,
"learning_rate": 5.400385611023416e-05,
"loss": 1.4374,
"step": 5000
},
{
"epoch": 1.5324519230769231,
"grad_norm": 1.7722498178482056,
"learning_rate": 5.235102103305654e-05,
"loss": 1.4513,
"step": 5100
},
{
"epoch": 1.5625,
"grad_norm": 1.6080434322357178,
"learning_rate": 5.0695606468204095e-05,
"loss": 1.4322,
"step": 5200
},
{
"epoch": 1.5925480769230769,
"grad_norm": 1.7113045454025269,
"learning_rate": 4.90394286994985e-05,
"loss": 1.4372,
"step": 5300
},
{
"epoch": 1.6225961538461537,
"grad_norm": 1.595045566558838,
"learning_rate": 4.738430484813162e-05,
"loss": 1.4391,
"step": 5400
},
{
"epoch": 1.6526442307692308,
"grad_norm": 1.5261282920837402,
"learning_rate": 4.5732050878960816e-05,
"loss": 1.4375,
"step": 5500
},
{
"epoch": 1.6826923076923077,
"grad_norm": 1.6319518089294434,
"learning_rate": 4.40844796080729e-05,
"loss": 1.4269,
"step": 5600
},
{
"epoch": 1.7127403846153846,
"grad_norm": 1.6978216171264648,
"learning_rate": 4.244339871380291e-05,
"loss": 1.4261,
"step": 5700
},
{
"epoch": 1.7427884615384617,
"grad_norm": 1.748810887336731,
"learning_rate": 4.0810608753389864e-05,
"loss": 1.4349,
"step": 5800
},
{
"epoch": 1.7728365384615383,
"grad_norm": 1.4340012073516846,
"learning_rate": 3.9187901187445675e-05,
"loss": 1.4349,
"step": 5900
},
{
"epoch": 1.8028846153846154,
"grad_norm": 1.7568167448043823,
"learning_rate": 3.757705641440461e-05,
"loss": 1.4318,
"step": 6000
},
{
"epoch": 1.8028846153846154,
"eval_loss": 1.613356113433838,
"eval_runtime": 246.395,
"eval_samples_per_second": 72.027,
"eval_steps_per_second": 9.006,
"step": 6000
},
{
"epoch": 1.8329326923076923,
"grad_norm": 1.7349011898040771,
"learning_rate": 3.5979841817110014e-05,
"loss": 1.4335,
"step": 6100
},
{
"epoch": 1.8629807692307692,
"grad_norm": 1.9418445825576782,
"learning_rate": 3.439800982368133e-05,
"loss": 1.4282,
"step": 6200
},
{
"epoch": 1.8930288461538463,
"grad_norm": 1.6687694787979126,
"learning_rate": 3.283329598478926e-05,
"loss": 1.4309,
"step": 6300
},
{
"epoch": 1.9230769230769231,
"grad_norm": 1.694411039352417,
"learning_rate": 3.128741706944832e-05,
"loss": 1.4178,
"step": 6400
},
{
"epoch": 1.953125,
"grad_norm": 1.5258992910385132,
"learning_rate": 2.976206918141635e-05,
"loss": 1.4322,
"step": 6500
},
{
"epoch": 1.9831730769230769,
"grad_norm": 1.8099026679992676,
"learning_rate": 2.8258925898267385e-05,
"loss": 1.416,
"step": 6600
},
{
"epoch": 2.0132211538461537,
"grad_norm": 2.1050777435302734,
"learning_rate": 2.6779636435179777e-05,
"loss": 1.3215,
"step": 6700
},
{
"epoch": 2.043269230769231,
"grad_norm": 2.225262403488159,
"learning_rate": 2.5325823835454278e-05,
"loss": 1.1716,
"step": 6800
},
{
"epoch": 2.0733173076923075,
"grad_norm": 2.9141433238983154,
"learning_rate": 2.3899083189747123e-05,
"loss": 1.1695,
"step": 6900
},
{
"epoch": 2.1033653846153846,
"grad_norm": 2.4766488075256348,
"learning_rate": 2.250097988597234e-05,
"loss": 1.1692,
"step": 7000
},
{
"epoch": 2.1334134615384617,
"grad_norm": 2.1980690956115723,
"learning_rate": 2.1133047891793174e-05,
"loss": 1.1755,
"step": 7100
},
{
"epoch": 2.1634615384615383,
"grad_norm": 2.393937587738037,
"learning_rate": 1.979678807158698e-05,
"loss": 1.1536,
"step": 7200
},
{
"epoch": 2.1935096153846154,
"grad_norm": 2.4240365028381348,
"learning_rate": 1.8493666539730515e-05,
"loss": 1.169,
"step": 7300
},
{
"epoch": 2.2235576923076925,
"grad_norm": 2.4492228031158447,
"learning_rate": 1.7225113052011964e-05,
"loss": 1.1532,
"step": 7400
},
{
"epoch": 2.253605769230769,
"grad_norm": 2.1071839332580566,
"learning_rate": 1.5992519436935022e-05,
"loss": 1.1595,
"step": 7500
},
{
"epoch": 2.2836538461538463,
"grad_norm": 2.5761914253234863,
"learning_rate": 1.4797238068635566e-05,
"loss": 1.1628,
"step": 7600
},
{
"epoch": 2.313701923076923,
"grad_norm": 2.590533971786499,
"learning_rate": 1.3640580383087232e-05,
"loss": 1.1634,
"step": 7700
},
{
"epoch": 2.34375,
"grad_norm": 2.3521673679351807,
"learning_rate": 1.252381543922313e-05,
"loss": 1.1545,
"step": 7800
},
{
"epoch": 2.373798076923077,
"grad_norm": 2.7083418369293213,
"learning_rate": 1.1448168526552727e-05,
"loss": 1.1542,
"step": 7900
},
{
"epoch": 2.4038461538461537,
"grad_norm": 2.427485227584839,
"learning_rate": 1.0414819820801663e-05,
"loss": 1.1633,
"step": 8000
},
{
"epoch": 2.4038461538461537,
"eval_loss": 1.730972409248352,
"eval_runtime": 246.4517,
"eval_samples_per_second": 72.01,
"eval_steps_per_second": 9.004,
"step": 8000
},
{
"epoch": 2.433894230769231,
"grad_norm": 2.347303628921509,
"learning_rate": 9.424903089049375e-06,
"loss": 1.1631,
"step": 8100
},
{
"epoch": 2.4639423076923075,
"grad_norm": 2.697683095932007,
"learning_rate": 8.479504445785158e-06,
"loss": 1.1569,
"step": 8200
},
{
"epoch": 2.4939903846153846,
"grad_norm": 2.401367425918579,
"learning_rate": 7.5796611612476916e-06,
"loss": 1.1482,
"step": 8300
},
{
"epoch": 2.5240384615384617,
"grad_norm": 2.610416889190674,
"learning_rate": 6.726360523355324e-06,
"loss": 1.1614,
"step": 8400
},
{
"epoch": 2.5540865384615383,
"grad_norm": 2.711500644683838,
"learning_rate": 5.920538754475901e-06,
"loss": 1.1555,
"step": 8500
},
{
"epoch": 2.5841346153846154,
"grad_norm": 2.241981029510498,
"learning_rate": 5.163079984224467e-06,
"loss": 1.1707,
"step": 8600
},
{
"epoch": 2.6141826923076925,
"grad_norm": 2.55198073387146,
"learning_rate": 4.454815279416058e-06,
"loss": 1.1605,
"step": 8700
},
{
"epoch": 2.644230769230769,
"grad_norm": 2.5612592697143555,
"learning_rate": 3.7965217322378287e-06,
"loss": 1.1629,
"step": 8800
},
{
"epoch": 2.6742788461538463,
"grad_norm": 2.2738687992095947,
"learning_rate": 3.188921607640816e-06,
"loss": 1.1419,
"step": 8900
},
{
"epoch": 2.7043269230769234,
"grad_norm": 2.611133575439453,
"learning_rate": 2.6326815508870616e-06,
"loss": 1.1618,
"step": 9000
},
{
"epoch": 2.734375,
"grad_norm": 3.1775176525115967,
"learning_rate": 2.1284118561212986e-06,
"loss": 1.1419,
"step": 9100
},
{
"epoch": 2.7644230769230766,
"grad_norm": 2.3005447387695312,
"learning_rate": 1.6766657967699163e-06,
"loss": 1.1591,
"step": 9200
},
{
"epoch": 2.7944711538461537,
"grad_norm": 2.749657154083252,
"learning_rate": 1.2779390185016838e-06,
"loss": 1.1565,
"step": 9300
},
{
"epoch": 2.824519230769231,
"grad_norm": 2.717308521270752,
"learning_rate": 9.326689954164636e-07,
"loss": 1.157,
"step": 9400
},
{
"epoch": 2.8545673076923075,
"grad_norm": 2.5376195907592773,
"learning_rate": 6.412345500583783e-07,
"loss": 1.1587,
"step": 9500
},
{
"epoch": 2.8846153846153846,
"grad_norm": 2.3610806465148926,
"learning_rate": 4.0395543778020686e-07,
"loss": 1.1494,
"step": 9600
},
{
"epoch": 2.9146634615384617,
"grad_norm": 2.4964439868927,
"learning_rate": 2.210919959149682e-07,
"loss": 1.1545,
"step": 9700
},
{
"epoch": 2.9447115384615383,
"grad_norm": 2.600806951522827,
"learning_rate": 9.284485813962906e-08,
"loss": 1.1585,
"step": 9800
},
{
"epoch": 2.9747596153846154,
"grad_norm": 2.45125412940979,
"learning_rate": 1.9354734344295687e-08,
"loss": 1.1554,
"step": 9900
}
],
"logging_steps": 100,
"max_steps": 9984,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.0448513983410995e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}