Marcus2112's picture
Upload folder using huggingface_hub
29ca256 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.119799451652303,
"eval_steps": 100,
"global_step": 1024,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010932788971549125,
"grad_norm": 167.07713317871094,
"learning_rate": 0.0005999985601583006,
"loss": 9.6698,
"step": 10
},
{
"epoch": 0.02186557794309825,
"grad_norm": 83.92709350585938,
"learning_rate": 0.0005998257958771109,
"loss": 8.2484,
"step": 20
},
{
"epoch": 0.032798366914647374,
"grad_norm": 127.91200256347656,
"learning_rate": 0.0005993652532642609,
"loss": 7.6452,
"step": 30
},
{
"epoch": 0.0437311558861965,
"grad_norm": 97.33670043945312,
"learning_rate": 0.0005986173743570491,
"loss": 7.4548,
"step": 40
},
{
"epoch": 0.054663944857745624,
"grad_norm": 127.0005874633789,
"learning_rate": 0.0005975828769834513,
"loss": 7.3226,
"step": 50
},
{
"epoch": 0.06559673382929475,
"grad_norm": 104.47633361816406,
"learning_rate": 0.0005962627540731365,
"loss": 7.204,
"step": 60
},
{
"epoch": 0.07652952280084388,
"grad_norm": 164.4477081298828,
"learning_rate": 0.0005946582727044349,
"loss": 7.1105,
"step": 70
},
{
"epoch": 0.087462311772393,
"grad_norm": 126.8350601196289,
"learning_rate": 0.0005927709728881719,
"loss": 7.0511,
"step": 80
},
{
"epoch": 0.09839510074394213,
"grad_norm": 158.55856323242188,
"learning_rate": 0.0005906026660895383,
"loss": 7.0642,
"step": 90
},
{
"epoch": 0.10932788971549125,
"grad_norm": 126.1555404663086,
"learning_rate": 0.0005881554334894116,
"loss": 7.031,
"step": 100
},
{
"epoch": 0.10932788971549125,
"eval_loss": 7.01555061340332,
"eval_runtime": 79.0984,
"eval_samples_per_second": 118.409,
"eval_steps_per_second": 14.804,
"step": 100
},
{
"epoch": 0.12026067868704038,
"grad_norm": 108.58393096923828,
"learning_rate": 0.0005854316239868012,
"loss": 7.0123,
"step": 110
},
{
"epoch": 0.1311934676585895,
"grad_norm": 178.0326690673828,
"learning_rate": 0.0005824338519443309,
"loss": 6.9897,
"step": 120
},
{
"epoch": 0.14212625663013861,
"grad_norm": 192.8655242919922,
"learning_rate": 0.0005791649946789259,
"loss": 7.0117,
"step": 130
},
{
"epoch": 0.15305904560168776,
"grad_norm": 143.3759002685547,
"learning_rate": 0.0005756281897001107,
"loss": 7.0073,
"step": 140
},
{
"epoch": 0.16399183457323688,
"grad_norm": 171.0679168701172,
"learning_rate": 0.0005718268316985698,
"loss": 6.9843,
"step": 150
},
{
"epoch": 0.174924623544786,
"grad_norm": 164.86534118652344,
"learning_rate": 0.0005677645692878606,
"loss": 7.0083,
"step": 160
},
{
"epoch": 0.1858574125163351,
"grad_norm": 125.85225677490234,
"learning_rate": 0.000563445301502407,
"loss": 7.02,
"step": 170
},
{
"epoch": 0.19679020148788426,
"grad_norm": 144.15589904785156,
"learning_rate": 0.0005588731740551344,
"loss": 6.9773,
"step": 180
},
{
"epoch": 0.20772299045943338,
"grad_norm": 108.05564880371094,
"learning_rate": 0.0005540525753583378,
"loss": 6.9632,
"step": 190
},
{
"epoch": 0.2186557794309825,
"grad_norm": 146.53924560546875,
"learning_rate": 0.0005489881323116018,
"loss": 6.929,
"step": 200
},
{
"epoch": 0.2186557794309825,
"eval_loss": 6.925621509552002,
"eval_runtime": 78.9467,
"eval_samples_per_second": 118.637,
"eval_steps_per_second": 14.833,
"step": 200
},
{
"epoch": 0.2295885684025316,
"grad_norm": 204.57968139648438,
"learning_rate": 0.0005436847058608189,
"loss": 6.9631,
"step": 210
},
{
"epoch": 0.24052135737408076,
"grad_norm": 171.31556701660156,
"learning_rate": 0.0005381473863325621,
"loss": 7.0389,
"step": 220
},
{
"epoch": 0.25145414634562985,
"grad_norm": 142.57449340820312,
"learning_rate": 0.0005323814885482963,
"loss": 6.967,
"step": 230
},
{
"epoch": 0.262386935317179,
"grad_norm": 119.19646453857422,
"learning_rate": 0.000526392546723115,
"loss": 6.9456,
"step": 240
},
{
"epoch": 0.27331972428872814,
"grad_norm": 153.62359619140625,
"learning_rate": 0.0005201863091538979,
"loss": 6.9686,
"step": 250
},
{
"epoch": 0.28425251326027723,
"grad_norm": 150.35699462890625,
"learning_rate": 0.000513768732701989,
"loss": 6.9846,
"step": 260
},
{
"epoch": 0.2951853022318264,
"grad_norm": 215.55368041992188,
"learning_rate": 0.0005071459770756929,
"loss": 6.9968,
"step": 270
},
{
"epoch": 0.3061180912033755,
"grad_norm": 107.55154418945312,
"learning_rate": 0.0005003243989180711,
"loss": 7.0033,
"step": 280
},
{
"epoch": 0.3170508801749246,
"grad_norm": 190.4154052734375,
"learning_rate": 0.0004933105457057203,
"loss": 6.9816,
"step": 290
},
{
"epoch": 0.32798366914647376,
"grad_norm": 159.7703094482422,
"learning_rate": 0.0004861111494643821,
"loss": 7.0486,
"step": 300
},
{
"epoch": 0.32798366914647376,
"eval_loss": 7.4869384765625,
"eval_runtime": 79.1717,
"eval_samples_per_second": 118.3,
"eval_steps_per_second": 14.791,
"step": 300
},
{
"epoch": 0.3389164581180229,
"grad_norm": 218.22604370117188,
"learning_rate": 0.0004794787611927562,
"loss": 7.2679,
"step": 310
},
{
"epoch": 0.349849247089572,
"grad_norm": 182.51431274414062,
"learning_rate": 0.0004719460124060748,
"loss": 7.1809,
"step": 320
},
{
"epoch": 0.36078203606112114,
"grad_norm": 137.0953826904297,
"learning_rate": 0.0004642482266637136,
"loss": 7.0417,
"step": 330
},
{
"epoch": 0.3717148250326702,
"grad_norm": 92.07840728759766,
"learning_rate": 0.0004563927924424775,
"loss": 6.9309,
"step": 340
},
{
"epoch": 0.38264761400421937,
"grad_norm": 147.35975646972656,
"learning_rate": 0.00044838724953309093,
"loss": 6.8844,
"step": 350
},
{
"epoch": 0.3935804029757685,
"grad_norm": 262.996337890625,
"learning_rate": 0.0004402392818033671,
"loss": 6.966,
"step": 360
},
{
"epoch": 0.4045131919473176,
"grad_norm": 155.3452606201172,
"learning_rate": 0.00043195670982308984,
"loss": 7.0715,
"step": 370
},
{
"epoch": 0.41544598091886675,
"grad_norm": 129.5069580078125,
"learning_rate": 0.00042354748335768664,
"loss": 7.0806,
"step": 380
},
{
"epoch": 0.4263787698904159,
"grad_norm": 92.96502685546875,
"learning_rate": 0.0004150196737378971,
"loss": 6.9999,
"step": 390
},
{
"epoch": 0.437311558861965,
"grad_norm": 120.41193389892578,
"learning_rate": 0.0004063814661127606,
"loss": 6.9339,
"step": 400
},
{
"epoch": 0.437311558861965,
"eval_loss": 6.931961536407471,
"eval_runtime": 78.8373,
"eval_samples_per_second": 118.802,
"eval_steps_per_second": 14.853,
"step": 400
},
{
"epoch": 0.44824434783351413,
"grad_norm": 188.7049560546875,
"learning_rate": 0.00039764115159335935,
"loss": 6.9242,
"step": 410
},
{
"epoch": 0.4591771368050632,
"grad_norm": 131.7518768310547,
"learning_rate": 0.0003888071192948565,
"loss": 6.9815,
"step": 420
},
{
"epoch": 0.47010992577661237,
"grad_norm": 247.91549682617188,
"learning_rate": 0.0003798878482844695,
"loss": 7.0838,
"step": 430
},
{
"epoch": 0.4810427147481615,
"grad_norm": 135.4517364501953,
"learning_rate": 0.000370891899443104,
"loss": 7.1813,
"step": 440
},
{
"epoch": 0.4919755037197106,
"grad_norm": 99.5172119140625,
"learning_rate": 0.00036182790724846315,
"loss": 7.1557,
"step": 450
},
{
"epoch": 0.5029082926912597,
"grad_norm": 165.1914825439453,
"learning_rate": 0.00035270457148751575,
"loss": 7.0382,
"step": 460
},
{
"epoch": 0.5138410816628088,
"grad_norm": 128.59959411621094,
"learning_rate": 0.00034353064890628107,
"loss": 7.0597,
"step": 470
},
{
"epoch": 0.524773870634358,
"grad_norm": 142.37147521972656,
"learning_rate": 0.00033431494480494175,
"loss": 7.092,
"step": 480
},
{
"epoch": 0.5357066596059071,
"grad_norm": 217.4059295654297,
"learning_rate": 0.0003250663045863544,
"loss": 7.0457,
"step": 490
},
{
"epoch": 0.5466394485774563,
"grad_norm": 125.81988525390625,
"learning_rate": 0.0003157936052660688,
"loss": 7.0112,
"step": 500
},
{
"epoch": 0.5466394485774563,
"eval_loss": 7.004736423492432,
"eval_runtime": 78.8041,
"eval_samples_per_second": 118.852,
"eval_steps_per_second": 14.86,
"step": 500
},
{
"epoch": 0.5575722375490054,
"grad_norm": 170.00523376464844,
"learning_rate": 0.0003065057469520046,
"loss": 7.0162,
"step": 510
},
{
"epoch": 0.5685050265205545,
"grad_norm": 216.81466674804688,
"learning_rate": 0.0002972116443019633,
"loss": 7.0584,
"step": 520
},
{
"epoch": 0.5794378154921036,
"grad_norm": 239.21087646484375,
"learning_rate": 0.0002879202179671755,
"loss": 7.1254,
"step": 530
},
{
"epoch": 0.5903706044636527,
"grad_norm": 190.0070343017578,
"learning_rate": 0.00027864038603009453,
"loss": 7.1717,
"step": 540
},
{
"epoch": 0.6013033934352019,
"grad_norm": 179.18785095214844,
"learning_rate": 0.00026938105544465745,
"loss": 7.1185,
"step": 550
},
{
"epoch": 0.612236182406751,
"grad_norm": 279.44781494140625,
"learning_rate": 0.0002601511134872255,
"loss": 7.0727,
"step": 560
},
{
"epoch": 0.6231689713783001,
"grad_norm": 227.90072631835938,
"learning_rate": 0.0002509594192264121,
"loss": 7.1088,
"step": 570
},
{
"epoch": 0.6341017603498492,
"grad_norm": 173.11819458007812,
"learning_rate": 0.0002418147950199862,
"loss": 7.0927,
"step": 580
},
{
"epoch": 0.6450345493213984,
"grad_norm": 164.40736389160156,
"learning_rate": 0.00023272601804700946,
"loss": 7.0701,
"step": 590
},
{
"epoch": 0.6559673382929475,
"grad_norm": 123.35533142089844,
"learning_rate": 0.0002237018118833387,
"loss": 7.0496,
"step": 600
},
{
"epoch": 0.6559673382929475,
"eval_loss": 7.052866458892822,
"eval_runtime": 78.8887,
"eval_samples_per_second": 118.724,
"eval_steps_per_second": 14.844,
"step": 600
},
{
"epoch": 0.6669001272644967,
"grad_norm": 225.67015075683594,
"learning_rate": 0.0002147508381285762,
"loss": 7.04,
"step": 610
},
{
"epoch": 0.6778329162360458,
"grad_norm": 140.2364501953125,
"learning_rate": 0.00020588168809250687,
"loss": 7.0902,
"step": 620
},
{
"epoch": 0.6887657052075948,
"grad_norm": 262.8550720214844,
"learning_rate": 0.00019710287454900033,
"loss": 7.1224,
"step": 630
},
{
"epoch": 0.699698494179144,
"grad_norm": 150.97813415527344,
"learning_rate": 0.00018842282356529402,
"loss": 7.1802,
"step": 640
},
{
"epoch": 0.7106312831506931,
"grad_norm": 452.73431396484375,
"learning_rate": 0.00017984986641449754,
"loss": 7.1497,
"step": 650
},
{
"epoch": 0.7215640721222423,
"grad_norm": 138.37220764160156,
"learning_rate": 0.00017139223157908368,
"loss": 7.1715,
"step": 660
},
{
"epoch": 0.7324968610937914,
"grad_norm": 144.21133422851562,
"learning_rate": 0.00016305803685303906,
"loss": 7.1458,
"step": 670
},
{
"epoch": 0.7434296500653405,
"grad_norm": 142.4859161376953,
"learning_rate": 0.00015485528155025473,
"loss": 7.1041,
"step": 680
},
{
"epoch": 0.7543624390368896,
"grad_norm": 190.189208984375,
"learning_rate": 0.00014679183882663872,
"loss": 7.0798,
"step": 690
},
{
"epoch": 0.7652952280084387,
"grad_norm": 160.14442443847656,
"learning_rate": 0.0001388754481233139,
"loss": 7.074,
"step": 700
},
{
"epoch": 0.7652952280084387,
"eval_loss": 7.0790934562683105,
"eval_runtime": 79.0053,
"eval_samples_per_second": 118.549,
"eval_steps_per_second": 14.822,
"step": 700
},
{
"epoch": 0.7762280169799879,
"grad_norm": 173.01499938964844,
"learning_rate": 0.0001311137077381614,
"loss": 7.0821,
"step": 710
},
{
"epoch": 0.787160805951537,
"grad_norm": 156.1138458251953,
"learning_rate": 0.00012351406753283216,
"loss": 7.0838,
"step": 720
},
{
"epoch": 0.7980935949230861,
"grad_norm": 161.9981689453125,
"learning_rate": 0.00011681901904809884,
"loss": 7.0639,
"step": 730
},
{
"epoch": 0.8090263838946352,
"grad_norm": 174.0237579345703,
"learning_rate": 0.00010954733067505213,
"loss": 7.0604,
"step": 740
},
{
"epoch": 0.8199591728661844,
"grad_norm": 141.823974609375,
"learning_rate": 0.0001024584422885053,
"loss": 7.0508,
"step": 750
},
{
"epoch": 0.8308919618377335,
"grad_norm": 121.39106750488281,
"learning_rate": 9.555915793434476e-05,
"loss": 7.0568,
"step": 760
},
{
"epoch": 0.8418247508092827,
"grad_norm": 178.37924194335938,
"learning_rate": 8.885609967300851e-05,
"loss": 7.0589,
"step": 770
},
{
"epoch": 0.8527575397808318,
"grad_norm": 304.8969421386719,
"learning_rate": 8.235570122350937e-05,
"loss": 7.0582,
"step": 780
},
{
"epoch": 0.8636903287523808,
"grad_norm": 128.75843811035156,
"learning_rate": 7.606420178823293e-05,
"loss": 7.0622,
"step": 790
},
{
"epoch": 0.87462311772393,
"grad_norm": 88.88775634765625,
"learning_rate": 6.998764006443615e-05,
"loss": 7.0664,
"step": 800
},
{
"epoch": 0.87462311772393,
"eval_loss": 7.048069477081299,
"eval_runtime": 78.7086,
"eval_samples_per_second": 118.996,
"eval_steps_per_second": 14.878,
"step": 800
},
{
"epoch": 0.8855559066954791,
"grad_norm": 131.33584594726562,
"learning_rate": 6.413184844819423e-05,
"loss": 7.0381,
"step": 810
},
{
"epoch": 0.8964886956670283,
"grad_norm": 176.8515625,
"learning_rate": 6e-05,
"loss": 7.0461,
"step": 820
},
{
"epoch": 0.9074214846385774,
"grad_norm": 128.32069396972656,
"learning_rate": 6e-05,
"loss": 7.0597,
"step": 830
},
{
"epoch": 0.9183542736101264,
"grad_norm": 150.107421875,
"learning_rate": 6e-05,
"loss": 7.0582,
"step": 840
},
{
"epoch": 0.9292870625816756,
"grad_norm": 174.95352172851562,
"learning_rate": 6e-05,
"loss": 7.0729,
"step": 850
},
{
"epoch": 0.9402198515532247,
"grad_norm": 209.878173828125,
"learning_rate": 6e-05,
"loss": 7.0949,
"step": 860
},
{
"epoch": 0.9511526405247739,
"grad_norm": 181.1326904296875,
"learning_rate": 6e-05,
"loss": 7.109,
"step": 870
},
{
"epoch": 0.962085429496323,
"grad_norm": 197.11639404296875,
"learning_rate": 6e-05,
"loss": 7.1132,
"step": 880
},
{
"epoch": 0.9730182184678722,
"grad_norm": 197.16473388671875,
"learning_rate": 6e-05,
"loss": 7.1008,
"step": 890
},
{
"epoch": 0.9839510074394212,
"grad_norm": 224.1211395263672,
"learning_rate": 6e-05,
"loss": 7.1024,
"step": 900
},
{
"epoch": 0.9839510074394212,
"eval_loss": 7.119234561920166,
"eval_runtime": 78.4745,
"eval_samples_per_second": 119.351,
"eval_steps_per_second": 14.922,
"step": 900
},
{
"epoch": 0.9948837964109704,
"grad_norm": 161.86753845214844,
"learning_rate": 6e-05,
"loss": 7.1127,
"step": 910
},
{
"epoch": 1.0060984463481923,
"grad_norm": 247.6467742919922,
"learning_rate": 6e-05,
"loss": 7.1115,
"step": 920
},
{
"epoch": 1.0170312353197413,
"grad_norm": 228.1467742919922,
"learning_rate": 6e-05,
"loss": 7.1172,
"step": 930
},
{
"epoch": 1.0279640242912904,
"grad_norm": 400.675537109375,
"learning_rate": 6e-05,
"loss": 7.1351,
"step": 940
},
{
"epoch": 1.0388968132628396,
"grad_norm": 293.3075866699219,
"learning_rate": 6e-05,
"loss": 7.1747,
"step": 950
},
{
"epoch": 1.0498296022343887,
"grad_norm": 439.60760498046875,
"learning_rate": 6e-05,
"loss": 7.1955,
"step": 960
},
{
"epoch": 1.0607623912059378,
"grad_norm": 336.15521240234375,
"learning_rate": 6e-05,
"loss": 7.2134,
"step": 970
},
{
"epoch": 1.071695180177487,
"grad_norm": 232.90606689453125,
"learning_rate": 6e-05,
"loss": 7.2589,
"step": 980
},
{
"epoch": 1.0826279691490361,
"grad_norm": 453.7010803222656,
"learning_rate": 6e-05,
"loss": 7.2537,
"step": 990
},
{
"epoch": 1.0935607581205853,
"grad_norm": 156.7413330078125,
"learning_rate": 6e-05,
"loss": 7.2678,
"step": 1000
},
{
"epoch": 1.0935607581205853,
"eval_loss": 7.271553993225098,
"eval_runtime": 78.5452,
"eval_samples_per_second": 119.243,
"eval_steps_per_second": 14.909,
"step": 1000
},
{
"epoch": 1.1044935470921344,
"grad_norm": 225.9600067138672,
"learning_rate": 6e-05,
"loss": 7.2489,
"step": 1010
},
{
"epoch": 1.1154263360636836,
"grad_norm": 258.6958312988281,
"learning_rate": 6e-05,
"loss": 7.2224,
"step": 1020
}
],
"logging_steps": 10,
"max_steps": 1024,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1024,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.383804151351214e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}