{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 200,
  "global_step": 292,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00684931506849315,
      "grad_norm": 0.030539813126927277,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.1076,
      "step": 1
    },
    {
      "epoch": 0.0136986301369863,
      "grad_norm": 0.03492658058904307,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.1246,
      "step": 2
    },
    {
      "epoch": 0.02054794520547945,
      "grad_norm": 0.04865725802068095,
      "learning_rate": 1e-05,
      "loss": 0.1586,
      "step": 3
    },
    {
      "epoch": 0.0273972602739726,
      "grad_norm": 0.05113567210987629,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.1514,
      "step": 4
    },
    {
      "epoch": 0.03424657534246575,
      "grad_norm": 0.04332482337715696,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.1334,
      "step": 5
    },
    {
      "epoch": 0.0410958904109589,
      "grad_norm": 0.056501517819200174,
      "learning_rate": 2e-05,
      "loss": 0.162,
      "step": 6
    },
    {
      "epoch": 0.04794520547945205,
      "grad_norm": 0.02608028374746435,
      "learning_rate": 2.3333333333333336e-05,
      "loss": 0.1031,
      "step": 7
    },
    {
      "epoch": 0.0547945205479452,
      "grad_norm": 0.03653546691009632,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 0.1047,
      "step": 8
    },
    {
      "epoch": 0.06164383561643835,
      "grad_norm": 0.036654425448601695,
      "learning_rate": 3e-05,
      "loss": 0.1063,
      "step": 9
    },
    {
      "epoch": 0.0684931506849315,
      "grad_norm": 0.07120435117331021,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.1899,
      "step": 10
    },
    {
      "epoch": 0.07534246575342465,
      "grad_norm": 0.052020978283898525,
      "learning_rate": 3.6666666666666666e-05,
      "loss": 0.1369,
      "step": 11
    },
    {
      "epoch": 0.0821917808219178,
      "grad_norm": 0.05373108737583589,
      "learning_rate": 4e-05,
      "loss": 0.1232,
      "step": 12
    },
    {
      "epoch": 0.08904109589041095,
      "grad_norm": 0.04869530777109541,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 0.1137,
      "step": 13
    },
    {
      "epoch": 0.0958904109589041,
      "grad_norm": 0.05909792067471564,
      "learning_rate": 4.666666666666667e-05,
      "loss": 0.1211,
      "step": 14
    },
    {
      "epoch": 0.10273972602739725,
      "grad_norm": 0.07782073062487518,
      "learning_rate": 5e-05,
      "loss": 0.1419,
      "step": 15
    },
    {
      "epoch": 0.1095890410958904,
      "grad_norm": 0.10068121599027634,
      "learning_rate": 5.333333333333333e-05,
      "loss": 0.1573,
      "step": 16
    },
    {
      "epoch": 0.11643835616438356,
      "grad_norm": 0.0794018071448985,
      "learning_rate": 5.666666666666667e-05,
      "loss": 0.1368,
      "step": 17
    },
    {
      "epoch": 0.1232876712328767,
      "grad_norm": 0.03925546794880312,
      "learning_rate": 6e-05,
      "loss": 0.0936,
      "step": 18
    },
    {
      "epoch": 0.13013698630136986,
      "grad_norm": 0.09398334893562084,
      "learning_rate": 6.333333333333333e-05,
      "loss": 0.1405,
      "step": 19
    },
    {
      "epoch": 0.136986301369863,
      "grad_norm": 0.03366472291998534,
      "learning_rate": 6.666666666666667e-05,
      "loss": 0.0939,
      "step": 20
    },
    {
      "epoch": 0.14383561643835616,
      "grad_norm": 0.03002583884373123,
      "learning_rate": 7e-05,
      "loss": 0.0857,
      "step": 21
    },
    {
      "epoch": 0.1506849315068493,
      "grad_norm": 0.04526244411827793,
      "learning_rate": 7.333333333333333e-05,
      "loss": 0.0932,
      "step": 22
    },
    {
      "epoch": 0.15753424657534246,
      "grad_norm": 0.0271467232481723,
      "learning_rate": 7.666666666666667e-05,
      "loss": 0.0926,
      "step": 23
    },
    {
      "epoch": 0.1643835616438356,
      "grad_norm": 0.036006533907184064,
      "learning_rate": 8e-05,
      "loss": 0.1092,
      "step": 24
    },
    {
      "epoch": 0.17123287671232876,
      "grad_norm": 0.03819263334586657,
      "learning_rate": 8.333333333333334e-05,
      "loss": 0.0967,
      "step": 25
    },
    {
      "epoch": 0.1780821917808219,
      "grad_norm": 0.04732574961540268,
      "learning_rate": 8.666666666666667e-05,
      "loss": 0.0933,
      "step": 26
    },
    {
      "epoch": 0.18493150684931506,
      "grad_norm": 0.04933244373512878,
      "learning_rate": 9e-05,
      "loss": 0.0964,
      "step": 27
    },
    {
      "epoch": 0.1917808219178082,
      "grad_norm": 0.05164570324383495,
      "learning_rate": 9.333333333333334e-05,
      "loss": 0.0893,
      "step": 28
    },
    {
      "epoch": 0.19863013698630136,
      "grad_norm": 0.06506455363157941,
      "learning_rate": 9.666666666666667e-05,
      "loss": 0.1099,
      "step": 29
    },
    {
      "epoch": 0.2054794520547945,
      "grad_norm": 0.05004738114863798,
      "learning_rate": 0.0001,
      "loss": 0.093,
      "step": 30
    },
    {
      "epoch": 0.21232876712328766,
      "grad_norm": 0.058372907031022096,
      "learning_rate": 9.999640555396404e-05,
      "loss": 0.0859,
      "step": 31
    },
    {
      "epoch": 0.2191780821917808,
      "grad_norm": 0.04214183460112646,
      "learning_rate": 9.998562273265785e-05,
      "loss": 0.0757,
      "step": 32
    },
    {
      "epoch": 0.22602739726027396,
      "grad_norm": 0.045590856190952045,
      "learning_rate": 9.996765308641218e-05,
      "loss": 0.0909,
      "step": 33
    },
    {
      "epoch": 0.2328767123287671,
      "grad_norm": 0.03615919142026033,
      "learning_rate": 9.994249919886402e-05,
      "loss": 0.0773,
      "step": 34
    },
    {
      "epoch": 0.23972602739726026,
      "grad_norm": 0.053734428778710715,
      "learning_rate": 9.991016468658499e-05,
      "loss": 0.0804,
      "step": 35
    },
    {
      "epoch": 0.2465753424657534,
      "grad_norm": 0.03350548900671173,
      "learning_rate": 9.98706541985615e-05,
      "loss": 0.0949,
      "step": 36
    },
    {
      "epoch": 0.2534246575342466,
      "grad_norm": 0.027043847680669336,
      "learning_rate": 9.98239734155262e-05,
      "loss": 0.0689,
      "step": 37
    },
    {
      "epoch": 0.2602739726027397,
      "grad_norm": 0.03089668501975604,
      "learning_rate": 9.977012904914133e-05,
      "loss": 0.0896,
      "step": 38
    },
    {
      "epoch": 0.2671232876712329,
      "grad_norm": 0.029975461306757812,
      "learning_rate": 9.970912884103364e-05,
      "loss": 0.0818,
      "step": 39
    },
    {
      "epoch": 0.273972602739726,
      "grad_norm": 0.029112303333024562,
      "learning_rate": 9.964098156168142e-05,
      "loss": 0.0764,
      "step": 40
    },
    {
      "epoch": 0.2808219178082192,
      "grad_norm": 0.03609528093820637,
      "learning_rate": 9.956569700915337e-05,
      "loss": 0.0896,
      "step": 41
    },
    {
      "epoch": 0.2876712328767123,
      "grad_norm": 0.02923285599907434,
      "learning_rate": 9.948328600769995e-05,
      "loss": 0.0687,
      "step": 42
    },
    {
      "epoch": 0.2945205479452055,
      "grad_norm": 0.03722889238353647,
      "learning_rate": 9.939376040619705e-05,
      "loss": 0.0904,
      "step": 43
    },
    {
      "epoch": 0.3013698630136986,
      "grad_norm": 0.032059757320644165,
      "learning_rate": 9.929713307644244e-05,
      "loss": 0.0834,
      "step": 44
    },
    {
      "epoch": 0.3082191780821918,
      "grad_norm": 0.04817854262074185,
      "learning_rate": 9.919341791130496e-05,
      "loss": 0.0913,
      "step": 45
    },
    {
      "epoch": 0.3150684931506849,
      "grad_norm": 0.027208193309115385,
      "learning_rate": 9.908262982272724e-05,
      "loss": 0.0783,
      "step": 46
    },
    {
      "epoch": 0.3219178082191781,
      "grad_norm": 0.03396529679519977,
      "learning_rate": 9.896478473958146e-05,
      "loss": 0.0788,
      "step": 47
    },
    {
      "epoch": 0.3287671232876712,
      "grad_norm": 0.03703807434511364,
      "learning_rate": 9.883989960537933e-05,
      "loss": 0.0737,
      "step": 48
    },
    {
      "epoch": 0.3356164383561644,
      "grad_norm": 0.037750701903354035,
      "learning_rate": 9.870799237583587e-05,
      "loss": 0.0879,
      "step": 49
    },
    {
      "epoch": 0.3424657534246575,
      "grad_norm": 0.0421005656051737,
      "learning_rate": 9.85690820162878e-05,
      "loss": 0.0762,
      "step": 50
    },
    {
      "epoch": 0.3493150684931507,
      "grad_norm": 0.02733701424368454,
      "learning_rate": 9.842318849896679e-05,
      "loss": 0.0716,
      "step": 51
    },
    {
      "epoch": 0.3561643835616438,
      "grad_norm": 0.027664399030788704,
      "learning_rate": 9.827033280012783e-05,
      "loss": 0.0783,
      "step": 52
    },
    {
      "epoch": 0.363013698630137,
      "grad_norm": 0.030867201947854212,
      "learning_rate": 9.811053689703334e-05,
      "loss": 0.0895,
      "step": 53
    },
    {
      "epoch": 0.3698630136986301,
      "grad_norm": 0.02445456343322928,
      "learning_rate": 9.794382376479334e-05,
      "loss": 0.0669,
      "step": 54
    },
    {
      "epoch": 0.3767123287671233,
      "grad_norm": 0.0355725180623145,
      "learning_rate": 9.777021737306214e-05,
      "loss": 0.0758,
      "step": 55
    },
    {
      "epoch": 0.3835616438356164,
      "grad_norm": 0.02787477968018966,
      "learning_rate": 9.7589742682592e-05,
      "loss": 0.0671,
      "step": 56
    },
    {
      "epoch": 0.3904109589041096,
      "grad_norm": 0.02917250086551862,
      "learning_rate": 9.740242564164434e-05,
      "loss": 0.0777,
      "step": 57
    },
    {
      "epoch": 0.3972602739726027,
      "grad_norm": 0.027367394927186293,
      "learning_rate": 9.720829318225897e-05,
      "loss": 0.0784,
      "step": 58
    },
    {
      "epoch": 0.4041095890410959,
      "grad_norm": 0.03404049628582292,
      "learning_rate": 9.700737321638185e-05,
      "loss": 0.0831,
      "step": 59
    },
    {
      "epoch": 0.410958904109589,
      "grad_norm": 0.03007296844683444,
      "learning_rate": 9.6799694631852e-05,
      "loss": 0.0666,
      "step": 60
    },
    {
      "epoch": 0.4178082191780822,
      "grad_norm": 0.02540452547965842,
      "learning_rate": 9.6585287288248e-05,
      "loss": 0.0661,
      "step": 61
    },
    {
      "epoch": 0.4246575342465753,
      "grad_norm": 0.026781782354160222,
      "learning_rate": 9.63641820125949e-05,
      "loss": 0.074,
      "step": 62
    },
    {
      "epoch": 0.4315068493150685,
      "grad_norm": 0.026783892217477907,
      "learning_rate": 9.613641059493197e-05,
      "loss": 0.066,
      "step": 63
    },
    {
      "epoch": 0.4383561643835616,
      "grad_norm": 0.03124681883629973,
      "learning_rate": 9.590200578374198e-05,
      "loss": 0.0704,
      "step": 64
    },
    {
      "epoch": 0.4452054794520548,
      "grad_norm": 0.03417198666061255,
      "learning_rate": 9.56610012812427e-05,
      "loss": 0.0809,
      "step": 65
    },
    {
      "epoch": 0.4520547945205479,
      "grad_norm": 0.03459134201897623,
      "learning_rate": 9.541343173854127e-05,
      "loss": 0.0775,
      "step": 66
    },
    {
      "epoch": 0.4589041095890411,
      "grad_norm": 0.028846468217507328,
      "learning_rate": 9.515933275065219e-05,
      "loss": 0.0683,
      "step": 67
    },
    {
      "epoch": 0.4657534246575342,
      "grad_norm": 0.02841488216076331,
      "learning_rate": 9.48987408513794e-05,
      "loss": 0.0682,
      "step": 68
    },
    {
      "epoch": 0.4726027397260274,
      "grad_norm": 0.038470702709475786,
      "learning_rate": 9.463169350806369e-05,
      "loss": 0.1041,
      "step": 69
    },
    {
      "epoch": 0.4794520547945205,
      "grad_norm": 0.05495636003850682,
      "learning_rate": 9.435822911619564e-05,
      "loss": 0.097,
      "step": 70
    },
    {
      "epoch": 0.4863013698630137,
      "grad_norm": 0.03142706228563173,
      "learning_rate": 9.407838699389524e-05,
      "loss": 0.0844,
      "step": 71
    },
    {
      "epoch": 0.4931506849315068,
      "grad_norm": 0.02769595486440407,
      "learning_rate": 9.379220737625877e-05,
      "loss": 0.0648,
      "step": 72
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.029889370715744733,
      "learning_rate": 9.34997314095739e-05,
      "loss": 0.0668,
      "step": 73
    },
    {
      "epoch": 0.5068493150684932,
      "grad_norm": 0.036383935533411724,
      "learning_rate": 9.320100114540382e-05,
      "loss": 0.0676,
      "step": 74
    },
    {
      "epoch": 0.5136986301369864,
      "grad_norm": 0.05144060226911856,
      "learning_rate": 9.289605953454107e-05,
      "loss": 0.0834,
      "step": 75
    },
    {
      "epoch": 0.5205479452054794,
      "grad_norm": 0.029231837833789662,
      "learning_rate": 9.258495042083221e-05,
      "loss": 0.067,
      "step": 76
    },
    {
      "epoch": 0.5273972602739726,
      "grad_norm": 0.030385743404608083,
      "learning_rate": 9.22677185348741e-05,
      "loss": 0.0809,
      "step": 77
    },
    {
      "epoch": 0.5342465753424658,
      "grad_norm": 0.031195055928310497,
      "learning_rate": 9.19444094875825e-05,
      "loss": 0.0735,
      "step": 78
    },
    {
      "epoch": 0.541095890410959,
      "grad_norm": 0.029330760873632865,
      "learning_rate": 9.161506976363437e-05,
      "loss": 0.0736,
      "step": 79
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 0.0332541385460945,
      "learning_rate": 9.127974671478432e-05,
      "loss": 0.0812,
      "step": 80
    },
    {
      "epoch": 0.5547945205479452,
      "grad_norm": 0.03305866284735081,
      "learning_rate": 9.093848855305649e-05,
      "loss": 0.0822,
      "step": 81
    },
    {
      "epoch": 0.5616438356164384,
      "grad_norm": 0.03432894975247522,
      "learning_rate": 9.059134434381273e-05,
      "loss": 0.0783,
      "step": 82
    },
    {
      "epoch": 0.5684931506849316,
      "grad_norm": 0.039182983364365015,
      "learning_rate": 9.023836399869814e-05,
      "loss": 0.0815,
      "step": 83
    },
    {
      "epoch": 0.5753424657534246,
      "grad_norm": 0.02912164254908111,
      "learning_rate": 8.98795982684648e-05,
      "loss": 0.0715,
      "step": 84
    },
    {
      "epoch": 0.5821917808219178,
      "grad_norm": 0.029959446654787666,
      "learning_rate": 8.951509873567499e-05,
      "loss": 0.0754,
      "step": 85
    },
    {
      "epoch": 0.589041095890411,
      "grad_norm": 0.02955747281592669,
      "learning_rate": 8.914491780728471e-05,
      "loss": 0.0683,
      "step": 86
    },
    {
      "epoch": 0.5958904109589042,
      "grad_norm": 0.03207481946898706,
      "learning_rate": 8.876910870710884e-05,
      "loss": 0.0725,
      "step": 87
    },
    {
      "epoch": 0.6027397260273972,
      "grad_norm": 0.024590915698935577,
      "learning_rate": 8.838772546816856e-05,
      "loss": 0.0569,
      "step": 88
    },
    {
      "epoch": 0.6095890410958904,
      "grad_norm": 0.030750251732919592,
      "learning_rate": 8.800082292492273e-05,
      "loss": 0.0732,
      "step": 89
    },
    {
      "epoch": 0.6164383561643836,
      "grad_norm": 0.032945297540624904,
      "learning_rate": 8.760845670538387e-05,
      "loss": 0.0787,
      "step": 90
    },
    {
      "epoch": 0.6232876712328768,
      "grad_norm": 0.03778017209196359,
      "learning_rate": 8.721068322312007e-05,
      "loss": 0.0835,
      "step": 91
    },
    {
      "epoch": 0.6301369863013698,
      "grad_norm": 0.03507952393071719,
      "learning_rate": 8.680755966914401e-05,
      "loss": 0.0785,
      "step": 92
    },
    {
      "epoch": 0.636986301369863,
      "grad_norm": 0.03248195299567112,
      "learning_rate": 8.639914400369009e-05,
      "loss": 0.0769,
      "step": 93
    },
    {
      "epoch": 0.6438356164383562,
      "grad_norm": 0.03731653007913649,
      "learning_rate": 8.598549494788111e-05,
      "loss": 0.0863,
      "step": 94
    },
    {
      "epoch": 0.6506849315068494,
      "grad_norm": 0.03180106946934432,
      "learning_rate": 8.556667197528543e-05,
      "loss": 0.0601,
      "step": 95
    },
    {
      "epoch": 0.6575342465753424,
      "grad_norm": 0.03375769439746087,
      "learning_rate": 8.5142735303366e-05,
      "loss": 0.0833,
      "step": 96
    },
    {
      "epoch": 0.6643835616438356,
      "grad_norm": 0.03122748644136327,
      "learning_rate": 8.47137458848224e-05,
      "loss": 0.0653,
      "step": 97
    },
    {
      "epoch": 0.6712328767123288,
      "grad_norm": 0.030566834172214723,
      "learning_rate": 8.427976539882724e-05,
      "loss": 0.0794,
      "step": 98
    },
    {
      "epoch": 0.678082191780822,
      "grad_norm": 0.041027373975817134,
      "learning_rate": 8.384085624215801e-05,
      "loss": 0.094,
      "step": 99
    },
    {
      "epoch": 0.684931506849315,
      "grad_norm": 0.02698160042089123,
      "learning_rate": 8.339708152022585e-05,
      "loss": 0.06,
      "step": 100
    },
    {
      "epoch": 0.6917808219178082,
      "grad_norm": 0.026358548855895562,
      "learning_rate": 8.294850503800238e-05,
      "loss": 0.0586,
      "step": 101
    },
    {
      "epoch": 0.6986301369863014,
      "grad_norm": 0.029315218243026478,
      "learning_rate": 8.24951912908459e-05,
      "loss": 0.0659,
      "step": 102
    },
    {
      "epoch": 0.7054794520547946,
      "grad_norm": 0.03730243641514126,
      "learning_rate": 8.203720545522853e-05,
      "loss": 0.0768,
      "step": 103
    },
    {
      "epoch": 0.7123287671232876,
      "grad_norm": 0.028421475360968528,
      "learning_rate": 8.157461337936506e-05,
      "loss": 0.0674,
      "step": 104
    },
    {
      "epoch": 0.7191780821917808,
      "grad_norm": 0.036896447275521424,
      "learning_rate": 8.110748157374565e-05,
      "loss": 0.0813,
      "step": 105
    },
    {
      "epoch": 0.726027397260274,
      "grad_norm": 0.03194136933101962,
      "learning_rate": 8.063587720157298e-05,
      "loss": 0.0661,
      "step": 106
    },
    {
      "epoch": 0.7328767123287672,
      "grad_norm": 0.047606415740070754,
      "learning_rate": 8.01598680691057e-05,
      "loss": 0.0922,
      "step": 107
    },
    {
      "epoch": 0.7397260273972602,
      "grad_norm": 0.03982021001322296,
      "learning_rate": 7.967952261590935e-05,
      "loss": 0.0677,
      "step": 108
    },
    {
      "epoch": 0.7465753424657534,
      "grad_norm": 0.030844190311457055,
      "learning_rate": 7.919490990501636e-05,
      "loss": 0.0693,
      "step": 109
    },
    {
      "epoch": 0.7534246575342466,
      "grad_norm": 0.03128080580199763,
      "learning_rate": 7.870609961299627e-05,
      "loss": 0.055,
      "step": 110
    },
    {
      "epoch": 0.7602739726027398,
      "grad_norm": 0.03042131745977429,
      "learning_rate": 7.821316201993767e-05,
      "loss": 0.0696,
      "step": 111
    },
    {
      "epoch": 0.7671232876712328,
      "grad_norm": 0.03086792896956372,
      "learning_rate": 7.771616799934371e-05,
      "loss": 0.0717,
      "step": 112
    },
    {
      "epoch": 0.773972602739726,
      "grad_norm": 0.03186550616403287,
      "learning_rate": 7.721518900794185e-05,
      "loss": 0.0721,
      "step": 113
    },
    {
      "epoch": 0.7808219178082192,
      "grad_norm": 0.029517902212737826,
      "learning_rate": 7.67102970754101e-05,
      "loss": 0.068,
      "step": 114
    },
    {
      "epoch": 0.7876712328767124,
      "grad_norm": 0.03470530857876834,
      "learning_rate": 7.620156479402066e-05,
      "loss": 0.0687,
      "step": 115
    },
    {
      "epoch": 0.7945205479452054,
      "grad_norm": 0.032497402299353856,
      "learning_rate": 7.568906530820282e-05,
      "loss": 0.0749,
      "step": 116
    },
    {
      "epoch": 0.8013698630136986,
      "grad_norm": 0.03286496346308627,
      "learning_rate": 7.517287230402639e-05,
      "loss": 0.0769,
      "step": 117
    },
    {
      "epoch": 0.8082191780821918,
      "grad_norm": 0.029927285174734974,
      "learning_rate": 7.465305999860728e-05,
      "loss": 0.0681,
      "step": 118
    },
    {
      "epoch": 0.815068493150685,
      "grad_norm": 0.03427786976795219,
      "learning_rate": 7.412970312943671e-05,
      "loss": 0.0777,
      "step": 119
    },
    {
      "epoch": 0.821917808219178,
      "grad_norm": 0.031851819549739036,
      "learning_rate": 7.360287694363566e-05,
      "loss": 0.0653,
      "step": 120
    },
    {
      "epoch": 0.8287671232876712,
      "grad_norm": 0.03126182717380993,
      "learning_rate": 7.30726571871359e-05,
      "loss": 0.0741,
      "step": 121
    },
    {
      "epoch": 0.8356164383561644,
      "grad_norm": 0.03232123010558018,
      "learning_rate": 7.253912009378953e-05,
      "loss": 0.0622,
      "step": 122
    },
    {
      "epoch": 0.8424657534246576,
      "grad_norm": 0.031930228246767285,
      "learning_rate": 7.200234237440815e-05,
      "loss": 0.0711,
      "step": 123
    },
    {
      "epoch": 0.8493150684931506,
      "grad_norm": 0.03869729803850467,
      "learning_rate": 7.146240120573358e-05,
      "loss": 0.0748,
      "step": 124
    },
    {
      "epoch": 0.8561643835616438,
      "grad_norm": 0.04668355562414557,
      "learning_rate": 7.091937421934157e-05,
      "loss": 0.0719,
      "step": 125
    },
    {
      "epoch": 0.863013698630137,
      "grad_norm": 0.045196133927728935,
      "learning_rate": 7.037333949048005e-05,
      "loss": 0.0801,
      "step": 126
    },
    {
      "epoch": 0.8698630136986302,
      "grad_norm": 0.03140632320538256,
      "learning_rate": 6.98243755268437e-05,
      "loss": 0.0626,
      "step": 127
    },
    {
      "epoch": 0.8767123287671232,
      "grad_norm": 0.052496609882496145,
      "learning_rate": 6.927256125728624e-05,
      "loss": 0.0777,
      "step": 128
    },
    {
      "epoch": 0.8835616438356164,
      "grad_norm": 0.03868077359004108,
      "learning_rate": 6.87179760204722e-05,
      "loss": 0.0791,
      "step": 129
    },
    {
      "epoch": 0.8904109589041096,
      "grad_norm": 0.0348172699895961,
      "learning_rate": 6.816069955346985e-05,
      "loss": 0.0743,
      "step": 130
    },
    {
      "epoch": 0.8972602739726028,
      "grad_norm": 0.029728370506506084,
      "learning_rate": 6.760081198028671e-05,
      "loss": 0.062,
      "step": 131
    },
    {
      "epoch": 0.9041095890410958,
      "grad_norm": 0.033304716743838475,
      "learning_rate": 6.703839380034946e-05,
      "loss": 0.0742,
      "step": 132
    },
    {
      "epoch": 0.910958904109589,
      "grad_norm": 0.03402220621737934,
      "learning_rate": 6.647352587693001e-05,
      "loss": 0.074,
      "step": 133
    },
    {
      "epoch": 0.9178082191780822,
      "grad_norm": 0.03029297413232278,
      "learning_rate": 6.590628942551909e-05,
      "loss": 0.0706,
      "step": 134
    },
    {
      "epoch": 0.9246575342465754,
      "grad_norm": 0.03726620583459937,
      "learning_rate": 6.533676600214928e-05,
      "loss": 0.0711,
      "step": 135
    },
    {
      "epoch": 0.9315068493150684,
      "grad_norm": 0.030791726162161644,
      "learning_rate": 6.476503749166904e-05,
      "loss": 0.0844,
      "step": 136
    },
    {
      "epoch": 0.9383561643835616,
      "grad_norm": 0.03101593164912545,
      "learning_rate": 6.419118609596948e-05,
      "loss": 0.0743,
      "step": 137
    },
    {
      "epoch": 0.9452054794520548,
      "grad_norm": 0.037483812819514725,
      "learning_rate": 6.361529432216559e-05,
      "loss": 0.0888,
      "step": 138
    },
    {
      "epoch": 0.952054794520548,
      "grad_norm": 0.034654276627226706,
      "learning_rate": 6.303744497073352e-05,
      "loss": 0.0662,
      "step": 139
    },
    {
      "epoch": 0.958904109589041,
      "grad_norm": 0.10049031991880263,
      "learning_rate": 6.245772112360568e-05,
      "loss": 0.0817,
      "step": 140
    },
    {
      "epoch": 0.9657534246575342,
      "grad_norm": 0.03537251059587243,
      "learning_rate": 6.187620613222544e-05,
      "loss": 0.0768,
      "step": 141
    },
    {
      "epoch": 0.9726027397260274,
      "grad_norm": 0.04616032239868132,
      "learning_rate": 6.129298360556304e-05,
      "loss": 0.0802,
      "step": 142
    },
    {
      "epoch": 0.9794520547945206,
      "grad_norm": 0.03412880356569236,
      "learning_rate": 6.070813739809442e-05,
      "loss": 0.0574,
      "step": 143
    },
    {
      "epoch": 0.9863013698630136,
      "grad_norm": 0.02909505969006464,
      "learning_rate": 6.012175159774488e-05,
      "loss": 0.0604,
      "step": 144
    },
    {
      "epoch": 0.9931506849315068,
      "grad_norm": 0.03935910262269513,
      "learning_rate": 5.953391051379904e-05,
      "loss": 0.0584,
      "step": 145
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.03397983912672022,
      "learning_rate": 5.894469866477905e-05,
      "loss": 0.0577,
      "step": 146
    },
    {
      "epoch": 1.0068493150684932,
      "grad_norm": 0.0311452791456704,
      "learning_rate": 5.8354200766292734e-05,
      "loss": 0.079,
      "step": 147
    },
    {
      "epoch": 1.0136986301369864,
      "grad_norm": 0.033811357449414096,
      "learning_rate": 5.776250171885329e-05,
      "loss": 0.0656,
      "step": 148
    },
    {
      "epoch": 1.0205479452054795,
      "grad_norm": 0.040640212988149904,
      "learning_rate": 5.716968659567256e-05,
      "loss": 0.0728,
      "step": 149
    },
    {
      "epoch": 1.0273972602739727,
      "grad_norm": 0.03057953268248259,
      "learning_rate": 5.6575840630429286e-05,
      "loss": 0.0733,
      "step": 150
    },
    {
      "epoch": 1.0342465753424657,
      "grad_norm": 0.032703562266556005,
      "learning_rate": 5.5981049205014546e-05,
      "loss": 0.0712,
      "step": 151
    },
    {
      "epoch": 1.0410958904109588,
      "grad_norm": 0.03210664170957404,
      "learning_rate": 5.5385397837255556e-05,
      "loss": 0.0691,
      "step": 152
    },
    {
      "epoch": 1.047945205479452,
      "grad_norm": 0.03471378475737469,
      "learning_rate": 5.4788972168620255e-05,
      "loss": 0.0757,
      "step": 153
    },
    {
      "epoch": 1.0547945205479452,
      "grad_norm": 0.03165599022030969,
      "learning_rate": 5.4191857951903826e-05,
      "loss": 0.0597,
      "step": 154
    },
    {
      "epoch": 1.0616438356164384,
      "grad_norm": 0.05451608731514093,
      "learning_rate": 5.359414103889947e-05,
      "loss": 0.0894,
      "step": 155
    },
    {
      "epoch": 1.0684931506849316,
      "grad_norm": 0.04358798536487321,
      "learning_rate": 5.29959073680547e-05,
      "loss": 0.0782,
      "step": 156
    },
    {
      "epoch": 1.0753424657534247,
      "grad_norm": 0.032277506655898194,
      "learning_rate": 5.239724295211541e-05,
      "loss": 0.0704,
      "step": 157
    },
    {
      "epoch": 1.0821917808219177,
      "grad_norm": 0.028670588244485373,
      "learning_rate": 5.179823386575907e-05,
      "loss": 0.0507,
      "step": 158
    },
    {
      "epoch": 1.0890410958904109,
      "grad_norm": 0.03036746716980511,
      "learning_rate": 5.119896623321909e-05,
      "loss": 0.0527,
      "step": 159
    },
    {
      "epoch": 1.095890410958904,
      "grad_norm": 0.03329805529241328,
      "learning_rate": 5.059952621590216e-05,
      "loss": 0.0623,
      "step": 160
    },
    {
      "epoch": 1.1027397260273972,
      "grad_norm": 0.037603460102786174,
      "learning_rate": 5e-05,
      "loss": 0.0695,
      "step": 161
    },
    {
      "epoch": 1.1095890410958904,
      "grad_norm": 0.04120763652112312,
      "learning_rate": 4.940047378409786e-05,
      "loss": 0.0711,
      "step": 162
    },
    {
      "epoch": 1.1164383561643836,
      "grad_norm": 0.0415019727955413,
      "learning_rate": 4.880103376678092e-05,
      "loss": 0.0638,
      "step": 163
    },
    {
      "epoch": 1.1232876712328768,
      "grad_norm": 0.032850408356261955,
      "learning_rate": 4.820176613424095e-05,
      "loss": 0.0502,
      "step": 164
    },
    {
      "epoch": 1.13013698630137,
      "grad_norm": 0.051682446059570064,
      "learning_rate": 4.7602757047884595e-05,
      "loss": 0.0658,
      "step": 165
    },
    {
      "epoch": 1.1369863013698631,
      "grad_norm": 0.03126729469626143,
      "learning_rate": 4.700409263194531e-05,
      "loss": 0.0506,
      "step": 166
    },
    {
      "epoch": 1.143835616438356,
      "grad_norm": 0.03940644676794397,
      "learning_rate": 4.640585896110054e-05,
      "loss": 0.0632,
      "step": 167
    },
    {
      "epoch": 1.1506849315068493,
      "grad_norm": 0.04126874600149052,
      "learning_rate": 4.580814204809618e-05,
      "loss": 0.0813,
      "step": 168
    },
    {
      "epoch": 1.1575342465753424,
      "grad_norm": 0.04156579483287442,
      "learning_rate": 4.5211027831379757e-05,
      "loss": 0.0652,
      "step": 169
    },
    {
      "epoch": 1.1643835616438356,
      "grad_norm": 0.03391355375732456,
      "learning_rate": 4.461460216274445e-05,
      "loss": 0.057,
      "step": 170
    },
    {
      "epoch": 1.1712328767123288,
      "grad_norm": 0.038632199419011234,
      "learning_rate": 4.401895079498547e-05,
      "loss": 0.0661,
      "step": 171
    },
    {
      "epoch": 1.178082191780822,
      "grad_norm": 0.03708167445939923,
      "learning_rate": 4.3424159369570725e-05,
      "loss": 0.0624,
      "step": 172
    },
    {
      "epoch": 1.1849315068493151,
      "grad_norm": 0.03753946698450887,
      "learning_rate": 4.283031340432747e-05,
      "loss": 0.0612,
      "step": 173
    },
    {
      "epoch": 1.191780821917808,
      "grad_norm": 0.03537905440706239,
      "learning_rate": 4.223749828114672e-05,
      "loss": 0.0673,
      "step": 174
    },
    {
      "epoch": 1.1986301369863013,
      "grad_norm": 0.03940327075341433,
      "learning_rate": 4.1645799233707284e-05,
      "loss": 0.0607,
      "step": 175
    },
    {
      "epoch": 1.2054794520547945,
      "grad_norm": 0.041138452676237995,
      "learning_rate": 4.1055301335220955e-05,
      "loss": 0.0757,
      "step": 176
    },
    {
      "epoch": 1.2123287671232876,
      "grad_norm": 0.03578703324326037,
      "learning_rate": 4.0466089486200976e-05,
      "loss": 0.0707,
      "step": 177
    },
    {
      "epoch": 1.2191780821917808,
      "grad_norm": 0.039832365736742495,
      "learning_rate": 3.987824840225512e-05,
      "loss": 0.071,
      "step": 178
    },
    {
      "epoch": 1.226027397260274,
      "grad_norm": 0.038723411367421474,
      "learning_rate": 3.9291862601905595e-05,
      "loss": 0.0728,
      "step": 179
    },
    {
      "epoch": 1.2328767123287672,
      "grad_norm": 0.04498308725984581,
      "learning_rate": 3.870701639443698e-05,
      "loss": 0.0703,
      "step": 180
    },
    {
      "epoch": 1.2397260273972603,
      "grad_norm": 0.031865141682931065,
      "learning_rate": 3.812379386777457e-05,
      "loss": 0.0523,
      "step": 181
    },
    {
      "epoch": 1.2465753424657535,
      "grad_norm": 0.038277951507247616,
      "learning_rate": 3.7542278876394336e-05,
      "loss": 0.06,
      "step": 182
    },
    {
      "epoch": 1.2534246575342465,
      "grad_norm": 0.0650327431598633,
      "learning_rate": 3.696255502926649e-05,
      "loss": 0.0586,
      "step": 183
    },
    {
      "epoch": 1.2602739726027397,
      "grad_norm": 0.041491341553849696,
      "learning_rate": 3.638470567783442e-05,
      "loss": 0.0741,
      "step": 184
    },
    {
      "epoch": 1.2671232876712328,
      "grad_norm": 0.03898025654685055,
      "learning_rate": 3.580881390403052e-05,
      "loss": 0.07,
      "step": 185
    },
    {
      "epoch": 1.273972602739726,
      "grad_norm": 0.03741545061342083,
      "learning_rate": 3.5234962508330974e-05,
      "loss": 0.0648,
      "step": 186
    },
    {
      "epoch": 1.2808219178082192,
      "grad_norm": 0.03579074437335611,
      "learning_rate": 3.466323399785072e-05,
      "loss": 0.0519,
      "step": 187
    },
    {
      "epoch": 1.2876712328767124,
      "grad_norm": 0.03622522476818763,
      "learning_rate": 3.409371057448092e-05,
      "loss": 0.0581,
      "step": 188
    },
    {
      "epoch": 1.2945205479452055,
      "grad_norm": 0.05446559861941158,
      "learning_rate": 3.352647412307002e-05,
      "loss": 0.0714,
      "step": 189
    },
    {
      "epoch": 1.3013698630136985,
      "grad_norm": 0.03832700455471586,
      "learning_rate": 3.296160619965056e-05,
      "loss": 0.055,
      "step": 190
    },
    {
      "epoch": 1.308219178082192,
      "grad_norm": 0.03896050037792126,
      "learning_rate": 3.239918801971332e-05,
      "loss": 0.062,
      "step": 191
    },
    {
      "epoch": 1.3150684931506849,
      "grad_norm": 0.03267778522774074,
      "learning_rate": 3.183930044653014e-05,
      "loss": 0.0546,
      "step": 192
    },
    {
      "epoch": 1.321917808219178,
      "grad_norm": 0.04339482297121572,
      "learning_rate": 3.1282023979527805e-05,
      "loss": 0.0676,
      "step": 193
    },
    {
      "epoch": 1.3287671232876712,
      "grad_norm": 0.04149269813272275,
      "learning_rate": 3.072743874271376e-05,
      "loss": 0.0576,
      "step": 194
    },
    {
      "epoch": 1.3356164383561644,
      "grad_norm": 0.05669408827237824,
      "learning_rate": 3.0175624473156316e-05,
      "loss": 0.0559,
      "step": 195
    },
    {
      "epoch": 1.3424657534246576,
      "grad_norm": 0.04316637345791664,
      "learning_rate": 2.962666050951997e-05,
      "loss": 0.067,
      "step": 196
    },
    {
      "epoch": 1.3493150684931507,
      "grad_norm": 0.04179017639942075,
      "learning_rate": 2.9080625780658455e-05,
      "loss": 0.0585,
      "step": 197
    },
    {
      "epoch": 1.356164383561644,
      "grad_norm": 0.04086829863785544,
      "learning_rate": 2.853759879426644e-05,
      "loss": 0.0637,
      "step": 198
    },
    {
      "epoch": 1.3630136986301369,
      "grad_norm": 0.045447600350027706,
      "learning_rate": 2.7997657625591867e-05,
      "loss": 0.0578,
      "step": 199
    },
    {
      "epoch": 1.36986301369863,
      "grad_norm": 0.04063023817685305,
      "learning_rate": 2.7460879906210487e-05,
      "loss": 0.0704,
      "step": 200
    },
    {
      "epoch": 1.36986301369863,
      "eval_loss": 0.07383698970079422,
      "eval_runtime": 6.4837,
      "eval_samples_per_second": 0.925,
      "eval_steps_per_second": 0.308,
      "step": 200
    },
    {
      "epoch": 1.3767123287671232,
      "grad_norm": 0.043841411312286985,
      "learning_rate": 2.6927342812864116e-05,
      "loss": 0.0667,
      "step": 201
    },
    {
      "epoch": 1.3835616438356164,
      "grad_norm": 0.04301017602851657,
      "learning_rate": 2.6397123056364365e-05,
      "loss": 0.0726,
      "step": 202
    },
    {
      "epoch": 1.3904109589041096,
      "grad_norm": 0.042839496066386186,
      "learning_rate": 2.5870296870563286e-05,
      "loss": 0.0602,
      "step": 203
    },
    {
      "epoch": 1.3972602739726028,
      "grad_norm": 0.03829564126132437,
      "learning_rate": 2.5346940001392728e-05,
      "loss": 0.0645,
      "step": 204
    },
    {
      "epoch": 1.404109589041096,
      "grad_norm": 0.03507625071131985,
      "learning_rate": 2.482712769597363e-05,
      "loss": 0.0543,
      "step": 205
    },
    {
      "epoch": 1.410958904109589,
      "grad_norm": 0.03743875489488463,
      "learning_rate": 2.4310934691797203e-05,
      "loss": 0.0602,
      "step": 206
    },
    {
      "epoch": 1.4178082191780823,
      "grad_norm": 0.037325448164000144,
      "learning_rate": 2.379843520597937e-05,
      "loss": 0.056,
      "step": 207
    },
    {
      "epoch": 1.4246575342465753,
      "grad_norm": 0.04519550162385289,
      "learning_rate": 2.3289702924589914e-05,
      "loss": 0.0823,
      "step": 208
    },
    {
      "epoch": 1.4315068493150684,
      "grad_norm": 0.041415718432946946,
      "learning_rate": 2.2784810992058154e-05,
      "loss": 0.069,
      "step": 209
    },
    {
      "epoch": 1.4383561643835616,
      "grad_norm": 0.04615134339463282,
      "learning_rate": 2.22838320006563e-05,
      "loss": 0.0709,
      "step": 210
    },
    {
      "epoch": 1.4452054794520548,
      "grad_norm": 0.03508689289745425,
      "learning_rate": 2.1786837980062342e-05,
      "loss": 0.0562,
      "step": 211
    },
    {
      "epoch": 1.452054794520548,
      "grad_norm": 0.042082412191430475,
      "learning_rate": 2.129390038700374e-05,
      "loss": 0.0711,
      "step": 212
    },
    {
      "epoch": 1.4589041095890412,
      "grad_norm": 0.04921460988289041,
      "learning_rate": 2.0805090094983636e-05,
      "loss": 0.066,
      "step": 213
    },
    {
      "epoch": 1.4657534246575343,
      "grad_norm": 0.03476195821739949,
      "learning_rate": 2.0320477384090665e-05,
      "loss": 0.0516,
      "step": 214
    },
    {
      "epoch": 1.4726027397260273,
      "grad_norm": 0.03974475045265387,
      "learning_rate": 1.9840131930894333e-05,
      "loss": 0.0507,
      "step": 215
    },
    {
      "epoch": 1.4794520547945205,
      "grad_norm": 0.04495458260995571,
      "learning_rate": 1.936412279842705e-05,
      "loss": 0.071,
      "step": 216
    },
    {
      "epoch": 1.4863013698630136,
      "grad_norm": 0.032618789951219244,
      "learning_rate": 1.8892518426254364e-05,
      "loss": 0.0487,
      "step": 217
    },
    {
      "epoch": 1.4931506849315068,
      "grad_norm": 0.05233338159738159,
      "learning_rate": 1.842538662063496e-05,
      "loss": 0.0779,
      "step": 218
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.043093246770756745,
      "learning_rate": 1.7962794544771477e-05,
      "loss": 0.0731,
      "step": 219
    },
    {
      "epoch": 1.5068493150684932,
      "grad_norm": 0.04778965621357637,
      "learning_rate": 1.7504808709154104e-05,
      "loss": 0.0767,
      "step": 220
    },
    {
      "epoch": 1.5136986301369864,
      "grad_norm": 0.04225455780801827,
      "learning_rate": 1.705149496199762e-05,
      "loss": 0.065,
      "step": 221
    },
    {
      "epoch": 1.5205479452054793,
      "grad_norm": 0.03777201213330916,
      "learning_rate": 1.6602918479774148e-05,
      "loss": 0.0587,
      "step": 222
    },
    {
      "epoch": 1.5273972602739727,
      "grad_norm": 0.050670841355552755,
      "learning_rate": 1.6159143757842004e-05,
      "loss": 0.0694,
      "step": 223
    },
    {
      "epoch": 1.5342465753424657,
      "grad_norm": 0.04169608777354125,
      "learning_rate": 1.5720234601172766e-05,
      "loss": 0.0622,
      "step": 224
    },
    {
      "epoch": 1.541095890410959,
      "grad_norm": 0.0399325959403793,
      "learning_rate": 1.5286254115177623e-05,
      "loss": 0.0559,
      "step": 225
    },
    {
      "epoch": 1.547945205479452,
      "grad_norm": 0.04347771997206968,
      "learning_rate": 1.485726469663401e-05,
      "loss": 0.0625,
      "step": 226
    },
    {
      "epoch": 1.5547945205479452,
      "grad_norm": 0.04329191462577287,
      "learning_rate": 1.4433328024714581e-05,
      "loss": 0.0682,
      "step": 227
    },
    {
      "epoch": 1.5616438356164384,
      "grad_norm": 0.045935945373833685,
      "learning_rate": 1.4014505052118892e-05,
      "loss": 0.063,
      "step": 228
    },
    {
      "epoch": 1.5684931506849316,
      "grad_norm": 0.03953171472704261,
      "learning_rate": 1.3600855996309935e-05,
      "loss": 0.0621,
      "step": 229
    },
    {
      "epoch": 1.5753424657534247,
      "grad_norm": 0.042842743444326405,
      "learning_rate": 1.3192440330856004e-05,
      "loss": 0.0584,
      "step": 230
    },
    {
      "epoch": 1.5821917808219177,
      "grad_norm": 0.04147136350311881,
      "learning_rate": 1.2789316776879939e-05,
      "loss": 0.0571,
      "step": 231
    },
    {
      "epoch": 1.589041095890411,
      "grad_norm": 0.041096032631508496,
      "learning_rate": 1.2391543294616147e-05,
      "loss": 0.0569,
      "step": 232
    },
    {
      "epoch": 1.595890410958904,
      "grad_norm": 0.04826360135309353,
      "learning_rate": 1.1999177075077278e-05,
      "loss": 0.0749,
      "step": 233
    },
    {
      "epoch": 1.6027397260273972,
      "grad_norm": 0.034935721211031806,
      "learning_rate": 1.1612274531831463e-05,
      "loss": 0.0514,
      "step": 234
    },
    {
      "epoch": 1.6095890410958904,
      "grad_norm": 0.04282701283898458,
      "learning_rate": 1.123089129289117e-05,
      "loss": 0.0671,
      "step": 235
    },
    {
      "epoch": 1.6164383561643836,
      "grad_norm": 0.05961503389381486,
      "learning_rate": 1.0855082192715294e-05,
      "loss": 0.0665,
      "step": 236
    },
    {
      "epoch": 1.6232876712328768,
      "grad_norm": 0.06432557444245549,
      "learning_rate": 1.0484901264325025e-05,
      "loss": 0.0755,
      "step": 237
    },
    {
      "epoch": 1.6301369863013697,
      "grad_norm": 0.04253716073284135,
      "learning_rate": 1.0120401731535212e-05,
      "loss": 0.0733,
      "step": 238
    },
    {
      "epoch": 1.6369863013698631,
      "grad_norm": 0.0774475760401046,
      "learning_rate": 9.761636001301871e-06,
      "loss": 0.065,
      "step": 239
    },
    {
      "epoch": 1.643835616438356,
      "grad_norm": 0.03996033149580056,
      "learning_rate": 9.408655656187282e-06,
      "loss": 0.065,
      "step": 240
    },
    {
      "epoch": 1.6506849315068495,
      "grad_norm": 0.03785205141208416,
      "learning_rate": 9.061511446943533e-06,
      "loss": 0.0548,
      "step": 241
    },
    {
      "epoch": 1.6575342465753424,
      "grad_norm": 0.03850363195978434,
      "learning_rate": 8.720253285215685e-06,
      "loss": 0.0587,
      "step": 242
    },
    {
      "epoch": 1.6643835616438356,
      "grad_norm": 0.0469269115044975,
      "learning_rate": 8.384930236365629e-06,
      "loss": 0.0634,
      "step": 243
    },
    {
      "epoch": 1.6712328767123288,
      "grad_norm": 0.0498361427852067,
      "learning_rate": 8.0555905124175e-06,
      "loss": 0.051,
      "step": 244
    },
    {
      "epoch": 1.678082191780822,
      "grad_norm": 0.04702037163211539,
      "learning_rate": 7.732281465125907e-06,
      "loss": 0.0677,
      "step": 245
    },
    {
      "epoch": 1.6849315068493151,
      "grad_norm": 0.04724045679820981,
      "learning_rate": 7.415049579167782e-06,
      "loss": 0.0731,
      "step": 246
    },
    {
      "epoch": 1.691780821917808,
      "grad_norm": 0.04960546509274232,
      "learning_rate": 7.103940465458936e-06,
      "loss": 0.0777,
      "step": 247
    },
    {
      "epoch": 1.6986301369863015,
      "grad_norm": 0.04749987160293405,
      "learning_rate": 6.798998854596189e-06,
      "loss": 0.0531,
      "step": 248
    },
    {
      "epoch": 1.7054794520547945,
      "grad_norm": 0.04656027534681338,
      "learning_rate": 6.500268590426106e-06,
      "loss": 0.0547,
      "step": 249
    },
    {
      "epoch": 1.7123287671232876,
      "grad_norm": 0.043116663189771226,
      "learning_rate": 6.207792623741249e-06,
      "loss": 0.0677,
      "step": 250
    },
    {
      "epoch": 1.7191780821917808,
      "grad_norm": 0.04714594190332252,
      "learning_rate": 5.9216130061047646e-06,
      "loss": 0.068,
      "step": 251
    },
    {
      "epoch": 1.726027397260274,
      "grad_norm": 0.0442482453639658,
      "learning_rate": 5.641770883804365e-06,
      "loss": 0.0602,
      "step": 252
    },
    {
      "epoch": 1.7328767123287672,
      "grad_norm": 0.044456632129314805,
      "learning_rate": 5.368306491936325e-06,
      "loss": 0.0661,
      "step": 253
    },
    {
      "epoch": 1.7397260273972601,
      "grad_norm": 0.04864641601498424,
      "learning_rate": 5.101259148620619e-06,
      "loss": 0.0676,
      "step": 254
    },
    {
      "epoch": 1.7465753424657535,
      "grad_norm": 0.043784015189862524,
      "learning_rate": 4.840667249347824e-06,
      "loss": 0.0669,
      "step": 255
    },
    {
      "epoch": 1.7534246575342465,
      "grad_norm": 0.043216466288645235,
      "learning_rate": 4.586568261458729e-06,
      "loss": 0.0644,
      "step": 256
    },
    {
      "epoch": 1.7602739726027399,
      "grad_norm": 0.042340118838538034,
      "learning_rate": 4.3389987187573145e-06,
      "loss": 0.053,
      "step": 257
    },
    {
      "epoch": 1.7671232876712328,
      "grad_norm": 0.03965234038836836,
      "learning_rate": 4.097994216258039e-06,
      "loss": 0.0528,
      "step": 258
    },
    {
      "epoch": 1.773972602739726,
      "grad_norm": 0.04330912545003313,
      "learning_rate": 3.8635894050680466e-06,
      "loss": 0.062,
      "step": 259
    },
    {
      "epoch": 1.7808219178082192,
      "grad_norm": 0.04245993591175113,
      "learning_rate": 3.63581798740511e-06,
      "loss": 0.0682,
      "step": 260
    },
    {
      "epoch": 1.7876712328767124,
      "grad_norm": 0.046454610509872485,
      "learning_rate": 3.4147127117520104e-06,
      "loss": 0.0773,
      "step": 261
    },
    {
      "epoch": 1.7945205479452055,
      "grad_norm": 0.061061675638647096,
      "learning_rate": 3.2003053681480098e-06,
      "loss": 0.0628,
      "step": 262
    },
    {
      "epoch": 1.8013698630136985,
      "grad_norm": 0.04251403620622773,
      "learning_rate": 2.992626783618152e-06,
      "loss": 0.0506,
      "step": 263
    },
    {
      "epoch": 1.808219178082192,
      "grad_norm": 0.04783592253659306,
      "learning_rate": 2.791706817741041e-06,
      "loss": 0.0737,
      "step": 264
    },
    {
      "epoch": 1.8150684931506849,
      "grad_norm": 0.04359969004920878,
      "learning_rate": 2.59757435835567e-06,
      "loss": 0.0599,
      "step": 265
    },
    {
      "epoch": 1.821917808219178,
      "grad_norm": 0.044487631888004406,
      "learning_rate": 2.41025731740801e-06,
      "loss": 0.0692,
      "step": 266
    },
    {
      "epoch": 1.8287671232876712,
      "grad_norm": 0.04510155943713286,
      "learning_rate": 2.229782626937865e-06,
      "loss": 0.0633,
      "step": 267
    },
    {
      "epoch": 1.8356164383561644,
      "grad_norm": 0.04035438478741115,
      "learning_rate": 2.056176235206664e-06,
      "loss": 0.0601,
      "step": 268
    },
    {
      "epoch": 1.8424657534246576,
      "grad_norm": 0.046645570169377086,
      "learning_rate": 1.889463102966671e-06,
      "loss": 0.0671,
      "step": 269
    },
    {
      "epoch": 1.8493150684931505,
      "grad_norm": 0.045925921341422184,
      "learning_rate": 1.729667199872187e-06,
      "loss": 0.0796,
      "step": 270
    },
    {
      "epoch": 1.856164383561644,
      "grad_norm": 0.047745127307766096,
      "learning_rate": 1.5768115010332208e-06,
      "loss": 0.0681,
      "step": 271
    },
    {
      "epoch": 1.8630136986301369,
      "grad_norm": 0.04828726659252653,
      "learning_rate": 1.4309179837122044e-06,
      "loss": 0.0637,
      "step": 272
    },
    {
      "epoch": 1.8698630136986303,
      "grad_norm": 0.03831444871977773,
      "learning_rate": 1.2920076241641376e-06,
      "loss": 0.0537,
      "step": 273
    },
    {
      "epoch": 1.8767123287671232,
      "grad_norm": 0.043143594829470666,
      "learning_rate": 1.1601003946206724e-06,
      "loss": 0.0645,
      "step": 274
    },
    {
      "epoch": 1.8835616438356164,
      "grad_norm": 0.04108818302281154,
      "learning_rate": 1.0352152604185428e-06,
      "loss": 0.0596,
      "step": 275
    },
    {
      "epoch": 1.8904109589041096,
      "grad_norm": 0.04419177931878111,
      "learning_rate": 9.17370177272775e-07,
      "loss": 0.0681,
      "step": 276
    },
    {
      "epoch": 1.8972602739726028,
      "grad_norm": 0.04544816624943878,
      "learning_rate": 8.065820886950404e-07,
      "loss": 0.0665,
      "step": 277
    },
    {
      "epoch": 1.904109589041096,
      "grad_norm": 0.044560046311441934,
      "learning_rate": 7.028669235575714e-07,
      "loss": 0.0665,
      "step": 278
    },
    {
      "epoch": 1.910958904109589,
      "grad_norm": 0.046336508445476945,
      "learning_rate": 6.062395938029485e-07,
      "loss": 0.0715,
      "step": 279
    },
    {
      "epoch": 1.9178082191780823,
      "grad_norm": 0.046783520939667735,
      "learning_rate": 5.167139923000553e-07,
      "loss": 0.0671,
      "step": 280
    },
    {
      "epoch": 1.9246575342465753,
      "grad_norm": 0.045134549211076444,
      "learning_rate": 4.343029908466301e-07,
      "loss": 0.0651,
      "step": 281
    },
    {
      "epoch": 1.9315068493150684,
      "grad_norm": 0.04620102733800356,
      "learning_rate": 3.5901843831857576e-07,
      "loss": 0.0611,
      "step": 282
    },
    {
      "epoch": 1.9383561643835616,
      "grad_norm": 0.04290459317190587,
      "learning_rate": 2.908711589663549e-07,
      "loss": 0.0688,
      "step": 283
    },
    {
      "epoch": 1.9452054794520548,
      "grad_norm": 0.0459065259141636,
      "learning_rate": 2.2987095085867937e-07,
      "loss": 0.0627,
      "step": 284
    },
    {
      "epoch": 1.952054794520548,
      "grad_norm": 0.062357302229075894,
      "learning_rate": 1.760265844738096e-07,
      "loss": 0.0792,
      "step": 285
    },
    {
      "epoch": 1.958904109589041,
      "grad_norm": 0.044164802827650715,
      "learning_rate": 1.2934580143851295e-07,
      "loss": 0.0601,
      "step": 286
    },
    {
      "epoch": 1.9657534246575343,
      "grad_norm": 0.03615207612143871,
      "learning_rate": 8.983531341500983e-08,
      "loss": 0.0512,
      "step": 287
    },
    {
      "epoch": 1.9726027397260273,
      "grad_norm": 0.04362072520783342,
      "learning_rate": 5.750080113598455e-08,
      "loss": 0.0616,
      "step": 288
    },
    {
      "epoch": 1.9794520547945207,
      "grad_norm": 0.04831491776296162,
      "learning_rate": 3.2346913587816275e-08,
      "loss": 0.0731,
      "step": 289
    },
    {
      "epoch": 1.9863013698630136,
      "grad_norm": 0.050700245606893124,
      "learning_rate": 1.4377267342158274e-08,
      "loss": 0.0733,
      "step": 290
    },
    {
      "epoch": 1.9931506849315068,
      "grad_norm": 0.04317508790841958,
      "learning_rate": 3.594446035964927e-09,
      "loss": 0.054,
      "step": 291
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.05236597882946408,
      "learning_rate": 0.0,
      "loss": 0.0588,
      "step": 292
    },
    {
      "epoch": 2.0,
      "step": 292,
      "total_flos": 739456868941824.0,
      "train_loss": 0.07426239744032899,
      "train_runtime": 2224.6089,
      "train_samples_per_second": 0.523,
      "train_steps_per_second": 0.131
    }
  ],
  "logging_steps": 1,
  "max_steps": 292,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 739456868941824.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}