{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 292,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00684931506849315,
"grad_norm": 0.030539813126927277,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.1076,
"step": 1
},
{
"epoch": 0.0136986301369863,
"grad_norm": 0.03492658058904307,
"learning_rate": 6.666666666666667e-06,
"loss": 0.1246,
"step": 2
},
{
"epoch": 0.02054794520547945,
"grad_norm": 0.04865725802068095,
"learning_rate": 1e-05,
"loss": 0.1586,
"step": 3
},
{
"epoch": 0.0273972602739726,
"grad_norm": 0.05113567210987629,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.1514,
"step": 4
},
{
"epoch": 0.03424657534246575,
"grad_norm": 0.04332482337715696,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.1334,
"step": 5
},
{
"epoch": 0.0410958904109589,
"grad_norm": 0.056501517819200174,
"learning_rate": 2e-05,
"loss": 0.162,
"step": 6
},
{
"epoch": 0.04794520547945205,
"grad_norm": 0.02608028374746435,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.1031,
"step": 7
},
{
"epoch": 0.0547945205479452,
"grad_norm": 0.03653546691009632,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.1047,
"step": 8
},
{
"epoch": 0.06164383561643835,
"grad_norm": 0.036654425448601695,
"learning_rate": 3e-05,
"loss": 0.1063,
"step": 9
},
{
"epoch": 0.0684931506849315,
"grad_norm": 0.07120435117331021,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.1899,
"step": 10
},
{
"epoch": 0.07534246575342465,
"grad_norm": 0.052020978283898525,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.1369,
"step": 11
},
{
"epoch": 0.0821917808219178,
"grad_norm": 0.05373108737583589,
"learning_rate": 4e-05,
"loss": 0.1232,
"step": 12
},
{
"epoch": 0.08904109589041095,
"grad_norm": 0.04869530777109541,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.1137,
"step": 13
},
{
"epoch": 0.0958904109589041,
"grad_norm": 0.05909792067471564,
"learning_rate": 4.666666666666667e-05,
"loss": 0.1211,
"step": 14
},
{
"epoch": 0.10273972602739725,
"grad_norm": 0.07782073062487518,
"learning_rate": 5e-05,
"loss": 0.1419,
"step": 15
},
{
"epoch": 0.1095890410958904,
"grad_norm": 0.10068121599027634,
"learning_rate": 5.333333333333333e-05,
"loss": 0.1573,
"step": 16
},
{
"epoch": 0.11643835616438356,
"grad_norm": 0.0794018071448985,
"learning_rate": 5.666666666666667e-05,
"loss": 0.1368,
"step": 17
},
{
"epoch": 0.1232876712328767,
"grad_norm": 0.03925546794880312,
"learning_rate": 6e-05,
"loss": 0.0936,
"step": 18
},
{
"epoch": 0.13013698630136986,
"grad_norm": 0.09398334893562084,
"learning_rate": 6.333333333333333e-05,
"loss": 0.1405,
"step": 19
},
{
"epoch": 0.136986301369863,
"grad_norm": 0.03366472291998534,
"learning_rate": 6.666666666666667e-05,
"loss": 0.0939,
"step": 20
},
{
"epoch": 0.14383561643835616,
"grad_norm": 0.03002583884373123,
"learning_rate": 7e-05,
"loss": 0.0857,
"step": 21
},
{
"epoch": 0.1506849315068493,
"grad_norm": 0.04526244411827793,
"learning_rate": 7.333333333333333e-05,
"loss": 0.0932,
"step": 22
},
{
"epoch": 0.15753424657534246,
"grad_norm": 0.0271467232481723,
"learning_rate": 7.666666666666667e-05,
"loss": 0.0926,
"step": 23
},
{
"epoch": 0.1643835616438356,
"grad_norm": 0.036006533907184064,
"learning_rate": 8e-05,
"loss": 0.1092,
"step": 24
},
{
"epoch": 0.17123287671232876,
"grad_norm": 0.03819263334586657,
"learning_rate": 8.333333333333334e-05,
"loss": 0.0967,
"step": 25
},
{
"epoch": 0.1780821917808219,
"grad_norm": 0.04732574961540268,
"learning_rate": 8.666666666666667e-05,
"loss": 0.0933,
"step": 26
},
{
"epoch": 0.18493150684931506,
"grad_norm": 0.04933244373512878,
"learning_rate": 9e-05,
"loss": 0.0964,
"step": 27
},
{
"epoch": 0.1917808219178082,
"grad_norm": 0.05164570324383495,
"learning_rate": 9.333333333333334e-05,
"loss": 0.0893,
"step": 28
},
{
"epoch": 0.19863013698630136,
"grad_norm": 0.06506455363157941,
"learning_rate": 9.666666666666667e-05,
"loss": 0.1099,
"step": 29
},
{
"epoch": 0.2054794520547945,
"grad_norm": 0.05004738114863798,
"learning_rate": 0.0001,
"loss": 0.093,
"step": 30
},
{
"epoch": 0.21232876712328766,
"grad_norm": 0.058372907031022096,
"learning_rate": 9.999640555396404e-05,
"loss": 0.0859,
"step": 31
},
{
"epoch": 0.2191780821917808,
"grad_norm": 0.04214183460112646,
"learning_rate": 9.998562273265785e-05,
"loss": 0.0757,
"step": 32
},
{
"epoch": 0.22602739726027396,
"grad_norm": 0.045590856190952045,
"learning_rate": 9.996765308641218e-05,
"loss": 0.0909,
"step": 33
},
{
"epoch": 0.2328767123287671,
"grad_norm": 0.03615919142026033,
"learning_rate": 9.994249919886402e-05,
"loss": 0.0773,
"step": 34
},
{
"epoch": 0.23972602739726026,
"grad_norm": 0.053734428778710715,
"learning_rate": 9.991016468658499e-05,
"loss": 0.0804,
"step": 35
},
{
"epoch": 0.2465753424657534,
"grad_norm": 0.03350548900671173,
"learning_rate": 9.98706541985615e-05,
"loss": 0.0949,
"step": 36
},
{
"epoch": 0.2534246575342466,
"grad_norm": 0.027043847680669336,
"learning_rate": 9.98239734155262e-05,
"loss": 0.0689,
"step": 37
},
{
"epoch": 0.2602739726027397,
"grad_norm": 0.03089668501975604,
"learning_rate": 9.977012904914133e-05,
"loss": 0.0896,
"step": 38
},
{
"epoch": 0.2671232876712329,
"grad_norm": 0.029975461306757812,
"learning_rate": 9.970912884103364e-05,
"loss": 0.0818,
"step": 39
},
{
"epoch": 0.273972602739726,
"grad_norm": 0.029112303333024562,
"learning_rate": 9.964098156168142e-05,
"loss": 0.0764,
"step": 40
},
{
"epoch": 0.2808219178082192,
"grad_norm": 0.03609528093820637,
"learning_rate": 9.956569700915337e-05,
"loss": 0.0896,
"step": 41
},
{
"epoch": 0.2876712328767123,
"grad_norm": 0.02923285599907434,
"learning_rate": 9.948328600769995e-05,
"loss": 0.0687,
"step": 42
},
{
"epoch": 0.2945205479452055,
"grad_norm": 0.03722889238353647,
"learning_rate": 9.939376040619705e-05,
"loss": 0.0904,
"step": 43
},
{
"epoch": 0.3013698630136986,
"grad_norm": 0.032059757320644165,
"learning_rate": 9.929713307644244e-05,
"loss": 0.0834,
"step": 44
},
{
"epoch": 0.3082191780821918,
"grad_norm": 0.04817854262074185,
"learning_rate": 9.919341791130496e-05,
"loss": 0.0913,
"step": 45
},
{
"epoch": 0.3150684931506849,
"grad_norm": 0.027208193309115385,
"learning_rate": 9.908262982272724e-05,
"loss": 0.0783,
"step": 46
},
{
"epoch": 0.3219178082191781,
"grad_norm": 0.03396529679519977,
"learning_rate": 9.896478473958146e-05,
"loss": 0.0788,
"step": 47
},
{
"epoch": 0.3287671232876712,
"grad_norm": 0.03703807434511364,
"learning_rate": 9.883989960537933e-05,
"loss": 0.0737,
"step": 48
},
{
"epoch": 0.3356164383561644,
"grad_norm": 0.037750701903354035,
"learning_rate": 9.870799237583587e-05,
"loss": 0.0879,
"step": 49
},
{
"epoch": 0.3424657534246575,
"grad_norm": 0.0421005656051737,
"learning_rate": 9.85690820162878e-05,
"loss": 0.0762,
"step": 50
},
{
"epoch": 0.3493150684931507,
"grad_norm": 0.02733701424368454,
"learning_rate": 9.842318849896679e-05,
"loss": 0.0716,
"step": 51
},
{
"epoch": 0.3561643835616438,
"grad_norm": 0.027664399030788704,
"learning_rate": 9.827033280012783e-05,
"loss": 0.0783,
"step": 52
},
{
"epoch": 0.363013698630137,
"grad_norm": 0.030867201947854212,
"learning_rate": 9.811053689703334e-05,
"loss": 0.0895,
"step": 53
},
{
"epoch": 0.3698630136986301,
"grad_norm": 0.02445456343322928,
"learning_rate": 9.794382376479334e-05,
"loss": 0.0669,
"step": 54
},
{
"epoch": 0.3767123287671233,
"grad_norm": 0.0355725180623145,
"learning_rate": 9.777021737306214e-05,
"loss": 0.0758,
"step": 55
},
{
"epoch": 0.3835616438356164,
"grad_norm": 0.02787477968018966,
"learning_rate": 9.7589742682592e-05,
"loss": 0.0671,
"step": 56
},
{
"epoch": 0.3904109589041096,
"grad_norm": 0.02917250086551862,
"learning_rate": 9.740242564164434e-05,
"loss": 0.0777,
"step": 57
},
{
"epoch": 0.3972602739726027,
"grad_norm": 0.027367394927186293,
"learning_rate": 9.720829318225897e-05,
"loss": 0.0784,
"step": 58
},
{
"epoch": 0.4041095890410959,
"grad_norm": 0.03404049628582292,
"learning_rate": 9.700737321638185e-05,
"loss": 0.0831,
"step": 59
},
{
"epoch": 0.410958904109589,
"grad_norm": 0.03007296844683444,
"learning_rate": 9.6799694631852e-05,
"loss": 0.0666,
"step": 60
},
{
"epoch": 0.4178082191780822,
"grad_norm": 0.02540452547965842,
"learning_rate": 9.6585287288248e-05,
"loss": 0.0661,
"step": 61
},
{
"epoch": 0.4246575342465753,
"grad_norm": 0.026781782354160222,
"learning_rate": 9.63641820125949e-05,
"loss": 0.074,
"step": 62
},
{
"epoch": 0.4315068493150685,
"grad_norm": 0.026783892217477907,
"learning_rate": 9.613641059493197e-05,
"loss": 0.066,
"step": 63
},
{
"epoch": 0.4383561643835616,
"grad_norm": 0.03124681883629973,
"learning_rate": 9.590200578374198e-05,
"loss": 0.0704,
"step": 64
},
{
"epoch": 0.4452054794520548,
"grad_norm": 0.03417198666061255,
"learning_rate": 9.56610012812427e-05,
"loss": 0.0809,
"step": 65
},
{
"epoch": 0.4520547945205479,
"grad_norm": 0.03459134201897623,
"learning_rate": 9.541343173854127e-05,
"loss": 0.0775,
"step": 66
},
{
"epoch": 0.4589041095890411,
"grad_norm": 0.028846468217507328,
"learning_rate": 9.515933275065219e-05,
"loss": 0.0683,
"step": 67
},
{
"epoch": 0.4657534246575342,
"grad_norm": 0.02841488216076331,
"learning_rate": 9.48987408513794e-05,
"loss": 0.0682,
"step": 68
},
{
"epoch": 0.4726027397260274,
"grad_norm": 0.038470702709475786,
"learning_rate": 9.463169350806369e-05,
"loss": 0.1041,
"step": 69
},
{
"epoch": 0.4794520547945205,
"grad_norm": 0.05495636003850682,
"learning_rate": 9.435822911619564e-05,
"loss": 0.097,
"step": 70
},
{
"epoch": 0.4863013698630137,
"grad_norm": 0.03142706228563173,
"learning_rate": 9.407838699389524e-05,
"loss": 0.0844,
"step": 71
},
{
"epoch": 0.4931506849315068,
"grad_norm": 0.02769595486440407,
"learning_rate": 9.379220737625877e-05,
"loss": 0.0648,
"step": 72
},
{
"epoch": 0.5,
"grad_norm": 0.029889370715744733,
"learning_rate": 9.34997314095739e-05,
"loss": 0.0668,
"step": 73
},
{
"epoch": 0.5068493150684932,
"grad_norm": 0.036383935533411724,
"learning_rate": 9.320100114540382e-05,
"loss": 0.0676,
"step": 74
},
{
"epoch": 0.5136986301369864,
"grad_norm": 0.05144060226911856,
"learning_rate": 9.289605953454107e-05,
"loss": 0.0834,
"step": 75
},
{
"epoch": 0.5205479452054794,
"grad_norm": 0.029231837833789662,
"learning_rate": 9.258495042083221e-05,
"loss": 0.067,
"step": 76
},
{
"epoch": 0.5273972602739726,
"grad_norm": 0.030385743404608083,
"learning_rate": 9.22677185348741e-05,
"loss": 0.0809,
"step": 77
},
{
"epoch": 0.5342465753424658,
"grad_norm": 0.031195055928310497,
"learning_rate": 9.19444094875825e-05,
"loss": 0.0735,
"step": 78
},
{
"epoch": 0.541095890410959,
"grad_norm": 0.029330760873632865,
"learning_rate": 9.161506976363437e-05,
"loss": 0.0736,
"step": 79
},
{
"epoch": 0.547945205479452,
"grad_norm": 0.0332541385460945,
"learning_rate": 9.127974671478432e-05,
"loss": 0.0812,
"step": 80
},
{
"epoch": 0.5547945205479452,
"grad_norm": 0.03305866284735081,
"learning_rate": 9.093848855305649e-05,
"loss": 0.0822,
"step": 81
},
{
"epoch": 0.5616438356164384,
"grad_norm": 0.03432894975247522,
"learning_rate": 9.059134434381273e-05,
"loss": 0.0783,
"step": 82
},
{
"epoch": 0.5684931506849316,
"grad_norm": 0.039182983364365015,
"learning_rate": 9.023836399869814e-05,
"loss": 0.0815,
"step": 83
},
{
"epoch": 0.5753424657534246,
"grad_norm": 0.02912164254908111,
"learning_rate": 8.98795982684648e-05,
"loss": 0.0715,
"step": 84
},
{
"epoch": 0.5821917808219178,
"grad_norm": 0.029959446654787666,
"learning_rate": 8.951509873567499e-05,
"loss": 0.0754,
"step": 85
},
{
"epoch": 0.589041095890411,
"grad_norm": 0.02955747281592669,
"learning_rate": 8.914491780728471e-05,
"loss": 0.0683,
"step": 86
},
{
"epoch": 0.5958904109589042,
"grad_norm": 0.03207481946898706,
"learning_rate": 8.876910870710884e-05,
"loss": 0.0725,
"step": 87
},
{
"epoch": 0.6027397260273972,
"grad_norm": 0.024590915698935577,
"learning_rate": 8.838772546816856e-05,
"loss": 0.0569,
"step": 88
},
{
"epoch": 0.6095890410958904,
"grad_norm": 0.030750251732919592,
"learning_rate": 8.800082292492273e-05,
"loss": 0.0732,
"step": 89
},
{
"epoch": 0.6164383561643836,
"grad_norm": 0.032945297540624904,
"learning_rate": 8.760845670538387e-05,
"loss": 0.0787,
"step": 90
},
{
"epoch": 0.6232876712328768,
"grad_norm": 0.03778017209196359,
"learning_rate": 8.721068322312007e-05,
"loss": 0.0835,
"step": 91
},
{
"epoch": 0.6301369863013698,
"grad_norm": 0.03507952393071719,
"learning_rate": 8.680755966914401e-05,
"loss": 0.0785,
"step": 92
},
{
"epoch": 0.636986301369863,
"grad_norm": 0.03248195299567112,
"learning_rate": 8.639914400369009e-05,
"loss": 0.0769,
"step": 93
},
{
"epoch": 0.6438356164383562,
"grad_norm": 0.03731653007913649,
"learning_rate": 8.598549494788111e-05,
"loss": 0.0863,
"step": 94
},
{
"epoch": 0.6506849315068494,
"grad_norm": 0.03180106946934432,
"learning_rate": 8.556667197528543e-05,
"loss": 0.0601,
"step": 95
},
{
"epoch": 0.6575342465753424,
"grad_norm": 0.03375769439746087,
"learning_rate": 8.5142735303366e-05,
"loss": 0.0833,
"step": 96
},
{
"epoch": 0.6643835616438356,
"grad_norm": 0.03122748644136327,
"learning_rate": 8.47137458848224e-05,
"loss": 0.0653,
"step": 97
},
{
"epoch": 0.6712328767123288,
"grad_norm": 0.030566834172214723,
"learning_rate": 8.427976539882724e-05,
"loss": 0.0794,
"step": 98
},
{
"epoch": 0.678082191780822,
"grad_norm": 0.041027373975817134,
"learning_rate": 8.384085624215801e-05,
"loss": 0.094,
"step": 99
},
{
"epoch": 0.684931506849315,
"grad_norm": 0.02698160042089123,
"learning_rate": 8.339708152022585e-05,
"loss": 0.06,
"step": 100
},
{
"epoch": 0.6917808219178082,
"grad_norm": 0.026358548855895562,
"learning_rate": 8.294850503800238e-05,
"loss": 0.0586,
"step": 101
},
{
"epoch": 0.6986301369863014,
"grad_norm": 0.029315218243026478,
"learning_rate": 8.24951912908459e-05,
"loss": 0.0659,
"step": 102
},
{
"epoch": 0.7054794520547946,
"grad_norm": 0.03730243641514126,
"learning_rate": 8.203720545522853e-05,
"loss": 0.0768,
"step": 103
},
{
"epoch": 0.7123287671232876,
"grad_norm": 0.028421475360968528,
"learning_rate": 8.157461337936506e-05,
"loss": 0.0674,
"step": 104
},
{
"epoch": 0.7191780821917808,
"grad_norm": 0.036896447275521424,
"learning_rate": 8.110748157374565e-05,
"loss": 0.0813,
"step": 105
},
{
"epoch": 0.726027397260274,
"grad_norm": 0.03194136933101962,
"learning_rate": 8.063587720157298e-05,
"loss": 0.0661,
"step": 106
},
{
"epoch": 0.7328767123287672,
"grad_norm": 0.047606415740070754,
"learning_rate": 8.01598680691057e-05,
"loss": 0.0922,
"step": 107
},
{
"epoch": 0.7397260273972602,
"grad_norm": 0.03982021001322296,
"learning_rate": 7.967952261590935e-05,
"loss": 0.0677,
"step": 108
},
{
"epoch": 0.7465753424657534,
"grad_norm": 0.030844190311457055,
"learning_rate": 7.919490990501636e-05,
"loss": 0.0693,
"step": 109
},
{
"epoch": 0.7534246575342466,
"grad_norm": 0.03128080580199763,
"learning_rate": 7.870609961299627e-05,
"loss": 0.055,
"step": 110
},
{
"epoch": 0.7602739726027398,
"grad_norm": 0.03042131745977429,
"learning_rate": 7.821316201993767e-05,
"loss": 0.0696,
"step": 111
},
{
"epoch": 0.7671232876712328,
"grad_norm": 0.03086792896956372,
"learning_rate": 7.771616799934371e-05,
"loss": 0.0717,
"step": 112
},
{
"epoch": 0.773972602739726,
"grad_norm": 0.03186550616403287,
"learning_rate": 7.721518900794185e-05,
"loss": 0.0721,
"step": 113
},
{
"epoch": 0.7808219178082192,
"grad_norm": 0.029517902212737826,
"learning_rate": 7.67102970754101e-05,
"loss": 0.068,
"step": 114
},
{
"epoch": 0.7876712328767124,
"grad_norm": 0.03470530857876834,
"learning_rate": 7.620156479402066e-05,
"loss": 0.0687,
"step": 115
},
{
"epoch": 0.7945205479452054,
"grad_norm": 0.032497402299353856,
"learning_rate": 7.568906530820282e-05,
"loss": 0.0749,
"step": 116
},
{
"epoch": 0.8013698630136986,
"grad_norm": 0.03286496346308627,
"learning_rate": 7.517287230402639e-05,
"loss": 0.0769,
"step": 117
},
{
"epoch": 0.8082191780821918,
"grad_norm": 0.029927285174734974,
"learning_rate": 7.465305999860728e-05,
"loss": 0.0681,
"step": 118
},
{
"epoch": 0.815068493150685,
"grad_norm": 0.03427786976795219,
"learning_rate": 7.412970312943671e-05,
"loss": 0.0777,
"step": 119
},
{
"epoch": 0.821917808219178,
"grad_norm": 0.031851819549739036,
"learning_rate": 7.360287694363566e-05,
"loss": 0.0653,
"step": 120
},
{
"epoch": 0.8287671232876712,
"grad_norm": 0.03126182717380993,
"learning_rate": 7.30726571871359e-05,
"loss": 0.0741,
"step": 121
},
{
"epoch": 0.8356164383561644,
"grad_norm": 0.03232123010558018,
"learning_rate": 7.253912009378953e-05,
"loss": 0.0622,
"step": 122
},
{
"epoch": 0.8424657534246576,
"grad_norm": 0.031930228246767285,
"learning_rate": 7.200234237440815e-05,
"loss": 0.0711,
"step": 123
},
{
"epoch": 0.8493150684931506,
"grad_norm": 0.03869729803850467,
"learning_rate": 7.146240120573358e-05,
"loss": 0.0748,
"step": 124
},
{
"epoch": 0.8561643835616438,
"grad_norm": 0.04668355562414557,
"learning_rate": 7.091937421934157e-05,
"loss": 0.0719,
"step": 125
},
{
"epoch": 0.863013698630137,
"grad_norm": 0.045196133927728935,
"learning_rate": 7.037333949048005e-05,
"loss": 0.0801,
"step": 126
},
{
"epoch": 0.8698630136986302,
"grad_norm": 0.03140632320538256,
"learning_rate": 6.98243755268437e-05,
"loss": 0.0626,
"step": 127
},
{
"epoch": 0.8767123287671232,
"grad_norm": 0.052496609882496145,
"learning_rate": 6.927256125728624e-05,
"loss": 0.0777,
"step": 128
},
{
"epoch": 0.8835616438356164,
"grad_norm": 0.03868077359004108,
"learning_rate": 6.87179760204722e-05,
"loss": 0.0791,
"step": 129
},
{
"epoch": 0.8904109589041096,
"grad_norm": 0.0348172699895961,
"learning_rate": 6.816069955346985e-05,
"loss": 0.0743,
"step": 130
},
{
"epoch": 0.8972602739726028,
"grad_norm": 0.029728370506506084,
"learning_rate": 6.760081198028671e-05,
"loss": 0.062,
"step": 131
},
{
"epoch": 0.9041095890410958,
"grad_norm": 0.033304716743838475,
"learning_rate": 6.703839380034946e-05,
"loss": 0.0742,
"step": 132
},
{
"epoch": 0.910958904109589,
"grad_norm": 0.03402220621737934,
"learning_rate": 6.647352587693001e-05,
"loss": 0.074,
"step": 133
},
{
"epoch": 0.9178082191780822,
"grad_norm": 0.03029297413232278,
"learning_rate": 6.590628942551909e-05,
"loss": 0.0706,
"step": 134
},
{
"epoch": 0.9246575342465754,
"grad_norm": 0.03726620583459937,
"learning_rate": 6.533676600214928e-05,
"loss": 0.0711,
"step": 135
},
{
"epoch": 0.9315068493150684,
"grad_norm": 0.030791726162161644,
"learning_rate": 6.476503749166904e-05,
"loss": 0.0844,
"step": 136
},
{
"epoch": 0.9383561643835616,
"grad_norm": 0.03101593164912545,
"learning_rate": 6.419118609596948e-05,
"loss": 0.0743,
"step": 137
},
{
"epoch": 0.9452054794520548,
"grad_norm": 0.037483812819514725,
"learning_rate": 6.361529432216559e-05,
"loss": 0.0888,
"step": 138
},
{
"epoch": 0.952054794520548,
"grad_norm": 0.034654276627226706,
"learning_rate": 6.303744497073352e-05,
"loss": 0.0662,
"step": 139
},
{
"epoch": 0.958904109589041,
"grad_norm": 0.10049031991880263,
"learning_rate": 6.245772112360568e-05,
"loss": 0.0817,
"step": 140
},
{
"epoch": 0.9657534246575342,
"grad_norm": 0.03537251059587243,
"learning_rate": 6.187620613222544e-05,
"loss": 0.0768,
"step": 141
},
{
"epoch": 0.9726027397260274,
"grad_norm": 0.04616032239868132,
"learning_rate": 6.129298360556304e-05,
"loss": 0.0802,
"step": 142
},
{
"epoch": 0.9794520547945206,
"grad_norm": 0.03412880356569236,
"learning_rate": 6.070813739809442e-05,
"loss": 0.0574,
"step": 143
},
{
"epoch": 0.9863013698630136,
"grad_norm": 0.02909505969006464,
"learning_rate": 6.012175159774488e-05,
"loss": 0.0604,
"step": 144
},
{
"epoch": 0.9931506849315068,
"grad_norm": 0.03935910262269513,
"learning_rate": 5.953391051379904e-05,
"loss": 0.0584,
"step": 145
},
{
"epoch": 1.0,
"grad_norm": 0.03397983912672022,
"learning_rate": 5.894469866477905e-05,
"loss": 0.0577,
"step": 146
},
{
"epoch": 1.0068493150684932,
"grad_norm": 0.0311452791456704,
"learning_rate": 5.8354200766292734e-05,
"loss": 0.079,
"step": 147
},
{
"epoch": 1.0136986301369864,
"grad_norm": 0.033811357449414096,
"learning_rate": 5.776250171885329e-05,
"loss": 0.0656,
"step": 148
},
{
"epoch": 1.0205479452054795,
"grad_norm": 0.040640212988149904,
"learning_rate": 5.716968659567256e-05,
"loss": 0.0728,
"step": 149
},
{
"epoch": 1.0273972602739727,
"grad_norm": 0.03057953268248259,
"learning_rate": 5.6575840630429286e-05,
"loss": 0.0733,
"step": 150
},
{
"epoch": 1.0342465753424657,
"grad_norm": 0.032703562266556005,
"learning_rate": 5.5981049205014546e-05,
"loss": 0.0712,
"step": 151
},
{
"epoch": 1.0410958904109588,
"grad_norm": 0.03210664170957404,
"learning_rate": 5.5385397837255556e-05,
"loss": 0.0691,
"step": 152
},
{
"epoch": 1.047945205479452,
"grad_norm": 0.03471378475737469,
"learning_rate": 5.4788972168620255e-05,
"loss": 0.0757,
"step": 153
},
{
"epoch": 1.0547945205479452,
"grad_norm": 0.03165599022030969,
"learning_rate": 5.4191857951903826e-05,
"loss": 0.0597,
"step": 154
},
{
"epoch": 1.0616438356164384,
"grad_norm": 0.05451608731514093,
"learning_rate": 5.359414103889947e-05,
"loss": 0.0894,
"step": 155
},
{
"epoch": 1.0684931506849316,
"grad_norm": 0.04358798536487321,
"learning_rate": 5.29959073680547e-05,
"loss": 0.0782,
"step": 156
},
{
"epoch": 1.0753424657534247,
"grad_norm": 0.032277506655898194,
"learning_rate": 5.239724295211541e-05,
"loss": 0.0704,
"step": 157
},
{
"epoch": 1.0821917808219177,
"grad_norm": 0.028670588244485373,
"learning_rate": 5.179823386575907e-05,
"loss": 0.0507,
"step": 158
},
{
"epoch": 1.0890410958904109,
"grad_norm": 0.03036746716980511,
"learning_rate": 5.119896623321909e-05,
"loss": 0.0527,
"step": 159
},
{
"epoch": 1.095890410958904,
"grad_norm": 0.03329805529241328,
"learning_rate": 5.059952621590216e-05,
"loss": 0.0623,
"step": 160
},
{
"epoch": 1.1027397260273972,
"grad_norm": 0.037603460102786174,
"learning_rate": 5e-05,
"loss": 0.0695,
"step": 161
},
{
"epoch": 1.1095890410958904,
"grad_norm": 0.04120763652112312,
"learning_rate": 4.940047378409786e-05,
"loss": 0.0711,
"step": 162
},
{
"epoch": 1.1164383561643836,
"grad_norm": 0.0415019727955413,
"learning_rate": 4.880103376678092e-05,
"loss": 0.0638,
"step": 163
},
{
"epoch": 1.1232876712328768,
"grad_norm": 0.032850408356261955,
"learning_rate": 4.820176613424095e-05,
"loss": 0.0502,
"step": 164
},
{
"epoch": 1.13013698630137,
"grad_norm": 0.051682446059570064,
"learning_rate": 4.7602757047884595e-05,
"loss": 0.0658,
"step": 165
},
{
"epoch": 1.1369863013698631,
"grad_norm": 0.03126729469626143,
"learning_rate": 4.700409263194531e-05,
"loss": 0.0506,
"step": 166
},
{
"epoch": 1.143835616438356,
"grad_norm": 0.03940644676794397,
"learning_rate": 4.640585896110054e-05,
"loss": 0.0632,
"step": 167
},
{
"epoch": 1.1506849315068493,
"grad_norm": 0.04126874600149052,
"learning_rate": 4.580814204809618e-05,
"loss": 0.0813,
"step": 168
},
{
"epoch": 1.1575342465753424,
"grad_norm": 0.04156579483287442,
"learning_rate": 4.5211027831379757e-05,
"loss": 0.0652,
"step": 169
},
{
"epoch": 1.1643835616438356,
"grad_norm": 0.03391355375732456,
"learning_rate": 4.461460216274445e-05,
"loss": 0.057,
"step": 170
},
{
"epoch": 1.1712328767123288,
"grad_norm": 0.038632199419011234,
"learning_rate": 4.401895079498547e-05,
"loss": 0.0661,
"step": 171
},
{
"epoch": 1.178082191780822,
"grad_norm": 0.03708167445939923,
"learning_rate": 4.3424159369570725e-05,
"loss": 0.0624,
"step": 172
},
{
"epoch": 1.1849315068493151,
"grad_norm": 0.03753946698450887,
"learning_rate": 4.283031340432747e-05,
"loss": 0.0612,
"step": 173
},
{
"epoch": 1.191780821917808,
"grad_norm": 0.03537905440706239,
"learning_rate": 4.223749828114672e-05,
"loss": 0.0673,
"step": 174
},
{
"epoch": 1.1986301369863013,
"grad_norm": 0.03940327075341433,
"learning_rate": 4.1645799233707284e-05,
"loss": 0.0607,
"step": 175
},
{
"epoch": 1.2054794520547945,
"grad_norm": 0.041138452676237995,
"learning_rate": 4.1055301335220955e-05,
"loss": 0.0757,
"step": 176
},
{
"epoch": 1.2123287671232876,
"grad_norm": 0.03578703324326037,
"learning_rate": 4.0466089486200976e-05,
"loss": 0.0707,
"step": 177
},
{
"epoch": 1.2191780821917808,
"grad_norm": 0.039832365736742495,
"learning_rate": 3.987824840225512e-05,
"loss": 0.071,
"step": 178
},
{
"epoch": 1.226027397260274,
"grad_norm": 0.038723411367421474,
"learning_rate": 3.9291862601905595e-05,
"loss": 0.0728,
"step": 179
},
{
"epoch": 1.2328767123287672,
"grad_norm": 0.04498308725984581,
"learning_rate": 3.870701639443698e-05,
"loss": 0.0703,
"step": 180
},
{
"epoch": 1.2397260273972603,
"grad_norm": 0.031865141682931065,
"learning_rate": 3.812379386777457e-05,
"loss": 0.0523,
"step": 181
},
{
"epoch": 1.2465753424657535,
"grad_norm": 0.038277951507247616,
"learning_rate": 3.7542278876394336e-05,
"loss": 0.06,
"step": 182
},
{
"epoch": 1.2534246575342465,
"grad_norm": 0.0650327431598633,
"learning_rate": 3.696255502926649e-05,
"loss": 0.0586,
"step": 183
},
{
"epoch": 1.2602739726027397,
"grad_norm": 0.041491341553849696,
"learning_rate": 3.638470567783442e-05,
"loss": 0.0741,
"step": 184
},
{
"epoch": 1.2671232876712328,
"grad_norm": 0.03898025654685055,
"learning_rate": 3.580881390403052e-05,
"loss": 0.07,
"step": 185
},
{
"epoch": 1.273972602739726,
"grad_norm": 0.03741545061342083,
"learning_rate": 3.5234962508330974e-05,
"loss": 0.0648,
"step": 186
},
{
"epoch": 1.2808219178082192,
"grad_norm": 0.03579074437335611,
"learning_rate": 3.466323399785072e-05,
"loss": 0.0519,
"step": 187
},
{
"epoch": 1.2876712328767124,
"grad_norm": 0.03622522476818763,
"learning_rate": 3.409371057448092e-05,
"loss": 0.0581,
"step": 188
},
{
"epoch": 1.2945205479452055,
"grad_norm": 0.05446559861941158,
"learning_rate": 3.352647412307002e-05,
"loss": 0.0714,
"step": 189
},
{
"epoch": 1.3013698630136985,
"grad_norm": 0.03832700455471586,
"learning_rate": 3.296160619965056e-05,
"loss": 0.055,
"step": 190
},
{
"epoch": 1.308219178082192,
"grad_norm": 0.03896050037792126,
"learning_rate": 3.239918801971332e-05,
"loss": 0.062,
"step": 191
},
{
"epoch": 1.3150684931506849,
"grad_norm": 0.03267778522774074,
"learning_rate": 3.183930044653014e-05,
"loss": 0.0546,
"step": 192
},
{
"epoch": 1.321917808219178,
"grad_norm": 0.04339482297121572,
"learning_rate": 3.1282023979527805e-05,
"loss": 0.0676,
"step": 193
},
{
"epoch": 1.3287671232876712,
"grad_norm": 0.04149269813272275,
"learning_rate": 3.072743874271376e-05,
"loss": 0.0576,
"step": 194
},
{
"epoch": 1.3356164383561644,
"grad_norm": 0.05669408827237824,
"learning_rate": 3.0175624473156316e-05,
"loss": 0.0559,
"step": 195
},
{
"epoch": 1.3424657534246576,
"grad_norm": 0.04316637345791664,
"learning_rate": 2.962666050951997e-05,
"loss": 0.067,
"step": 196
},
{
"epoch": 1.3493150684931507,
"grad_norm": 0.04179017639942075,
"learning_rate": 2.9080625780658455e-05,
"loss": 0.0585,
"step": 197
},
{
"epoch": 1.356164383561644,
"grad_norm": 0.04086829863785544,
"learning_rate": 2.853759879426644e-05,
"loss": 0.0637,
"step": 198
},
{
"epoch": 1.3630136986301369,
"grad_norm": 0.045447600350027706,
"learning_rate": 2.7997657625591867e-05,
"loss": 0.0578,
"step": 199
},
{
"epoch": 1.36986301369863,
"grad_norm": 0.04063023817685305,
"learning_rate": 2.7460879906210487e-05,
"loss": 0.0704,
"step": 200
},
{
"epoch": 1.36986301369863,
"eval_loss": 0.07383698970079422,
"eval_runtime": 6.4837,
"eval_samples_per_second": 0.925,
"eval_steps_per_second": 0.308,
"step": 200
},
{
"epoch": 1.3767123287671232,
"grad_norm": 0.043841411312286985,
"learning_rate": 2.6927342812864116e-05,
"loss": 0.0667,
"step": 201
},
{
"epoch": 1.3835616438356164,
"grad_norm": 0.04301017602851657,
"learning_rate": 2.6397123056364365e-05,
"loss": 0.0726,
"step": 202
},
{
"epoch": 1.3904109589041096,
"grad_norm": 0.042839496066386186,
"learning_rate": 2.5870296870563286e-05,
"loss": 0.0602,
"step": 203
},
{
"epoch": 1.3972602739726028,
"grad_norm": 0.03829564126132437,
"learning_rate": 2.5346940001392728e-05,
"loss": 0.0645,
"step": 204
},
{
"epoch": 1.404109589041096,
"grad_norm": 0.03507625071131985,
"learning_rate": 2.482712769597363e-05,
"loss": 0.0543,
"step": 205
},
{
"epoch": 1.410958904109589,
"grad_norm": 0.03743875489488463,
"learning_rate": 2.4310934691797203e-05,
"loss": 0.0602,
"step": 206
},
{
"epoch": 1.4178082191780823,
"grad_norm": 0.037325448164000144,
"learning_rate": 2.379843520597937e-05,
"loss": 0.056,
"step": 207
},
{
"epoch": 1.4246575342465753,
"grad_norm": 0.04519550162385289,
"learning_rate": 2.3289702924589914e-05,
"loss": 0.0823,
"step": 208
},
{
"epoch": 1.4315068493150684,
"grad_norm": 0.041415718432946946,
"learning_rate": 2.2784810992058154e-05,
"loss": 0.069,
"step": 209
},
{
"epoch": 1.4383561643835616,
"grad_norm": 0.04615134339463282,
"learning_rate": 2.22838320006563e-05,
"loss": 0.0709,
"step": 210
},
{
"epoch": 1.4452054794520548,
"grad_norm": 0.03508689289745425,
"learning_rate": 2.1786837980062342e-05,
"loss": 0.0562,
"step": 211
},
{
"epoch": 1.452054794520548,
"grad_norm": 0.042082412191430475,
"learning_rate": 2.129390038700374e-05,
"loss": 0.0711,
"step": 212
},
{
"epoch": 1.4589041095890412,
"grad_norm": 0.04921460988289041,
"learning_rate": 2.0805090094983636e-05,
"loss": 0.066,
"step": 213
},
{
"epoch": 1.4657534246575343,
"grad_norm": 0.03476195821739949,
"learning_rate": 2.0320477384090665e-05,
"loss": 0.0516,
"step": 214
},
{
"epoch": 1.4726027397260273,
"grad_norm": 0.03974475045265387,
"learning_rate": 1.9840131930894333e-05,
"loss": 0.0507,
"step": 215
},
{
"epoch": 1.4794520547945205,
"grad_norm": 0.04495458260995571,
"learning_rate": 1.936412279842705e-05,
"loss": 0.071,
"step": 216
},
{
"epoch": 1.4863013698630136,
"grad_norm": 0.032618789951219244,
"learning_rate": 1.8892518426254364e-05,
"loss": 0.0487,
"step": 217
},
{
"epoch": 1.4931506849315068,
"grad_norm": 0.05233338159738159,
"learning_rate": 1.842538662063496e-05,
"loss": 0.0779,
"step": 218
},
{
"epoch": 1.5,
"grad_norm": 0.043093246770756745,
"learning_rate": 1.7962794544771477e-05,
"loss": 0.0731,
"step": 219
},
{
"epoch": 1.5068493150684932,
"grad_norm": 0.04778965621357637,
"learning_rate": 1.7504808709154104e-05,
"loss": 0.0767,
"step": 220
},
{
"epoch": 1.5136986301369864,
"grad_norm": 0.04225455780801827,
"learning_rate": 1.705149496199762e-05,
"loss": 0.065,
"step": 221
},
{
"epoch": 1.5205479452054793,
"grad_norm": 0.03777201213330916,
"learning_rate": 1.6602918479774148e-05,
"loss": 0.0587,
"step": 222
},
{
"epoch": 1.5273972602739727,
"grad_norm": 0.050670841355552755,
"learning_rate": 1.6159143757842004e-05,
"loss": 0.0694,
"step": 223
},
{
"epoch": 1.5342465753424657,
"grad_norm": 0.04169608777354125,
"learning_rate": 1.5720234601172766e-05,
"loss": 0.0622,
"step": 224
},
{
"epoch": 1.541095890410959,
"grad_norm": 0.0399325959403793,
"learning_rate": 1.5286254115177623e-05,
"loss": 0.0559,
"step": 225
},
{
"epoch": 1.547945205479452,
"grad_norm": 0.04347771997206968,
"learning_rate": 1.485726469663401e-05,
"loss": 0.0625,
"step": 226
},
{
"epoch": 1.5547945205479452,
"grad_norm": 0.04329191462577287,
"learning_rate": 1.4433328024714581e-05,
"loss": 0.0682,
"step": 227
},
{
"epoch": 1.5616438356164384,
"grad_norm": 0.045935945373833685,
"learning_rate": 1.4014505052118892e-05,
"loss": 0.063,
"step": 228
},
{
"epoch": 1.5684931506849316,
"grad_norm": 0.03953171472704261,
"learning_rate": 1.3600855996309935e-05,
"loss": 0.0621,
"step": 229
},
{
"epoch": 1.5753424657534247,
"grad_norm": 0.042842743444326405,
"learning_rate": 1.3192440330856004e-05,
"loss": 0.0584,
"step": 230
},
{
"epoch": 1.5821917808219177,
"grad_norm": 0.04147136350311881,
"learning_rate": 1.2789316776879939e-05,
"loss": 0.0571,
"step": 231
},
{
"epoch": 1.589041095890411,
"grad_norm": 0.041096032631508496,
"learning_rate": 1.2391543294616147e-05,
"loss": 0.0569,
"step": 232
},
{
"epoch": 1.595890410958904,
"grad_norm": 0.04826360135309353,
"learning_rate": 1.1999177075077278e-05,
"loss": 0.0749,
"step": 233
},
{
"epoch": 1.6027397260273972,
"grad_norm": 0.034935721211031806,
"learning_rate": 1.1612274531831463e-05,
"loss": 0.0514,
"step": 234
},
{
"epoch": 1.6095890410958904,
"grad_norm": 0.04282701283898458,
"learning_rate": 1.123089129289117e-05,
"loss": 0.0671,
"step": 235
},
{
"epoch": 1.6164383561643836,
"grad_norm": 0.05961503389381486,
"learning_rate": 1.0855082192715294e-05,
"loss": 0.0665,
"step": 236
},
{
"epoch": 1.6232876712328768,
"grad_norm": 0.06432557444245549,
"learning_rate": 1.0484901264325025e-05,
"loss": 0.0755,
"step": 237
},
{
"epoch": 1.6301369863013697,
"grad_norm": 0.04253716073284135,
"learning_rate": 1.0120401731535212e-05,
"loss": 0.0733,
"step": 238
},
{
"epoch": 1.6369863013698631,
"grad_norm": 0.0774475760401046,
"learning_rate": 9.761636001301871e-06,
"loss": 0.065,
"step": 239
},
{
"epoch": 1.643835616438356,
"grad_norm": 0.03996033149580056,
"learning_rate": 9.408655656187282e-06,
"loss": 0.065,
"step": 240
},
{
"epoch": 1.6506849315068495,
"grad_norm": 0.03785205141208416,
"learning_rate": 9.061511446943533e-06,
"loss": 0.0548,
"step": 241
},
{
"epoch": 1.6575342465753424,
"grad_norm": 0.03850363195978434,
"learning_rate": 8.720253285215685e-06,
"loss": 0.0587,
"step": 242
},
{
"epoch": 1.6643835616438356,
"grad_norm": 0.0469269115044975,
"learning_rate": 8.384930236365629e-06,
"loss": 0.0634,
"step": 243
},
{
"epoch": 1.6712328767123288,
"grad_norm": 0.0498361427852067,
"learning_rate": 8.0555905124175e-06,
"loss": 0.051,
"step": 244
},
{
"epoch": 1.678082191780822,
"grad_norm": 0.04702037163211539,
"learning_rate": 7.732281465125907e-06,
"loss": 0.0677,
"step": 245
},
{
"epoch": 1.6849315068493151,
"grad_norm": 0.04724045679820981,
"learning_rate": 7.415049579167782e-06,
"loss": 0.0731,
"step": 246
},
{
"epoch": 1.691780821917808,
"grad_norm": 0.04960546509274232,
"learning_rate": 7.103940465458936e-06,
"loss": 0.0777,
"step": 247
},
{
"epoch": 1.6986301369863015,
"grad_norm": 0.04749987160293405,
"learning_rate": 6.798998854596189e-06,
"loss": 0.0531,
"step": 248
},
{
"epoch": 1.7054794520547945,
"grad_norm": 0.04656027534681338,
"learning_rate": 6.500268590426106e-06,
"loss": 0.0547,
"step": 249
},
{
"epoch": 1.7123287671232876,
"grad_norm": 0.043116663189771226,
"learning_rate": 6.207792623741249e-06,
"loss": 0.0677,
"step": 250
},
{
"epoch": 1.7191780821917808,
"grad_norm": 0.04714594190332252,
"learning_rate": 5.9216130061047646e-06,
"loss": 0.068,
"step": 251
},
{
"epoch": 1.726027397260274,
"grad_norm": 0.0442482453639658,
"learning_rate": 5.641770883804365e-06,
"loss": 0.0602,
"step": 252
},
{
"epoch": 1.7328767123287672,
"grad_norm": 0.044456632129314805,
"learning_rate": 5.368306491936325e-06,
"loss": 0.0661,
"step": 253
},
{
"epoch": 1.7397260273972601,
"grad_norm": 0.04864641601498424,
"learning_rate": 5.101259148620619e-06,
"loss": 0.0676,
"step": 254
},
{
"epoch": 1.7465753424657535,
"grad_norm": 0.043784015189862524,
"learning_rate": 4.840667249347824e-06,
"loss": 0.0669,
"step": 255
},
{
"epoch": 1.7534246575342465,
"grad_norm": 0.043216466288645235,
"learning_rate": 4.586568261458729e-06,
"loss": 0.0644,
"step": 256
},
{
"epoch": 1.7602739726027399,
"grad_norm": 0.042340118838538034,
"learning_rate": 4.3389987187573145e-06,
"loss": 0.053,
"step": 257
},
{
"epoch": 1.7671232876712328,
"grad_norm": 0.03965234038836836,
"learning_rate": 4.097994216258039e-06,
"loss": 0.0528,
"step": 258
},
{
"epoch": 1.773972602739726,
"grad_norm": 0.04330912545003313,
"learning_rate": 3.8635894050680466e-06,
"loss": 0.062,
"step": 259
},
{
"epoch": 1.7808219178082192,
"grad_norm": 0.04245993591175113,
"learning_rate": 3.63581798740511e-06,
"loss": 0.0682,
"step": 260
},
{
"epoch": 1.7876712328767124,
"grad_norm": 0.046454610509872485,
"learning_rate": 3.4147127117520104e-06,
"loss": 0.0773,
"step": 261
},
{
"epoch": 1.7945205479452055,
"grad_norm": 0.061061675638647096,
"learning_rate": 3.2003053681480098e-06,
"loss": 0.0628,
"step": 262
},
{
"epoch": 1.8013698630136985,
"grad_norm": 0.04251403620622773,
"learning_rate": 2.992626783618152e-06,
"loss": 0.0506,
"step": 263
},
{
"epoch": 1.808219178082192,
"grad_norm": 0.04783592253659306,
"learning_rate": 2.791706817741041e-06,
"loss": 0.0737,
"step": 264
},
{
"epoch": 1.8150684931506849,
"grad_norm": 0.04359969004920878,
"learning_rate": 2.59757435835567e-06,
"loss": 0.0599,
"step": 265
},
{
"epoch": 1.821917808219178,
"grad_norm": 0.044487631888004406,
"learning_rate": 2.41025731740801e-06,
"loss": 0.0692,
"step": 266
},
{
"epoch": 1.8287671232876712,
"grad_norm": 0.04510155943713286,
"learning_rate": 2.229782626937865e-06,
"loss": 0.0633,
"step": 267
},
{
"epoch": 1.8356164383561644,
"grad_norm": 0.04035438478741115,
"learning_rate": 2.056176235206664e-06,
"loss": 0.0601,
"step": 268
},
{
"epoch": 1.8424657534246576,
"grad_norm": 0.046645570169377086,
"learning_rate": 1.889463102966671e-06,
"loss": 0.0671,
"step": 269
},
{
"epoch": 1.8493150684931505,
"grad_norm": 0.045925921341422184,
"learning_rate": 1.729667199872187e-06,
"loss": 0.0796,
"step": 270
},
{
"epoch": 1.856164383561644,
"grad_norm": 0.047745127307766096,
"learning_rate": 1.5768115010332208e-06,
"loss": 0.0681,
"step": 271
},
{
"epoch": 1.8630136986301369,
"grad_norm": 0.04828726659252653,
"learning_rate": 1.4309179837122044e-06,
"loss": 0.0637,
"step": 272
},
{
"epoch": 1.8698630136986303,
"grad_norm": 0.03831444871977773,
"learning_rate": 1.2920076241641376e-06,
"loss": 0.0537,
"step": 273
},
{
"epoch": 1.8767123287671232,
"grad_norm": 0.043143594829470666,
"learning_rate": 1.1601003946206724e-06,
"loss": 0.0645,
"step": 274
},
{
"epoch": 1.8835616438356164,
"grad_norm": 0.04108818302281154,
"learning_rate": 1.0352152604185428e-06,
"loss": 0.0596,
"step": 275
},
{
"epoch": 1.8904109589041096,
"grad_norm": 0.04419177931878111,
"learning_rate": 9.17370177272775e-07,
"loss": 0.0681,
"step": 276
},
{
"epoch": 1.8972602739726028,
"grad_norm": 0.04544816624943878,
"learning_rate": 8.065820886950404e-07,
"loss": 0.0665,
"step": 277
},
{
"epoch": 1.904109589041096,
"grad_norm": 0.044560046311441934,
"learning_rate": 7.028669235575714e-07,
"loss": 0.0665,
"step": 278
},
{
"epoch": 1.910958904109589,
"grad_norm": 0.046336508445476945,
"learning_rate": 6.062395938029485e-07,
"loss": 0.0715,
"step": 279
},
{
"epoch": 1.9178082191780823,
"grad_norm": 0.046783520939667735,
"learning_rate": 5.167139923000553e-07,
"loss": 0.0671,
"step": 280
},
{
"epoch": 1.9246575342465753,
"grad_norm": 0.045134549211076444,
"learning_rate": 4.343029908466301e-07,
"loss": 0.0651,
"step": 281
},
{
"epoch": 1.9315068493150684,
"grad_norm": 0.04620102733800356,
"learning_rate": 3.5901843831857576e-07,
"loss": 0.0611,
"step": 282
},
{
"epoch": 1.9383561643835616,
"grad_norm": 0.04290459317190587,
"learning_rate": 2.908711589663549e-07,
"loss": 0.0688,
"step": 283
},
{
"epoch": 1.9452054794520548,
"grad_norm": 0.0459065259141636,
"learning_rate": 2.2987095085867937e-07,
"loss": 0.0627,
"step": 284
},
{
"epoch": 1.952054794520548,
"grad_norm": 0.062357302229075894,
"learning_rate": 1.760265844738096e-07,
"loss": 0.0792,
"step": 285
},
{
"epoch": 1.958904109589041,
"grad_norm": 0.044164802827650715,
"learning_rate": 1.2934580143851295e-07,
"loss": 0.0601,
"step": 286
},
{
"epoch": 1.9657534246575343,
"grad_norm": 0.03615207612143871,
"learning_rate": 8.983531341500983e-08,
"loss": 0.0512,
"step": 287
},
{
"epoch": 1.9726027397260273,
"grad_norm": 0.04362072520783342,
"learning_rate": 5.750080113598455e-08,
"loss": 0.0616,
"step": 288
},
{
"epoch": 1.9794520547945207,
"grad_norm": 0.04831491776296162,
"learning_rate": 3.2346913587816275e-08,
"loss": 0.0731,
"step": 289
},
{
"epoch": 1.9863013698630136,
"grad_norm": 0.050700245606893124,
"learning_rate": 1.4377267342158274e-08,
"loss": 0.0733,
"step": 290
},
{
"epoch": 1.9931506849315068,
"grad_norm": 0.04317508790841958,
"learning_rate": 3.594446035964927e-09,
"loss": 0.054,
"step": 291
},
{
"epoch": 2.0,
"grad_norm": 0.05236597882946408,
"learning_rate": 0.0,
"loss": 0.0588,
"step": 292
},
{
"epoch": 2.0,
"step": 292,
"total_flos": 739456868941824.0,
"train_loss": 0.07426239744032899,
"train_runtime": 2224.6089,
"train_samples_per_second": 0.523,
"train_steps_per_second": 0.131
}
],
"logging_steps": 1,
"max_steps": 292,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 739456868941824.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}