Llama3.1-Mamba-8B-dpo / trainer_state.json
Junxiong Wang
add models
5009c6e
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 5000,
"global_step": 4168,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002399232245681382,
"grad_norm": 3.9086405364003305,
"learning_rate": 1.199040767386091e-09,
"logits/chosen": -0.9392852187156677,
"logits/rejected": -0.9925774335861206,
"logps/chosen": -164.85171508789062,
"logps/rejected": -169.34266662597656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0023992322456813818,
"grad_norm": 4.318184225673836,
"learning_rate": 1.199040767386091e-08,
"logits/chosen": -0.8653285503387451,
"logits/rejected": -1.0646977424621582,
"logps/chosen": -367.5494384765625,
"logps/rejected": -308.0057067871094,
"loss": 0.6931,
"rewards/accuracies": 0.3611111044883728,
"rewards/chosen": 0.00055171106941998,
"rewards/margins": 0.00021127487707417458,
"rewards/rejected": 0.0003404362651053816,
"step": 10
},
{
"epoch": 0.0047984644913627635,
"grad_norm": 4.384399942785772,
"learning_rate": 2.398081534772182e-08,
"logits/chosen": -0.9145099520683289,
"logits/rejected": -0.9615824818611145,
"logps/chosen": -254.70645141601562,
"logps/rejected": -225.65023803710938,
"loss": 0.6933,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0004928931593894958,
"rewards/margins": 0.0004294395330362022,
"rewards/rejected": 6.345368456095457e-05,
"step": 20
},
{
"epoch": 0.007197696737044146,
"grad_norm": 4.1919489271249395,
"learning_rate": 3.597122302158273e-08,
"logits/chosen": -1.0393908023834229,
"logits/rejected": -1.1211938858032227,
"logps/chosen": -247.6179962158203,
"logps/rejected": -250.74832153320312,
"loss": 0.6931,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.0005728682735934854,
"rewards/margins": -0.0005012283218093216,
"rewards/rejected": -7.164000999182463e-05,
"step": 30
},
{
"epoch": 0.009596928982725527,
"grad_norm": 4.043349234918003,
"learning_rate": 4.796163069544364e-08,
"logits/chosen": -1.0382745265960693,
"logits/rejected": -1.1404989957809448,
"logps/chosen": -246.5960693359375,
"logps/rejected": -238.99038696289062,
"loss": 0.6933,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0003935880376957357,
"rewards/margins": 0.0007454471779055893,
"rewards/rejected": -0.001139035215601325,
"step": 40
},
{
"epoch": 0.01199616122840691,
"grad_norm": 4.337621377747828,
"learning_rate": 5.995203836930455e-08,
"logits/chosen": -0.9566876292228699,
"logits/rejected": -1.0265729427337646,
"logps/chosen": -273.5587463378906,
"logps/rejected": -238.2271728515625,
"loss": 0.6931,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.0002776079345494509,
"rewards/margins": -0.0013704797020182014,
"rewards/rejected": 0.0010928716510534286,
"step": 50
},
{
"epoch": 0.014395393474088292,
"grad_norm": 4.332693802131573,
"learning_rate": 7.194244604316546e-08,
"logits/chosen": -1.14139723777771,
"logits/rejected": -1.063253402709961,
"logps/chosen": -291.4471130371094,
"logps/rejected": -265.26800537109375,
"loss": 0.6931,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.0003930005768779665,
"rewards/margins": 0.00029442697996273637,
"rewards/rejected": 9.857374243438244e-05,
"step": 60
},
{
"epoch": 0.016794625719769675,
"grad_norm": 3.9392376744722797,
"learning_rate": 8.393285371702638e-08,
"logits/chosen": -0.7830671072006226,
"logits/rejected": -0.8284071087837219,
"logps/chosen": -280.4967346191406,
"logps/rejected": -269.8634033203125,
"loss": 0.6934,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00010640527762006968,
"rewards/margins": -0.00039604370249435306,
"rewards/rejected": 0.00028963852673768997,
"step": 70
},
{
"epoch": 0.019193857965451054,
"grad_norm": 4.275834970816185,
"learning_rate": 9.592326139088728e-08,
"logits/chosen": -1.1247626543045044,
"logits/rejected": -0.8464676141738892,
"logps/chosen": -203.01101684570312,
"logps/rejected": -241.64547729492188,
"loss": 0.6931,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.00033823197009041905,
"rewards/margins": 0.0008404625696130097,
"rewards/rejected": -0.0005022305413149297,
"step": 80
},
{
"epoch": 0.021593090211132437,
"grad_norm": 4.009980090025205,
"learning_rate": 1.0791366906474819e-07,
"logits/chosen": -1.128251552581787,
"logits/rejected": -1.1966060400009155,
"logps/chosen": -348.4684143066406,
"logps/rejected": -300.92156982421875,
"loss": 0.6931,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.00025963489315472543,
"rewards/margins": 0.0006502953474409878,
"rewards/rejected": -0.0003906603087671101,
"step": 90
},
{
"epoch": 0.02399232245681382,
"grad_norm": 4.278362574458803,
"learning_rate": 1.199040767386091e-07,
"logits/chosen": -0.8752719759941101,
"logits/rejected": -0.7615184783935547,
"logps/chosen": -262.26171875,
"logps/rejected": -279.4682312011719,
"loss": 0.6929,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.0001237258838955313,
"rewards/margins": -0.00013651838526129723,
"rewards/rejected": 1.2792646884918213e-05,
"step": 100
},
{
"epoch": 0.026391554702495202,
"grad_norm": 3.7292949735641874,
"learning_rate": 1.3189448441247004e-07,
"logits/chosen": -1.054966688156128,
"logits/rejected": -1.089815616607666,
"logps/chosen": -232.7165069580078,
"logps/rejected": -230.30648803710938,
"loss": 0.6926,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.0005149660282768309,
"rewards/margins": 0.0010742491576820612,
"rewards/rejected": -0.0015892151277512312,
"step": 110
},
{
"epoch": 0.028790786948176585,
"grad_norm": 4.185972544798998,
"learning_rate": 1.4388489208633092e-07,
"logits/chosen": -0.9251031875610352,
"logits/rejected": -1.0560011863708496,
"logps/chosen": -302.79620361328125,
"logps/rejected": -279.6351013183594,
"loss": 0.6928,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.0033786073327064514,
"rewards/margins": -0.0015979796880856156,
"rewards/rejected": -0.0017806284595280886,
"step": 120
},
{
"epoch": 0.031190019193857964,
"grad_norm": 3.7577616139381282,
"learning_rate": 1.5587529976019183e-07,
"logits/chosen": -1.1069813966751099,
"logits/rejected": -1.0163028240203857,
"logps/chosen": -225.87887573242188,
"logps/rejected": -308.16943359375,
"loss": 0.6922,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0021240042988210917,
"rewards/margins": 0.0019912621937692165,
"rewards/rejected": -0.004115266725420952,
"step": 130
},
{
"epoch": 0.03358925143953935,
"grad_norm": 3.9602176902490616,
"learning_rate": 1.6786570743405277e-07,
"logits/chosen": -0.8096126317977905,
"logits/rejected": -0.844383716583252,
"logps/chosen": -278.711181640625,
"logps/rejected": -270.23455810546875,
"loss": 0.6913,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.0016858462477102876,
"rewards/margins": 0.004555505700409412,
"rewards/rejected": -0.0062413522973656654,
"step": 140
},
{
"epoch": 0.03598848368522073,
"grad_norm": 4.198772547754269,
"learning_rate": 1.7985611510791365e-07,
"logits/chosen": -1.0384037494659424,
"logits/rejected": -1.0555726289749146,
"logps/chosen": -231.3898468017578,
"logps/rejected": -225.4952392578125,
"loss": 0.6919,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0014209033688530326,
"rewards/margins": 0.00509048905223608,
"rewards/rejected": -0.006511392537504435,
"step": 150
},
{
"epoch": 0.03838771593090211,
"grad_norm": 4.167240538791073,
"learning_rate": 1.9184652278177456e-07,
"logits/chosen": -0.8518667221069336,
"logits/rejected": -0.9568248987197876,
"logps/chosen": -296.21734619140625,
"logps/rejected": -231.2320098876953,
"loss": 0.6906,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.004124562256038189,
"rewards/margins": 0.004930226132273674,
"rewards/rejected": -0.009054789319634438,
"step": 160
},
{
"epoch": 0.040786948176583494,
"grad_norm": 3.692310549028015,
"learning_rate": 2.038369304556355e-07,
"logits/chosen": -0.8354592323303223,
"logits/rejected": -0.8758047819137573,
"logps/chosen": -342.7477111816406,
"logps/rejected": -333.38189697265625,
"loss": 0.6903,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.004680985119193792,
"rewards/margins": 0.004498300142586231,
"rewards/rejected": -0.009179284796118736,
"step": 170
},
{
"epoch": 0.04318618042226487,
"grad_norm": 4.32405896978214,
"learning_rate": 2.1582733812949638e-07,
"logits/chosen": -1.1283237934112549,
"logits/rejected": -1.1168252229690552,
"logps/chosen": -238.8912353515625,
"logps/rejected": -229.00265502929688,
"loss": 0.6905,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.004386520944535732,
"rewards/margins": 0.007528006099164486,
"rewards/rejected": -0.011914527975022793,
"step": 180
},
{
"epoch": 0.04558541266794626,
"grad_norm": 4.666604305995101,
"learning_rate": 2.278177458033573e-07,
"logits/chosen": -0.9106446504592896,
"logits/rejected": -0.9879466891288757,
"logps/chosen": -306.4612121582031,
"logps/rejected": -249.0087890625,
"loss": 0.6894,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.006806717719882727,
"rewards/margins": 0.007770798169076443,
"rewards/rejected": -0.014577515423297882,
"step": 190
},
{
"epoch": 0.04798464491362764,
"grad_norm": 3.9918872126977574,
"learning_rate": 2.398081534772182e-07,
"logits/chosen": -0.9901522397994995,
"logits/rejected": -0.928848385810852,
"logps/chosen": -313.17681884765625,
"logps/rejected": -297.6922302246094,
"loss": 0.6885,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.010936335660517216,
"rewards/margins": 0.006564898882061243,
"rewards/rejected": -0.017501235008239746,
"step": 200
},
{
"epoch": 0.05038387715930902,
"grad_norm": 4.040620494299144,
"learning_rate": 2.517985611510791e-07,
"logits/chosen": -0.9027220606803894,
"logits/rejected": -0.922700047492981,
"logps/chosen": -230.9945831298828,
"logps/rejected": -255.6648712158203,
"loss": 0.6876,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.010718774050474167,
"rewards/margins": 0.009615534916520119,
"rewards/rejected": -0.020334308966994286,
"step": 210
},
{
"epoch": 0.052783109404990404,
"grad_norm": 4.002893355744946,
"learning_rate": 2.637889688249401e-07,
"logits/chosen": -0.8777509927749634,
"logits/rejected": -0.9541767239570618,
"logps/chosen": -312.22064208984375,
"logps/rejected": -314.44476318359375,
"loss": 0.6874,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.018286144360899925,
"rewards/margins": 0.010097065940499306,
"rewards/rejected": -0.02838321030139923,
"step": 220
},
{
"epoch": 0.05518234165067178,
"grad_norm": 4.397529514704007,
"learning_rate": 2.7577937649880093e-07,
"logits/chosen": -0.8843205571174622,
"logits/rejected": -0.7930720448493958,
"logps/chosen": -240.90969848632812,
"logps/rejected": -279.2537841796875,
"loss": 0.684,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.013729272410273552,
"rewards/margins": 0.02465725876390934,
"rewards/rejected": -0.03838653117418289,
"step": 230
},
{
"epoch": 0.05758157389635317,
"grad_norm": 4.676021615108152,
"learning_rate": 2.8776978417266184e-07,
"logits/chosen": -1.0245535373687744,
"logits/rejected": -1.0780936479568481,
"logps/chosen": -303.7603454589844,
"logps/rejected": -259.3138732910156,
"loss": 0.6808,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.010991424322128296,
"rewards/margins": 0.02684735879302025,
"rewards/rejected": -0.037838783115148544,
"step": 240
},
{
"epoch": 0.05998080614203455,
"grad_norm": 4.463678598202064,
"learning_rate": 2.997601918465228e-07,
"logits/chosen": -0.9612535238265991,
"logits/rejected": -1.0222301483154297,
"logps/chosen": -241.61404418945312,
"logps/rejected": -236.07644653320312,
"loss": 0.68,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.021920276805758476,
"rewards/margins": 0.018481746315956116,
"rewards/rejected": -0.04040202870965004,
"step": 250
},
{
"epoch": 0.06238003838771593,
"grad_norm": 4.115268408123004,
"learning_rate": 3.1175059952038366e-07,
"logits/chosen": -1.0142881870269775,
"logits/rejected": -0.8710586428642273,
"logps/chosen": -263.0195617675781,
"logps/rejected": -259.115478515625,
"loss": 0.6769,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.021990353241562843,
"rewards/margins": 0.03786135092377663,
"rewards/rejected": -0.05985169857740402,
"step": 260
},
{
"epoch": 0.0647792706333973,
"grad_norm": 4.201440552672297,
"learning_rate": 3.2374100719424457e-07,
"logits/chosen": -0.9422982931137085,
"logits/rejected": -1.1441442966461182,
"logps/chosen": -290.52044677734375,
"logps/rejected": -235.48049926757812,
"loss": 0.6748,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.028027933090925217,
"rewards/margins": 0.016811534762382507,
"rewards/rejected": -0.044839464128017426,
"step": 270
},
{
"epoch": 0.0671785028790787,
"grad_norm": 4.458570754919466,
"learning_rate": 3.3573141486810554e-07,
"logits/chosen": -1.0597388744354248,
"logits/rejected": -1.0107687711715698,
"logps/chosen": -299.5600891113281,
"logps/rejected": -287.019287109375,
"loss": 0.6676,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.03363611549139023,
"rewards/margins": 0.046159304678440094,
"rewards/rejected": -0.07979541271924973,
"step": 280
},
{
"epoch": 0.06957773512476008,
"grad_norm": 3.937791865544782,
"learning_rate": 3.477218225419664e-07,
"logits/chosen": -0.9224345088005066,
"logits/rejected": -0.8189966082572937,
"logps/chosen": -291.11492919921875,
"logps/rejected": -267.08172607421875,
"loss": 0.6674,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.04258224740624428,
"rewards/margins": 0.05275702476501465,
"rewards/rejected": -0.09533928334712982,
"step": 290
},
{
"epoch": 0.07197696737044146,
"grad_norm": 4.881987417549259,
"learning_rate": 3.597122302158273e-07,
"logits/chosen": -1.013051152229309,
"logits/rejected": -1.0528075695037842,
"logps/chosen": -260.43798828125,
"logps/rejected": -280.9106750488281,
"loss": 0.6638,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.061515528708696365,
"rewards/margins": 0.059713393449783325,
"rewards/rejected": -0.12122891843318939,
"step": 300
},
{
"epoch": 0.07437619961612284,
"grad_norm": 3.887121301077397,
"learning_rate": 3.7170263788968827e-07,
"logits/chosen": -0.8997815847396851,
"logits/rejected": -0.9898494482040405,
"logps/chosen": -277.71112060546875,
"logps/rejected": -239.8883514404297,
"loss": 0.6659,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.058370210230350494,
"rewards/margins": 0.08987968415021896,
"rewards/rejected": -0.14824989438056946,
"step": 310
},
{
"epoch": 0.07677543186180422,
"grad_norm": 3.767825032617774,
"learning_rate": 3.836930455635491e-07,
"logits/chosen": -1.0142638683319092,
"logits/rejected": -1.0835198163986206,
"logps/chosen": -283.21722412109375,
"logps/rejected": -256.9454040527344,
"loss": 0.6599,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.0824267640709877,
"rewards/margins": 0.06453671306371689,
"rewards/rejected": -0.1469634771347046,
"step": 320
},
{
"epoch": 0.07917466410748561,
"grad_norm": 4.101417109334993,
"learning_rate": 3.9568345323741003e-07,
"logits/chosen": -0.8915877342224121,
"logits/rejected": -0.7875005006790161,
"logps/chosen": -256.7735595703125,
"logps/rejected": -309.68743896484375,
"loss": 0.648,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.13278505206108093,
"rewards/margins": 0.12722721695899963,
"rewards/rejected": -0.26001226902008057,
"step": 330
},
{
"epoch": 0.08157389635316699,
"grad_norm": 4.184212161663267,
"learning_rate": 4.07673860911271e-07,
"logits/chosen": -0.8056583404541016,
"logits/rejected": -0.8734966516494751,
"logps/chosen": -257.9537353515625,
"logps/rejected": -290.716552734375,
"loss": 0.6385,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.15207326412200928,
"rewards/margins": 0.15626882016658783,
"rewards/rejected": -0.3083421289920807,
"step": 340
},
{
"epoch": 0.08397312859884837,
"grad_norm": 5.07860606766039,
"learning_rate": 4.1966426858513185e-07,
"logits/chosen": -1.1134240627288818,
"logits/rejected": -1.1105023622512817,
"logps/chosen": -308.1408386230469,
"logps/rejected": -322.1429748535156,
"loss": 0.6317,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.37560468912124634,
"rewards/margins": 0.14869387447834015,
"rewards/rejected": -0.5242985486984253,
"step": 350
},
{
"epoch": 0.08637236084452975,
"grad_norm": 6.681443438336797,
"learning_rate": 4.3165467625899276e-07,
"logits/chosen": -0.9821847677230835,
"logits/rejected": -1.1448824405670166,
"logps/chosen": -324.4971618652344,
"logps/rejected": -285.352294921875,
"loss": 0.6325,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5944857001304626,
"rewards/margins": 0.16079989075660706,
"rewards/rejected": -0.7552856206893921,
"step": 360
},
{
"epoch": 0.08877159309021113,
"grad_norm": 5.547129015189758,
"learning_rate": 4.436450839328537e-07,
"logits/chosen": -0.9680454134941101,
"logits/rejected": -0.8957147598266602,
"logps/chosen": -285.62225341796875,
"logps/rejected": -329.6009826660156,
"loss": 0.6155,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.507738471031189,
"rewards/margins": 0.350941002368927,
"rewards/rejected": -0.858679473400116,
"step": 370
},
{
"epoch": 0.09117082533589252,
"grad_norm": 5.005904357466744,
"learning_rate": 4.556354916067146e-07,
"logits/chosen": -1.0903767347335815,
"logits/rejected": -1.0195515155792236,
"logps/chosen": -279.7075500488281,
"logps/rejected": -327.0304870605469,
"loss": 0.5837,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.42896413803100586,
"rewards/margins": 0.4645144045352936,
"rewards/rejected": -0.8934786915779114,
"step": 380
},
{
"epoch": 0.0935700575815739,
"grad_norm": 6.3368430821651085,
"learning_rate": 4.676258992805755e-07,
"logits/chosen": -0.9331681132316589,
"logits/rejected": -0.974704384803772,
"logps/chosen": -358.29052734375,
"logps/rejected": -354.2314147949219,
"loss": 0.6125,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9378200769424438,
"rewards/margins": 0.23260822892189026,
"rewards/rejected": -1.1704282760620117,
"step": 390
},
{
"epoch": 0.09596928982725528,
"grad_norm": 4.993194981103021,
"learning_rate": 4.796163069544364e-07,
"logits/chosen": -0.9397958517074585,
"logits/rejected": -1.0308669805526733,
"logps/chosen": -327.28485107421875,
"logps/rejected": -373.20538330078125,
"loss": 0.6133,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6638423800468445,
"rewards/margins": 0.6587511897087097,
"rewards/rejected": -1.3225935697555542,
"step": 400
},
{
"epoch": 0.09836852207293666,
"grad_norm": 5.627853897652041,
"learning_rate": 4.916067146282974e-07,
"logits/chosen": -1.0697743892669678,
"logits/rejected": -1.02415931224823,
"logps/chosen": -302.128173828125,
"logps/rejected": -373.0263366699219,
"loss": 0.5737,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.543385922908783,
"rewards/margins": 0.5250757932662964,
"rewards/rejected": -1.0684617757797241,
"step": 410
},
{
"epoch": 0.10076775431861804,
"grad_norm": 7.180880498197233,
"learning_rate": 4.999992108529978e-07,
"logits/chosen": -0.9345219731330872,
"logits/rejected": -0.9572717547416687,
"logps/chosen": -444.15997314453125,
"logps/rejected": -470.5298767089844,
"loss": 0.6004,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.090321660041809,
"rewards/margins": 0.609116792678833,
"rewards/rejected": -1.6994386911392212,
"step": 420
},
{
"epoch": 0.10316698656429943,
"grad_norm": 11.482376711377093,
"learning_rate": 4.999851817115532e-07,
"logits/chosen": -1.1403158903121948,
"logits/rejected": -1.0577712059020996,
"logps/chosen": -349.0794982910156,
"logps/rejected": -424.08642578125,
"loss": 0.598,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9095686078071594,
"rewards/margins": 0.863958477973938,
"rewards/rejected": -1.773526906967163,
"step": 430
},
{
"epoch": 0.10556621880998081,
"grad_norm": 5.363358928784856,
"learning_rate": 4.999536171027889e-07,
"logits/chosen": -0.8907009363174438,
"logits/rejected": -0.991034984588623,
"logps/chosen": -319.98046875,
"logps/rejected": -347.8736877441406,
"loss": 0.5812,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.6185662150382996,
"rewards/margins": 0.3348569869995117,
"rewards/rejected": -0.9534232020378113,
"step": 440
},
{
"epoch": 0.10796545105566219,
"grad_norm": 5.047279311794404,
"learning_rate": 4.999045192408369e-07,
"logits/chosen": -1.0067179203033447,
"logits/rejected": -1.0287652015686035,
"logps/chosen": -324.7417907714844,
"logps/rejected": -358.2411804199219,
"loss": 0.585,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.852948784828186,
"rewards/margins": 0.47164326906204224,
"rewards/rejected": -1.324592113494873,
"step": 450
},
{
"epoch": 0.11036468330134357,
"grad_norm": 11.251777016471365,
"learning_rate": 4.998378915697171e-07,
"logits/chosen": -0.9187090992927551,
"logits/rejected": -0.9824856519699097,
"logps/chosen": -357.00994873046875,
"logps/rejected": -421.2936096191406,
"loss": 0.5563,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7373157739639282,
"rewards/margins": 0.7148237824440002,
"rewards/rejected": -1.4521396160125732,
"step": 460
},
{
"epoch": 0.11276391554702495,
"grad_norm": 8.58525552938624,
"learning_rate": 4.997537387630958e-07,
"logits/chosen": -1.0368196964263916,
"logits/rejected": -1.0864078998565674,
"logps/chosen": -331.54705810546875,
"logps/rejected": -414.0245666503906,
"loss": 0.5324,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0620375871658325,
"rewards/margins": 0.7675203084945679,
"rewards/rejected": -1.82955801486969,
"step": 470
},
{
"epoch": 0.11516314779270634,
"grad_norm": 7.973906104493475,
"learning_rate": 4.996520667239582e-07,
"logits/chosen": -1.2711211442947388,
"logits/rejected": -1.1430588960647583,
"logps/chosen": -344.3868408203125,
"logps/rejected": -475.3819885253906,
"loss": 0.5449,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9513166546821594,
"rewards/margins": 0.8125057220458984,
"rewards/rejected": -1.7638225555419922,
"step": 480
},
{
"epoch": 0.11756238003838772,
"grad_norm": 7.29273708493132,
"learning_rate": 4.995328825841939e-07,
"logits/chosen": -0.945563793182373,
"logits/rejected": -0.9637011289596558,
"logps/chosen": -333.838134765625,
"logps/rejected": -502.4449157714844,
"loss": 0.5398,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9119874238967896,
"rewards/margins": 1.7599906921386719,
"rewards/rejected": -2.671978235244751,
"step": 490
},
{
"epoch": 0.1199616122840691,
"grad_norm": 8.408190304146231,
"learning_rate": 4.993961947040967e-07,
"logits/chosen": -0.9354039430618286,
"logits/rejected": -1.008681297302246,
"logps/chosen": -389.75677490234375,
"logps/rejected": -416.6995544433594,
"loss": 0.5543,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0735552310943604,
"rewards/margins": 0.6031023263931274,
"rewards/rejected": -1.6766574382781982,
"step": 500
},
{
"epoch": 0.12236084452975048,
"grad_norm": 7.547972255202871,
"learning_rate": 4.992420126717784e-07,
"logits/chosen": -1.0642093420028687,
"logits/rejected": -1.0110970735549927,
"logps/chosen": -349.2254333496094,
"logps/rejected": -501.6767578125,
"loss": 0.5416,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.9027541875839233,
"rewards/margins": 1.5682001113891602,
"rewards/rejected": -2.470954418182373,
"step": 510
},
{
"epoch": 0.12476007677543186,
"grad_norm": 6.11542459949028,
"learning_rate": 4.990703473024958e-07,
"logits/chosen": -0.8702675104141235,
"logits/rejected": -1.03139066696167,
"logps/chosen": -410.02264404296875,
"logps/rejected": -504.4461364746094,
"loss": 0.5536,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.188328742980957,
"rewards/margins": 1.0218807458877563,
"rewards/rejected": -2.210209608078003,
"step": 520
},
{
"epoch": 0.12715930902111325,
"grad_norm": 9.452659503317662,
"learning_rate": 4.98881210637893e-07,
"logits/chosen": -1.1504271030426025,
"logits/rejected": -1.0904886722564697,
"logps/chosen": -299.77197265625,
"logps/rejected": -411.39520263671875,
"loss": 0.5477,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.7923839092254639,
"rewards/margins": 0.7800144553184509,
"rewards/rejected": -1.5723984241485596,
"step": 530
},
{
"epoch": 0.1295585412667946,
"grad_norm": 15.54174844903523,
"learning_rate": 4.986746159451553e-07,
"logits/chosen": -1.0087683200836182,
"logits/rejected": -1.0048372745513916,
"logps/chosen": -348.1635437011719,
"logps/rejected": -465.615234375,
"loss": 0.5681,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.949160099029541,
"rewards/margins": 1.2487725019454956,
"rewards/rejected": -2.197932720184326,
"step": 540
},
{
"epoch": 0.131957773512476,
"grad_norm": 6.811166436392921,
"learning_rate": 4.984505777160795e-07,
"logits/chosen": -0.8339638710021973,
"logits/rejected": -0.8557635545730591,
"logps/chosen": -368.79998779296875,
"logps/rejected": -455.84423828125,
"loss": 0.5638,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7133311629295349,
"rewards/margins": 0.8222800493240356,
"rewards/rejected": -1.5356113910675049,
"step": 550
},
{
"epoch": 0.1343570057581574,
"grad_norm": 8.38746715084373,
"learning_rate": 4.982091116660574e-07,
"logits/chosen": -0.9729937314987183,
"logits/rejected": -1.1065596342086792,
"logps/chosen": -269.25994873046875,
"logps/rejected": -294.3720703125,
"loss": 0.5681,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6644600629806519,
"rewards/margins": 0.42904800176620483,
"rewards/rejected": -1.0935081243515015,
"step": 560
},
{
"epoch": 0.13675623800383876,
"grad_norm": 24.438576955287925,
"learning_rate": 4.979502347329732e-07,
"logits/chosen": -0.8234192132949829,
"logits/rejected": -0.8349748849868774,
"logps/chosen": -419.22637939453125,
"logps/rejected": -574.6170654296875,
"loss": 0.5309,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4598208665847778,
"rewards/margins": 1.224372148513794,
"rewards/rejected": -2.6841928958892822,
"step": 570
},
{
"epoch": 0.13915547024952016,
"grad_norm": 8.10566134085291,
"learning_rate": 4.976739650760151e-07,
"logits/chosen": -1.0753071308135986,
"logits/rejected": -1.1201988458633423,
"logps/chosen": -420.6094665527344,
"logps/rejected": -514.7296142578125,
"loss": 0.5764,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6122499704360962,
"rewards/margins": 1.0379631519317627,
"rewards/rejected": -2.6502132415771484,
"step": 580
},
{
"epoch": 0.14155470249520152,
"grad_norm": 6.442606806291236,
"learning_rate": 4.97380322074402e-07,
"logits/chosen": -0.7010489702224731,
"logits/rejected": -0.7692248225212097,
"logps/chosen": -344.9544982910156,
"logps/rejected": -453.3834533691406,
"loss": 0.5796,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.21074640750885,
"rewards/margins": 1.0821633338928223,
"rewards/rejected": -2.292909860610962,
"step": 590
},
{
"epoch": 0.14395393474088292,
"grad_norm": 8.67810882207825,
"learning_rate": 4.970693263260237e-07,
"logits/chosen": -0.9984515905380249,
"logits/rejected": -1.0792922973632812,
"logps/chosen": -385.07855224609375,
"logps/rejected": -431.91748046875,
"loss": 0.5352,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8377297520637512,
"rewards/margins": 0.9412263631820679,
"rewards/rejected": -1.7789561748504639,
"step": 600
},
{
"epoch": 0.1463531669865643,
"grad_norm": 19.243246370458866,
"learning_rate": 4.967409996459966e-07,
"logits/chosen": -0.9357202649116516,
"logits/rejected": -0.966041088104248,
"logps/chosen": -379.83709716796875,
"logps/rejected": -436.16021728515625,
"loss": 0.531,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2088489532470703,
"rewards/margins": 0.8029053807258606,
"rewards/rejected": -2.011754274368286,
"step": 610
},
{
"epoch": 0.14875239923224567,
"grad_norm": 15.946020238913784,
"learning_rate": 4.963953650651326e-07,
"logits/chosen": -0.836955726146698,
"logits/rejected": -0.9188618659973145,
"logps/chosen": -472.018798828125,
"logps/rejected": -462.2901306152344,
"loss": 0.5215,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2413372993469238,
"rewards/margins": 0.8713501691818237,
"rewards/rejected": -2.112687587738037,
"step": 620
},
{
"epoch": 0.15115163147792707,
"grad_norm": 7.979287445807055,
"learning_rate": 4.960324468283248e-07,
"logits/chosen": -1.00057053565979,
"logits/rejected": -1.0442100763320923,
"logps/chosen": -290.08868408203125,
"logps/rejected": -377.97027587890625,
"loss": 0.515,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.6758447885513306,
"rewards/margins": 0.8780001401901245,
"rewards/rejected": -1.5538448095321655,
"step": 630
},
{
"epoch": 0.15355086372360843,
"grad_norm": 10.945616218615863,
"learning_rate": 4.956522703928451e-07,
"logits/chosen": -0.978852391242981,
"logits/rejected": -0.8796356916427612,
"logps/chosen": -318.9521484375,
"logps/rejected": -465.2935485839844,
"loss": 0.5063,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8167131543159485,
"rewards/margins": 1.364931344985962,
"rewards/rejected": -2.1816444396972656,
"step": 640
},
{
"epoch": 0.15595009596928983,
"grad_norm": 12.631776767430628,
"learning_rate": 4.952548624265606e-07,
"logits/chosen": -0.9039742350578308,
"logits/rejected": -0.9406811594963074,
"logps/chosen": -403.57562255859375,
"logps/rejected": -478.9420471191406,
"loss": 0.5638,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3750524520874023,
"rewards/margins": 0.8406556248664856,
"rewards/rejected": -2.215708017349243,
"step": 650
},
{
"epoch": 0.15834932821497122,
"grad_norm": 6.1260020762155145,
"learning_rate": 4.948402508060607e-07,
"logits/chosen": -1.0212910175323486,
"logits/rejected": -1.0476784706115723,
"logps/chosen": -298.73028564453125,
"logps/rejected": -382.78021240234375,
"loss": 0.542,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.6404946446418762,
"rewards/margins": 0.9978824853897095,
"rewards/rejected": -1.6383771896362305,
"step": 660
},
{
"epoch": 0.16074856046065258,
"grad_norm": 9.288255831934693,
"learning_rate": 4.944084646147038e-07,
"logits/chosen": -0.869672417640686,
"logits/rejected": -0.9254360198974609,
"logps/chosen": -365.64019775390625,
"logps/rejected": -378.85113525390625,
"loss": 0.5793,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.5646086931228638,
"rewards/margins": 0.3554513156414032,
"rewards/rejected": -0.9200600385665894,
"step": 670
},
{
"epoch": 0.16314779270633398,
"grad_norm": 12.016810333669639,
"learning_rate": 4.939595341405754e-07,
"logits/chosen": -0.8810294270515442,
"logits/rejected": -0.9110749363899231,
"logps/chosen": -320.2789001464844,
"logps/rejected": -362.8446960449219,
"loss": 0.5238,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.6605560183525085,
"rewards/margins": 0.6321157217025757,
"rewards/rejected": -1.2926716804504395,
"step": 680
},
{
"epoch": 0.16554702495201534,
"grad_norm": 9.358664871218739,
"learning_rate": 4.93493490874365e-07,
"logits/chosen": -0.9154292941093445,
"logits/rejected": -0.9222054481506348,
"logps/chosen": -362.84686279296875,
"logps/rejected": -441.8236389160156,
"loss": 0.5367,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.235999345779419,
"rewards/margins": 0.7342005968093872,
"rewards/rejected": -1.9701995849609375,
"step": 690
},
{
"epoch": 0.16794625719769674,
"grad_norm": 9.152240736767983,
"learning_rate": 4.93010367507156e-07,
"logits/chosen": -1.0514498949050903,
"logits/rejected": -1.0271055698394775,
"logps/chosen": -301.6610107421875,
"logps/rejected": -385.259521484375,
"loss": 0.5134,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8431800603866577,
"rewards/margins": 1.0639536380767822,
"rewards/rejected": -1.90713369846344,
"step": 700
},
{
"epoch": 0.17034548944337813,
"grad_norm": 12.567284708252304,
"learning_rate": 4.925101979281332e-07,
"logits/chosen": -0.9216286540031433,
"logits/rejected": -1.0931814908981323,
"logps/chosen": -433.27130126953125,
"logps/rejected": -560.4276123046875,
"loss": 0.5031,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2827928066253662,
"rewards/margins": 1.7925609350204468,
"rewards/rejected": -3.0753538608551025,
"step": 710
},
{
"epoch": 0.1727447216890595,
"grad_norm": 11.741747616486963,
"learning_rate": 4.919930172222054e-07,
"logits/chosen": -0.9665408134460449,
"logits/rejected": -1.0536162853240967,
"logps/chosen": -417.403076171875,
"logps/rejected": -570.5277099609375,
"loss": 0.5003,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7012180089950562,
"rewards/margins": 1.562795877456665,
"rewards/rejected": -3.2640137672424316,
"step": 720
},
{
"epoch": 0.1751439539347409,
"grad_norm": 11.733941979044587,
"learning_rate": 4.914588616675445e-07,
"logits/chosen": -1.0246481895446777,
"logits/rejected": -1.0158023834228516,
"logps/chosen": -348.80133056640625,
"logps/rejected": -439.29608154296875,
"loss": 0.5572,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0424585342407227,
"rewards/margins": 1.09738290309906,
"rewards/rejected": -2.1398415565490723,
"step": 730
},
{
"epoch": 0.17754318618042225,
"grad_norm": 8.84337454037855,
"learning_rate": 4.909077687330404e-07,
"logits/chosen": -0.8301714658737183,
"logits/rejected": -0.9135535359382629,
"logps/chosen": -330.27264404296875,
"logps/rejected": -350.2643127441406,
"loss": 0.5051,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6734127402305603,
"rewards/margins": 0.41529732942581177,
"rewards/rejected": -1.088710069656372,
"step": 740
},
{
"epoch": 0.17994241842610365,
"grad_norm": 8.880233466088285,
"learning_rate": 4.903397770756729e-07,
"logits/chosen": -1.0296955108642578,
"logits/rejected": -1.0950233936309814,
"logps/chosen": -355.98968505859375,
"logps/rejected": -452.5968322753906,
"loss": 0.5248,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9854777455329895,
"rewards/margins": 0.9587169885635376,
"rewards/rejected": -1.9441944360733032,
"step": 750
},
{
"epoch": 0.18234165067178504,
"grad_norm": 10.737134261298277,
"learning_rate": 4.897549265378004e-07,
"logits/chosen": -0.9651594161987305,
"logits/rejected": -1.0333675146102905,
"logps/chosen": -473.6534729003906,
"logps/rejected": -615.0003662109375,
"loss": 0.4966,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.580303430557251,
"rewards/margins": 1.4406054019927979,
"rewards/rejected": -3.020908832550049,
"step": 760
},
{
"epoch": 0.1847408829174664,
"grad_norm": 10.188381001048064,
"learning_rate": 4.891532581443643e-07,
"logits/chosen": -1.1801836490631104,
"logits/rejected": -1.2248878479003906,
"logps/chosen": -438.201171875,
"logps/rejected": -577.3571166992188,
"loss": 0.4856,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.3580670356750488,
"rewards/margins": 1.4186818599700928,
"rewards/rejected": -2.7767486572265625,
"step": 770
},
{
"epoch": 0.1871401151631478,
"grad_norm": 13.185729895925714,
"learning_rate": 4.885348141000122e-07,
"logits/chosen": -1.06368887424469,
"logits/rejected": -1.0469920635223389,
"logps/chosen": -363.1687316894531,
"logps/rejected": -502.82684326171875,
"loss": 0.4839,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2150341272354126,
"rewards/margins": 1.3010752201080322,
"rewards/rejected": -2.5161094665527344,
"step": 780
},
{
"epoch": 0.18953934740882916,
"grad_norm": 13.52232678239933,
"learning_rate": 4.878996377861367e-07,
"logits/chosen": -1.0281708240509033,
"logits/rejected": -1.094001054763794,
"logps/chosen": -321.34814453125,
"logps/rejected": -453.59417724609375,
"loss": 0.5211,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.0169535875320435,
"rewards/margins": 1.2799100875854492,
"rewards/rejected": -2.2968640327453613,
"step": 790
},
{
"epoch": 0.19193857965451055,
"grad_norm": 8.497366103850682,
"learning_rate": 4.872477737578327e-07,
"logits/chosen": -1.005568504333496,
"logits/rejected": -0.9401241540908813,
"logps/chosen": -398.41546630859375,
"logps/rejected": -612.4130249023438,
"loss": 0.4468,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.1582279205322266,
"rewards/margins": 2.2199604511260986,
"rewards/rejected": -3.378188371658325,
"step": 800
},
{
"epoch": 0.19433781190019195,
"grad_norm": 17.573875043998388,
"learning_rate": 4.865792677407718e-07,
"logits/chosen": -1.0809205770492554,
"logits/rejected": -1.1499931812286377,
"logps/chosen": -387.76202392578125,
"logps/rejected": -490.18243408203125,
"loss": 0.5666,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.46564781665802,
"rewards/margins": 1.2134864330291748,
"rewards/rejected": -2.6791341304779053,
"step": 810
},
{
"epoch": 0.1967370441458733,
"grad_norm": 8.204522849107846,
"learning_rate": 4.858941666279955e-07,
"logits/chosen": -0.8947283029556274,
"logits/rejected": -0.9740638732910156,
"logps/chosen": -393.2839660644531,
"logps/rejected": -441.0948791503906,
"loss": 0.5435,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2646175622940063,
"rewards/margins": 0.5960845947265625,
"rewards/rejected": -1.8607019186019897,
"step": 820
},
{
"epoch": 0.1991362763915547,
"grad_norm": 10.142292221516385,
"learning_rate": 4.851925184766247e-07,
"logits/chosen": -1.053379774093628,
"logits/rejected": -1.1101844310760498,
"logps/chosen": -351.99090576171875,
"logps/rejected": -449.01751708984375,
"loss": 0.4937,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.9777849316596985,
"rewards/margins": 1.14426851272583,
"rewards/rejected": -2.122053623199463,
"step": 830
},
{
"epoch": 0.20153550863723607,
"grad_norm": 12.034416836065418,
"learning_rate": 4.844743725044897e-07,
"logits/chosen": -1.0946062803268433,
"logits/rejected": -1.2976312637329102,
"logps/chosen": -378.5492248535156,
"logps/rejected": -474.7021484375,
"loss": 0.5084,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2278741598129272,
"rewards/margins": 1.197396159172058,
"rewards/rejected": -2.4252700805664062,
"step": 840
},
{
"epoch": 0.20393474088291746,
"grad_norm": 8.290745524509466,
"learning_rate": 4.837397790866774e-07,
"logits/chosen": -1.2077043056488037,
"logits/rejected": -1.2064851522445679,
"logps/chosen": -398.6294250488281,
"logps/rejected": -532.10009765625,
"loss": 0.5617,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.0644519329071045,
"rewards/margins": 1.5777161121368408,
"rewards/rejected": -2.642167806625366,
"step": 850
},
{
"epoch": 0.20633397312859886,
"grad_norm": 9.121636380617016,
"learning_rate": 4.829887897519974e-07,
"logits/chosen": -1.2348374128341675,
"logits/rejected": -1.2065662145614624,
"logps/chosen": -323.52520751953125,
"logps/rejected": -456.983642578125,
"loss": 0.5042,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9337062835693359,
"rewards/margins": 1.160902500152588,
"rewards/rejected": -2.094609022140503,
"step": 860
},
{
"epoch": 0.20873320537428022,
"grad_norm": 10.124524555819926,
"learning_rate": 4.82221457179368e-07,
"logits/chosen": -1.2232173681259155,
"logits/rejected": -1.2059452533721924,
"logps/chosen": -376.4814147949219,
"logps/rejected": -523.2281494140625,
"loss": 0.4462,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.0080255270004272,
"rewards/margins": 1.6320230960845947,
"rewards/rejected": -2.6400485038757324,
"step": 870
},
{
"epoch": 0.21113243761996162,
"grad_norm": 15.383829882801075,
"learning_rate": 4.814378351941206e-07,
"logits/chosen": -1.1030246019363403,
"logits/rejected": -1.165531873703003,
"logps/chosen": -376.2377014160156,
"logps/rejected": -447.36871337890625,
"loss": 0.5179,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1712180376052856,
"rewards/margins": 0.8365498781204224,
"rewards/rejected": -2.007767915725708,
"step": 880
},
{
"epoch": 0.21353166986564298,
"grad_norm": 8.924221297687437,
"learning_rate": 4.806379787642241e-07,
"logits/chosen": -1.13001549243927,
"logits/rejected": -1.117497205734253,
"logps/chosen": -358.25286865234375,
"logps/rejected": -490.5638732910156,
"loss": 0.5141,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0755687952041626,
"rewards/margins": 1.3798635005950928,
"rewards/rejected": -2.4554319381713867,
"step": 890
},
{
"epoch": 0.21593090211132437,
"grad_norm": 9.091481860281236,
"learning_rate": 4.798219439964293e-07,
"logits/chosen": -1.1416652202606201,
"logits/rejected": -1.2265089750289917,
"logps/chosen": -366.56915283203125,
"logps/rejected": -414.20721435546875,
"loss": 0.476,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.119057059288025,
"rewards/margins": 0.3352181911468506,
"rewards/rejected": -1.454275131225586,
"step": 900
},
{
"epoch": 0.21833013435700577,
"grad_norm": 17.10074312802646,
"learning_rate": 4.78989788132333e-07,
"logits/chosen": -1.1152770519256592,
"logits/rejected": -1.1071698665618896,
"logps/chosen": -342.0806884765625,
"logps/rejected": -534.8381958007812,
"loss": 0.4536,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.19199538230896,
"rewards/margins": 1.8661339282989502,
"rewards/rejected": -3.0581297874450684,
"step": 910
},
{
"epoch": 0.22072936660268713,
"grad_norm": 9.85354436821844,
"learning_rate": 4.781415695443631e-07,
"logits/chosen": -1.1570137739181519,
"logits/rejected": -1.1956650018692017,
"logps/chosen": -509.591552734375,
"logps/rejected": -693.6530151367188,
"loss": 0.5105,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.156250476837158,
"rewards/margins": 1.8081929683685303,
"rewards/rejected": -3.9644439220428467,
"step": 920
},
{
"epoch": 0.22312859884836853,
"grad_norm": 10.93257575281564,
"learning_rate": 4.772773477316836e-07,
"logits/chosen": -1.0778967142105103,
"logits/rejected": -1.133569598197937,
"logps/chosen": -377.20037841796875,
"logps/rejected": -475.40997314453125,
"loss": 0.4849,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1448915004730225,
"rewards/margins": 0.9261860847473145,
"rewards/rejected": -2.071077823638916,
"step": 930
},
{
"epoch": 0.2255278310940499,
"grad_norm": 13.493444996052222,
"learning_rate": 4.7639718331602117e-07,
"logits/chosen": -1.0905169248580933,
"logits/rejected": -1.1160600185394287,
"logps/chosen": -434.937744140625,
"logps/rejected": -650.6595458984375,
"loss": 0.501,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.5431007146835327,
"rewards/margins": 2.3286290168762207,
"rewards/rejected": -3.871730327606201,
"step": 940
},
{
"epoch": 0.22792706333973128,
"grad_norm": 17.479671128658037,
"learning_rate": 4.7550113803741275e-07,
"logits/chosen": -1.1507641077041626,
"logits/rejected": -1.2998677492141724,
"logps/chosen": -448.26251220703125,
"logps/rejected": -497.29779052734375,
"loss": 0.4886,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5616766214370728,
"rewards/margins": 1.1937475204467773,
"rewards/rejected": -2.7554240226745605,
"step": 950
},
{
"epoch": 0.23032629558541268,
"grad_norm": 17.248167606427355,
"learning_rate": 4.7458927474987454e-07,
"logits/chosen": -1.0716644525527954,
"logits/rejected": -1.1447921991348267,
"logps/chosen": -426.6358947753906,
"logps/rejected": -470.9082946777344,
"loss": 0.4858,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1936933994293213,
"rewards/margins": 1.0052495002746582,
"rewards/rejected": -2.1989428997039795,
"step": 960
},
{
"epoch": 0.23272552783109404,
"grad_norm": 14.729008337765277,
"learning_rate": 4.7366165741699347e-07,
"logits/chosen": -0.9885573387145996,
"logits/rejected": -1.0534632205963135,
"logps/chosen": -458.31378173828125,
"logps/rejected": -540.1591186523438,
"loss": 0.4747,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3210374116897583,
"rewards/margins": 1.1498852968215942,
"rewards/rejected": -2.4709229469299316,
"step": 970
},
{
"epoch": 0.23512476007677544,
"grad_norm": 15.150583413082405,
"learning_rate": 4.727183511074401e-07,
"logits/chosen": -1.309410810470581,
"logits/rejected": -1.3342196941375732,
"logps/chosen": -416.07293701171875,
"logps/rejected": -466.21923828125,
"loss": 0.4945,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.2828443050384521,
"rewards/margins": 0.6490219235420227,
"rewards/rejected": -1.9318662881851196,
"step": 980
},
{
"epoch": 0.2375239923224568,
"grad_norm": 11.648714128278801,
"learning_rate": 4.717594219904043e-07,
"logits/chosen": -1.0633580684661865,
"logits/rejected": -1.179321527481079,
"logps/chosen": -393.45648193359375,
"logps/rejected": -493.12603759765625,
"loss": 0.5057,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2883937358856201,
"rewards/margins": 1.364997148513794,
"rewards/rejected": -2.653390884399414,
"step": 990
},
{
"epoch": 0.2399232245681382,
"grad_norm": 10.483921422479957,
"learning_rate": 4.7078493733095393e-07,
"logits/chosen": -1.0601375102996826,
"logits/rejected": -1.1300022602081299,
"logps/chosen": -432.314208984375,
"logps/rejected": -605.5133056640625,
"loss": 0.4784,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.7640736103057861,
"rewards/margins": 1.7046064138412476,
"rewards/rejected": -3.4686806201934814,
"step": 1000
},
{
"epoch": 0.2423224568138196,
"grad_norm": 17.060662316036694,
"learning_rate": 4.6979496548531614e-07,
"logits/chosen": -1.243939757347107,
"logits/rejected": -1.223625898361206,
"logps/chosen": -446.4366760253906,
"logps/rejected": -638.4450073242188,
"loss": 0.5126,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8022911548614502,
"rewards/margins": 1.4770129919052124,
"rewards/rejected": -3.279303789138794,
"step": 1010
},
{
"epoch": 0.24472168905950095,
"grad_norm": 12.903823112622515,
"learning_rate": 4.6878957589608293e-07,
"logits/chosen": -1.1194379329681396,
"logits/rejected": -1.0698894262313843,
"logps/chosen": -409.68603515625,
"logps/rejected": -603.4207763671875,
"loss": 0.5214,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4294629096984863,
"rewards/margins": 1.5245181322097778,
"rewards/rejected": -2.9539809226989746,
"step": 1020
},
{
"epoch": 0.24712092130518235,
"grad_norm": 10.384415695069938,
"learning_rate": 4.6776883908733956e-07,
"logits/chosen": -1.1999337673187256,
"logits/rejected": -1.311231255531311,
"logps/chosen": -397.4037780761719,
"logps/rejected": -460.95269775390625,
"loss": 0.4774,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.9981710314750671,
"rewards/margins": 1.341308832168579,
"rewards/rejected": -2.339479684829712,
"step": 1030
},
{
"epoch": 0.2495201535508637,
"grad_norm": 14.761052654024631,
"learning_rate": 4.667328266597178e-07,
"logits/chosen": -1.1137980222702026,
"logits/rejected": -1.1541672945022583,
"logps/chosen": -391.23504638671875,
"logps/rejected": -516.2595825195312,
"loss": 0.4633,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3899133205413818,
"rewards/margins": 1.2868678569793701,
"rewards/rejected": -2.676781177520752,
"step": 1040
},
{
"epoch": 0.2519193857965451,
"grad_norm": 10.417201545289524,
"learning_rate": 4.6568161128537354e-07,
"logits/chosen": -1.0899343490600586,
"logits/rejected": -1.247287631034851,
"logps/chosen": -420.63836669921875,
"logps/rejected": -514.6077270507812,
"loss": 0.4964,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.5773645639419556,
"rewards/margins": 1.506489872932434,
"rewards/rejected": -3.0838541984558105,
"step": 1050
},
{
"epoch": 0.2543186180422265,
"grad_norm": 16.488420937432345,
"learning_rate": 4.6461526670288877e-07,
"logits/chosen": -1.1203404664993286,
"logits/rejected": -1.1449543237686157,
"logps/chosen": -407.5829162597656,
"logps/rejected": -497.4512634277344,
"loss": 0.493,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3892755508422852,
"rewards/margins": 1.1714465618133545,
"rewards/rejected": -2.5607221126556396,
"step": 1060
},
{
"epoch": 0.2567178502879079,
"grad_norm": 9.466776525081485,
"learning_rate": 4.635338677120994e-07,
"logits/chosen": -1.3948705196380615,
"logits/rejected": -1.3905651569366455,
"logps/chosen": -379.10009765625,
"logps/rejected": -558.11328125,
"loss": 0.4566,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.116693139076233,
"rewards/margins": 1.6451349258422852,
"rewards/rejected": -2.7618279457092285,
"step": 1070
},
{
"epoch": 0.2591170825335892,
"grad_norm": 14.2505130371337,
"learning_rate": 4.6243749016884835e-07,
"logits/chosen": -1.1959508657455444,
"logits/rejected": -1.2822418212890625,
"logps/chosen": -459.30999755859375,
"logps/rejected": -783.4131469726562,
"loss": 0.5047,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8917354345321655,
"rewards/margins": 2.781670331954956,
"rewards/rejected": -4.673405647277832,
"step": 1080
},
{
"epoch": 0.2615163147792706,
"grad_norm": 16.476931783493725,
"learning_rate": 4.613262109796645e-07,
"logits/chosen": -1.2336928844451904,
"logits/rejected": -1.1787619590759277,
"logps/chosen": -439.15838623046875,
"logps/rejected": -729.7471313476562,
"loss": 0.4727,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9186598062515259,
"rewards/margins": 2.601553440093994,
"rewards/rejected": -4.5202131271362305,
"step": 1090
},
{
"epoch": 0.263915547024952,
"grad_norm": 12.359629943883897,
"learning_rate": 4.602001080963678e-07,
"logits/chosen": -1.1227662563323975,
"logits/rejected": -1.1912363767623901,
"logps/chosen": -392.65057373046875,
"logps/rejected": -619.9832763671875,
"loss": 0.4556,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2552802562713623,
"rewards/margins": 2.460552215576172,
"rewards/rejected": -3.715832233428955,
"step": 1100
},
{
"epoch": 0.2663147792706334,
"grad_norm": 19.000478036791087,
"learning_rate": 4.590592605106017e-07,
"logits/chosen": -1.1630818843841553,
"logits/rejected": -1.2058923244476318,
"logps/chosen": -423.0645446777344,
"logps/rejected": -619.4017333984375,
"loss": 0.5008,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.3737356662750244,
"rewards/margins": 2.270371198654175,
"rewards/rejected": -3.6441073417663574,
"step": 1110
},
{
"epoch": 0.2687140115163148,
"grad_norm": 11.988638837694287,
"learning_rate": 4.5790374824829165e-07,
"logits/chosen": -1.1366350650787354,
"logits/rejected": -1.2177644968032837,
"logps/chosen": -313.95233154296875,
"logps/rejected": -553.4268798828125,
"loss": 0.5165,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.2247995138168335,
"rewards/margins": 2.2755985260009766,
"rewards/rejected": -3.5003979206085205,
"step": 1120
},
{
"epoch": 0.27111324376199614,
"grad_norm": 13.389751916846311,
"learning_rate": 4.5673365236403216e-07,
"logits/chosen": -1.1376798152923584,
"logits/rejected": -1.2225024700164795,
"logps/chosen": -418.84930419921875,
"logps/rejected": -644.3150024414062,
"loss": 0.4922,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.1243631839752197,
"rewards/margins": 1.9992843866348267,
"rewards/rejected": -4.123647212982178,
"step": 1130
},
{
"epoch": 0.27351247600767753,
"grad_norm": 12.066478642007997,
"learning_rate": 4.5554905493540075e-07,
"logits/chosen": -1.393936038017273,
"logits/rejected": -1.4359791278839111,
"logps/chosen": -366.07171630859375,
"logps/rejected": -646.8983154296875,
"loss": 0.4248,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4785449504852295,
"rewards/margins": 2.75530743598938,
"rewards/rejected": -4.233852386474609,
"step": 1140
},
{
"epoch": 0.2759117082533589,
"grad_norm": 13.594931228595337,
"learning_rate": 4.5435003905720074e-07,
"logits/chosen": -1.3401994705200195,
"logits/rejected": -1.4189189672470093,
"logps/chosen": -555.1502685546875,
"logps/rejected": -733.55908203125,
"loss": 0.4941,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.782365560531616,
"rewards/margins": 2.0489964485168457,
"rewards/rejected": -4.831361770629883,
"step": 1150
},
{
"epoch": 0.2783109404990403,
"grad_norm": 10.959828441128586,
"learning_rate": 4.531366888356324e-07,
"logits/chosen": -1.3572887182235718,
"logits/rejected": -1.316543459892273,
"logps/chosen": -379.75048828125,
"logps/rejected": -709.0888061523438,
"loss": 0.421,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.861644983291626,
"rewards/margins": 2.909771680831909,
"rewards/rejected": -4.771416664123535,
"step": 1160
},
{
"epoch": 0.2807101727447217,
"grad_norm": 11.9806643935228,
"learning_rate": 4.519090893823931e-07,
"logits/chosen": -1.2535573244094849,
"logits/rejected": -1.2999234199523926,
"logps/chosen": -466.22979736328125,
"logps/rejected": -587.1083984375,
"loss": 0.4706,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.177558422088623,
"rewards/margins": 1.3151285648345947,
"rewards/rejected": -3.4926867485046387,
"step": 1170
},
{
"epoch": 0.28310940499040305,
"grad_norm": 15.64676518237564,
"learning_rate": 4.5066732680870734e-07,
"logits/chosen": -1.1491421461105347,
"logits/rejected": -1.2824984788894653,
"logps/chosen": -410.51129150390625,
"logps/rejected": -574.32861328125,
"loss": 0.4552,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5698862075805664,
"rewards/margins": 2.0116872787475586,
"rewards/rejected": -3.581573486328125,
"step": 1180
},
{
"epoch": 0.28550863723608444,
"grad_norm": 14.866463710844934,
"learning_rate": 4.494114882192862e-07,
"logits/chosen": -1.1980191469192505,
"logits/rejected": -1.2435463666915894,
"logps/chosen": -425.57733154296875,
"logps/rejected": -676.6328735351562,
"loss": 0.456,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.72724187374115,
"rewards/margins": 2.7374391555786133,
"rewards/rejected": -4.4646806716918945,
"step": 1190
},
{
"epoch": 0.28790786948176583,
"grad_norm": 13.193324388633975,
"learning_rate": 4.4814166170621735e-07,
"logits/chosen": -1.4249297380447388,
"logits/rejected": -1.497604489326477,
"logps/chosen": -480.2257385253906,
"logps/rejected": -622.1727294921875,
"loss": 0.4656,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.2823410034179688,
"rewards/margins": 1.655609130859375,
"rewards/rejected": -3.9379496574401855,
"step": 1200
},
{
"epoch": 0.2903071017274472,
"grad_norm": 20.768443614327058,
"learning_rate": 4.468579363427858e-07,
"logits/chosen": -1.2781771421432495,
"logits/rejected": -1.3352419137954712,
"logps/chosen": -420.2730407714844,
"logps/rejected": -640.3319091796875,
"loss": 0.4615,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5896456241607666,
"rewards/margins": 2.4936366081237793,
"rewards/rejected": -4.083281517028809,
"step": 1210
},
{
"epoch": 0.2927063339731286,
"grad_norm": 9.971205156379035,
"learning_rate": 4.4556040217722555e-07,
"logits/chosen": -1.236127257347107,
"logits/rejected": -1.2078077793121338,
"logps/chosen": -356.2068786621094,
"logps/rejected": -533.8853759765625,
"loss": 0.4674,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.1397576332092285,
"rewards/margins": 1.5811337232589722,
"rewards/rejected": -2.7208914756774902,
"step": 1220
},
{
"epoch": 0.29510556621880996,
"grad_norm": 12.163666083646278,
"learning_rate": 4.442491502264033e-07,
"logits/chosen": -1.1941020488739014,
"logits/rejected": -1.2168632745742798,
"logps/chosen": -363.7603454589844,
"logps/rejected": -455.57867431640625,
"loss": 0.4612,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.3447378873825073,
"rewards/margins": 1.0612128973007202,
"rewards/rejected": -2.4059505462646484,
"step": 1230
},
{
"epoch": 0.29750479846449135,
"grad_norm": 9.273351878722064,
"learning_rate": 4.429242724694338e-07,
"logits/chosen": -1.3338253498077393,
"logits/rejected": -1.336753010749817,
"logps/chosen": -395.6485595703125,
"logps/rejected": -632.2241821289062,
"loss": 0.454,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4914841651916504,
"rewards/margins": 2.230437994003296,
"rewards/rejected": -3.7219223976135254,
"step": 1240
},
{
"epoch": 0.29990403071017274,
"grad_norm": 25.05777651266455,
"learning_rate": 4.4158586184122817e-07,
"logits/chosen": -1.1749566793441772,
"logits/rejected": -1.271209478378296,
"logps/chosen": -429.14306640625,
"logps/rejected": -604.9749755859375,
"loss": 0.4598,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.3869397640228271,
"rewards/margins": 2.094210386276245,
"rewards/rejected": -3.4811503887176514,
"step": 1250
},
{
"epoch": 0.30230326295585414,
"grad_norm": 17.369219336254258,
"learning_rate": 4.4023401222597443e-07,
"logits/chosen": -1.027753233909607,
"logits/rejected": -1.1785060167312622,
"logps/chosen": -425.7173767089844,
"logps/rejected": -544.3839721679688,
"loss": 0.4667,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5711647272109985,
"rewards/margins": 1.3786556720733643,
"rewards/rejected": -2.9498205184936523,
"step": 1260
},
{
"epoch": 0.30470249520153553,
"grad_norm": 11.861504310327616,
"learning_rate": 4.3886881845055235e-07,
"logits/chosen": -1.1897116899490356,
"logits/rejected": -1.299993872642517,
"logps/chosen": -375.5953674316406,
"logps/rejected": -659.8858032226562,
"loss": 0.4618,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.3118809461593628,
"rewards/margins": 2.890842914581299,
"rewards/rejected": -4.202723979949951,
"step": 1270
},
{
"epoch": 0.30710172744721687,
"grad_norm": 10.8218616423801,
"learning_rate": 4.374903762778814e-07,
"logits/chosen": -1.3571842908859253,
"logits/rejected": -1.385545253753662,
"logps/chosen": -494.83709716796875,
"logps/rejected": -658.3067016601562,
"loss": 0.4837,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.376838445663452,
"rewards/margins": 1.8850828409194946,
"rewards/rejected": -4.261920928955078,
"step": 1280
},
{
"epoch": 0.30950095969289826,
"grad_norm": 10.14042165637657,
"learning_rate": 4.3609878240020356e-07,
"logits/chosen": -1.2036101818084717,
"logits/rejected": -1.3167588710784912,
"logps/chosen": -486.25811767578125,
"logps/rejected": -651.5155639648438,
"loss": 0.4531,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9182531833648682,
"rewards/margins": 2.2668986320495605,
"rewards/rejected": -4.18515157699585,
"step": 1290
},
{
"epoch": 0.31190019193857965,
"grad_norm": 11.299969604518076,
"learning_rate": 4.346941344323005e-07,
"logits/chosen": -1.376947045326233,
"logits/rejected": -1.4665632247924805,
"logps/chosen": -451.6893615722656,
"logps/rejected": -499.0320739746094,
"loss": 0.4913,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.929490089416504,
"rewards/margins": 1.0548813343048096,
"rewards/rejected": -2.9843716621398926,
"step": 1300
},
{
"epoch": 0.31429942418426104,
"grad_norm": 11.980383531649544,
"learning_rate": 4.332765309046467e-07,
"logits/chosen": -1.332617998123169,
"logits/rejected": -1.3639962673187256,
"logps/chosen": -426.499755859375,
"logps/rejected": -626.6729736328125,
"loss": 0.474,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.678342580795288,
"rewards/margins": 2.378535747528076,
"rewards/rejected": -4.056878566741943,
"step": 1310
},
{
"epoch": 0.31669865642994244,
"grad_norm": 14.013248425816212,
"learning_rate": 4.3184607125649754e-07,
"logits/chosen": -1.2517986297607422,
"logits/rejected": -1.3113311529159546,
"logps/chosen": -414.6436462402344,
"logps/rejected": -657.0100708007812,
"loss": 0.4847,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3406541347503662,
"rewards/margins": 2.270404815673828,
"rewards/rejected": -3.6110591888427734,
"step": 1320
},
{
"epoch": 0.3190978886756238,
"grad_norm": 15.333659280580797,
"learning_rate": 4.304028558289141e-07,
"logits/chosen": -1.4721230268478394,
"logits/rejected": -1.506519079208374,
"logps/chosen": -451.8147888183594,
"logps/rejected": -667.3796997070312,
"loss": 0.448,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6051162481307983,
"rewards/margins": 2.3636975288391113,
"rewards/rejected": -3.96881365776062,
"step": 1330
},
{
"epoch": 0.32149712092130517,
"grad_norm": 12.587601683578118,
"learning_rate": 4.28946985857725e-07,
"logits/chosen": -1.5413029193878174,
"logits/rejected": -1.5675899982452393,
"logps/chosen": -508.3623046875,
"logps/rejected": -787.8271484375,
"loss": 0.4284,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.329979658126831,
"rewards/margins": 2.8664588928222656,
"rewards/rejected": -5.196438789367676,
"step": 1340
},
{
"epoch": 0.32389635316698656,
"grad_norm": 11.206457061422094,
"learning_rate": 4.2747856346642445e-07,
"logits/chosen": -1.1610701084136963,
"logits/rejected": -1.1690763235092163,
"logps/chosen": -401.5397033691406,
"logps/rejected": -605.7808227539062,
"loss": 0.4051,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7380218505859375,
"rewards/margins": 2.2142810821533203,
"rewards/rejected": -3.952302932739258,
"step": 1350
},
{
"epoch": 0.32629558541266795,
"grad_norm": 20.124180714207323,
"learning_rate": 4.2599769165900933e-07,
"logits/chosen": -1.1436104774475098,
"logits/rejected": -1.193645715713501,
"logps/chosen": -501.67706298828125,
"logps/rejected": -850.97314453125,
"loss": 0.4949,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.594907283782959,
"rewards/margins": 3.6968417167663574,
"rewards/rejected": -6.291749000549316,
"step": 1360
},
{
"epoch": 0.32869481765834935,
"grad_norm": 8.824763672957205,
"learning_rate": 4.245044743127535e-07,
"logits/chosen": -1.2460205554962158,
"logits/rejected": -1.1843246221542358,
"logps/chosen": -402.1271667480469,
"logps/rejected": -616.098388671875,
"loss": 0.4785,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.439296007156372,
"rewards/margins": 2.2133936882019043,
"rewards/rejected": -3.6526896953582764,
"step": 1370
},
{
"epoch": 0.3310940499040307,
"grad_norm": 14.598877417461923,
"learning_rate": 4.229990161709214e-07,
"logits/chosen": -1.2217421531677246,
"logits/rejected": -1.130197286605835,
"logps/chosen": -352.7733459472656,
"logps/rejected": -632.5406494140625,
"loss": 0.463,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.2021641731262207,
"rewards/margins": 2.610610246658325,
"rewards/rejected": -3.8127739429473877,
"step": 1380
},
{
"epoch": 0.3334932821497121,
"grad_norm": 11.865002342507843,
"learning_rate": 4.214814228354204e-07,
"logits/chosen": -1.342158555984497,
"logits/rejected": -1.390978217124939,
"logps/chosen": -457.16705322265625,
"logps/rejected": -833.06787109375,
"loss": 0.4364,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.8238433599472046,
"rewards/margins": 3.916942596435547,
"rewards/rejected": -5.740786075592041,
"step": 1390
},
{
"epoch": 0.33589251439539347,
"grad_norm": 12.72331809654516,
"learning_rate": 4.1995180075939375e-07,
"logits/chosen": -1.4785504341125488,
"logits/rejected": -1.4822914600372314,
"logps/chosen": -445.6946716308594,
"logps/rejected": -639.58837890625,
"loss": 0.4643,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7562789916992188,
"rewards/margins": 2.1691954135894775,
"rewards/rejected": -3.925474166870117,
"step": 1400
},
{
"epoch": 0.33829174664107486,
"grad_norm": 11.703724239093889,
"learning_rate": 4.1841025723975297e-07,
"logits/chosen": -1.1503616571426392,
"logits/rejected": -1.1993227005004883,
"logps/chosen": -395.007080078125,
"logps/rejected": -638.5355224609375,
"loss": 0.4344,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.1847431659698486,
"rewards/margins": 2.6262497901916504,
"rewards/rejected": -3.810993194580078,
"step": 1410
},
{
"epoch": 0.34069097888675626,
"grad_norm": 19.639469762609966,
"learning_rate": 4.168569004096516e-07,
"logits/chosen": -1.2111709117889404,
"logits/rejected": -1.1946176290512085,
"logps/chosen": -405.03558349609375,
"logps/rejected": -638.8396606445312,
"loss": 0.4405,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.7199838161468506,
"rewards/margins": 2.166039228439331,
"rewards/rejected": -3.886023759841919,
"step": 1420
},
{
"epoch": 0.3430902111324376,
"grad_norm": 12.824816837365054,
"learning_rate": 4.152918392308997e-07,
"logits/chosen": -1.4239578247070312,
"logits/rejected": -1.4245679378509521,
"logps/chosen": -430.7586364746094,
"logps/rejected": -618.7017822265625,
"loss": 0.4354,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.7669651508331299,
"rewards/margins": 1.9917770624160767,
"rewards/rejected": -3.758742094039917,
"step": 1430
},
{
"epoch": 0.345489443378119,
"grad_norm": 17.48149790816653,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": -1.307308554649353,
"logits/rejected": -1.2468369007110596,
"logps/chosen": -443.11370849609375,
"logps/rejected": -816.6782836914062,
"loss": 0.5016,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.0893607139587402,
"rewards/margins": 3.403632640838623,
"rewards/rejected": -5.492993354797363,
"step": 1440
},
{
"epoch": 0.3478886756238004,
"grad_norm": 13.277912286181408,
"learning_rate": 4.121270437720526e-07,
"logits/chosen": -1.192663550376892,
"logits/rejected": -1.184259295463562,
"logps/chosen": -402.0312194824219,
"logps/rejected": -540.152587890625,
"loss": 0.4536,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.9467941522598267,
"rewards/margins": 0.9955675005912781,
"rewards/rejected": -2.942361354827881,
"step": 1450
},
{
"epoch": 0.3502879078694818,
"grad_norm": 12.024084039884944,
"learning_rate": 4.105275314897852e-07,
"logits/chosen": -1.3382481336593628,
"logits/rejected": -1.3211191892623901,
"logps/chosen": -403.6156005859375,
"logps/rejected": -820.3019409179688,
"loss": 0.4487,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7442741394042969,
"rewards/margins": 3.908841609954834,
"rewards/rejected": -5.653115272521973,
"step": 1460
},
{
"epoch": 0.35268714011516317,
"grad_norm": 10.938380648082374,
"learning_rate": 4.089167588389508e-07,
"logits/chosen": -1.0116260051727295,
"logits/rejected": -1.1302238702774048,
"logps/chosen": -524.224853515625,
"logps/rejected": -730.9097900390625,
"loss": 0.4594,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8979120254516602,
"rewards/margins": 2.4903345108032227,
"rewards/rejected": -4.388247489929199,
"step": 1470
},
{
"epoch": 0.3550863723608445,
"grad_norm": 17.953634346330745,
"learning_rate": 4.072948388088515e-07,
"logits/chosen": -1.1399272680282593,
"logits/rejected": -1.185240387916565,
"logps/chosen": -480.47869873046875,
"logps/rejected": -711.10107421875,
"loss": 0.4749,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.0881600379943848,
"rewards/margins": 2.2733359336853027,
"rewards/rejected": -4.3614959716796875,
"step": 1480
},
{
"epoch": 0.3574856046065259,
"grad_norm": 15.479182257867892,
"learning_rate": 4.056618851707334e-07,
"logits/chosen": -1.2174913883209229,
"logits/rejected": -1.3078429698944092,
"logps/chosen": -448.73626708984375,
"logps/rejected": -778.0083618164062,
"loss": 0.4081,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.7068407535552979,
"rewards/margins": 3.3156890869140625,
"rewards/rejected": -5.022529602050781,
"step": 1490
},
{
"epoch": 0.3598848368522073,
"grad_norm": 12.258518175047803,
"learning_rate": 4.0401801246980675e-07,
"logits/chosen": -1.3552016019821167,
"logits/rejected": -1.4195324182510376,
"logps/chosen": -483.03607177734375,
"logps/rejected": -829.8291015625,
"loss": 0.4493,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.638827323913574,
"rewards/margins": 3.691737651824951,
"rewards/rejected": -6.330564498901367,
"step": 1500
},
{
"epoch": 0.3622840690978887,
"grad_norm": 12.234674147688299,
"learning_rate": 4.0236333601721043e-07,
"logits/chosen": -1.267978310585022,
"logits/rejected": -1.2512781620025635,
"logps/chosen": -463.58636474609375,
"logps/rejected": -589.7911376953125,
"loss": 0.4873,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8561378717422485,
"rewards/margins": 1.1768932342529297,
"rewards/rejected": -3.0330309867858887,
"step": 1510
},
{
"epoch": 0.3646833013435701,
"grad_norm": 14.004595080556923,
"learning_rate": 4.0069797188192364e-07,
"logits/chosen": -1.1959376335144043,
"logits/rejected": -1.1977354288101196,
"logps/chosen": -557.3121337890625,
"logps/rejected": -894.349609375,
"loss": 0.4741,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.671919345855713,
"rewards/margins": 3.7750651836395264,
"rewards/rejected": -6.446984767913818,
"step": 1520
},
{
"epoch": 0.3670825335892514,
"grad_norm": 12.107627085595226,
"learning_rate": 3.9902203688262417e-07,
"logits/chosen": -1.2063888311386108,
"logits/rejected": -1.294390082359314,
"logps/chosen": -402.6402282714844,
"logps/rejected": -547.6096801757812,
"loss": 0.4266,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.433250904083252,
"rewards/margins": 1.5607125759124756,
"rewards/rejected": -2.9939634799957275,
"step": 1530
},
{
"epoch": 0.3694817658349328,
"grad_norm": 17.020506697194886,
"learning_rate": 3.9733564857949365e-07,
"logits/chosen": -1.2349357604980469,
"logits/rejected": -1.3519177436828613,
"logps/chosen": -479.85992431640625,
"logps/rejected": -652.7718505859375,
"loss": 0.4142,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8144111633300781,
"rewards/margins": 2.1435914039611816,
"rewards/rejected": -3.958002805709839,
"step": 1540
},
{
"epoch": 0.3718809980806142,
"grad_norm": 17.340247743555018,
"learning_rate": 3.9563892526597177e-07,
"logits/chosen": -1.342357873916626,
"logits/rejected": -1.2987167835235596,
"logps/chosen": -361.8048400878906,
"logps/rejected": -495.60577392578125,
"loss": 0.4381,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3809950351715088,
"rewards/margins": 0.9643322825431824,
"rewards/rejected": -2.345327138900757,
"step": 1550
},
{
"epoch": 0.3742802303262956,
"grad_norm": 12.57132648844664,
"learning_rate": 3.9393198596045795e-07,
"logits/chosen": -1.2646602392196655,
"logits/rejected": -1.227052927017212,
"logps/chosen": -390.56402587890625,
"logps/rejected": -566.3883666992188,
"loss": 0.4795,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.529387354850769,
"rewards/margins": 1.646651029586792,
"rewards/rejected": -3.176038980484009,
"step": 1560
},
{
"epoch": 0.376679462571977,
"grad_norm": 9.769186078821475,
"learning_rate": 3.922149503979628e-07,
"logits/chosen": -1.0911178588867188,
"logits/rejected": -1.1452914476394653,
"logps/chosen": -446.0682678222656,
"logps/rejected": -891.4172973632812,
"loss": 0.421,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7286949157714844,
"rewards/margins": 4.412930011749268,
"rewards/rejected": -6.141624927520752,
"step": 1570
},
{
"epoch": 0.3790786948176583,
"grad_norm": 15.13387973536505,
"learning_rate": 3.904879390217095e-07,
"logits/chosen": -1.2228879928588867,
"logits/rejected": -1.2863072156906128,
"logps/chosen": -414.58251953125,
"logps/rejected": -585.94189453125,
"loss": 0.4376,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5823562145233154,
"rewards/margins": 1.9467941522598267,
"rewards/rejected": -3.529151201248169,
"step": 1580
},
{
"epoch": 0.3814779270633397,
"grad_norm": 15.692773841294583,
"learning_rate": 3.8875107297468463e-07,
"logits/chosen": -1.1607332229614258,
"logits/rejected": -1.1135156154632568,
"logps/chosen": -394.6246032714844,
"logps/rejected": -685.8594970703125,
"loss": 0.4893,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5657716989517212,
"rewards/margins": 2.4296836853027344,
"rewards/rejected": -3.995455503463745,
"step": 1590
},
{
"epoch": 0.3838771593090211,
"grad_norm": 12.22013342174461,
"learning_rate": 3.87004474091141e-07,
"logits/chosen": -1.0785077810287476,
"logits/rejected": -1.145101547241211,
"logps/chosen": -388.1426696777344,
"logps/rejected": -562.6093139648438,
"loss": 0.45,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.6640561819076538,
"rewards/margins": 1.593210220336914,
"rewards/rejected": -3.2572665214538574,
"step": 1600
},
{
"epoch": 0.3862763915547025,
"grad_norm": 12.125863891345588,
"learning_rate": 3.8524826488805114e-07,
"logits/chosen": -1.2912501096725464,
"logits/rejected": -1.281894326210022,
"logps/chosen": -448.44403076171875,
"logps/rejected": -577.4244384765625,
"loss": 0.4996,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6368385553359985,
"rewards/margins": 1.7040477991104126,
"rewards/rejected": -3.3408865928649902,
"step": 1610
},
{
"epoch": 0.3886756238003839,
"grad_norm": 14.772144560930354,
"learning_rate": 3.834825685565133e-07,
"logits/chosen": -1.2755136489868164,
"logits/rejected": -1.3778313398361206,
"logps/chosen": -365.7793273925781,
"logps/rejected": -445.6787109375,
"loss": 0.4148,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2803051471710205,
"rewards/margins": 1.1510274410247803,
"rewards/rejected": -2.43133282661438,
"step": 1620
},
{
"epoch": 0.39107485604606523,
"grad_norm": 19.441822036099662,
"learning_rate": 3.8170750895311007e-07,
"logits/chosen": -1.168717622756958,
"logits/rejected": -1.1627219915390015,
"logps/chosen": -418.5533752441406,
"logps/rejected": -577.6104736328125,
"loss": 0.4001,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3612974882125854,
"rewards/margins": 1.8237950801849365,
"rewards/rejected": -3.1850924491882324,
"step": 1630
},
{
"epoch": 0.3934740882917466,
"grad_norm": 15.432355872695538,
"learning_rate": 3.7992321059122045e-07,
"logits/chosen": -1.1575825214385986,
"logits/rejected": -1.2952228784561157,
"logps/chosen": -471.82476806640625,
"logps/rejected": -670.2379150390625,
"loss": 0.4553,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.2020068168640137,
"rewards/margins": 2.1685726642608643,
"rewards/rejected": -4.370579242706299,
"step": 1640
},
{
"epoch": 0.395873320537428,
"grad_norm": 12.344277319747519,
"learning_rate": 3.7812979863228576e-07,
"logits/chosen": -1.3181376457214355,
"logits/rejected": -1.3618929386138916,
"logps/chosen": -485.92034912109375,
"logps/rejected": -651.4783935546875,
"loss": 0.4458,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.687350034713745,
"rewards/margins": 1.5618057250976562,
"rewards/rejected": -4.2491559982299805,
"step": 1650
},
{
"epoch": 0.3982725527831094,
"grad_norm": 15.718103189453073,
"learning_rate": 3.763273988770296e-07,
"logits/chosen": -1.1789578199386597,
"logits/rejected": -1.2662181854248047,
"logps/chosen": -411.53680419921875,
"logps/rejected": -600.5362548828125,
"loss": 0.4555,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.7146021127700806,
"rewards/margins": 1.884450912475586,
"rewards/rejected": -3.599053144454956,
"step": 1660
},
{
"epoch": 0.4006717850287908,
"grad_norm": 12.906974103488265,
"learning_rate": 3.7451613775663405e-07,
"logits/chosen": -1.1591131687164307,
"logits/rejected": -1.1079394817352295,
"logps/chosen": -392.81610107421875,
"logps/rejected": -686.8541259765625,
"loss": 0.4617,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5098912715911865,
"rewards/margins": 2.9055066108703613,
"rewards/rejected": -4.415398120880127,
"step": 1670
},
{
"epoch": 0.40307101727447214,
"grad_norm": 17.744184430696986,
"learning_rate": 3.726961423238706e-07,
"logits/chosen": -1.2879854440689087,
"logits/rejected": -1.2753901481628418,
"logps/chosen": -382.4233703613281,
"logps/rejected": -630.2597045898438,
"loss": 0.4446,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5524839162826538,
"rewards/margins": 2.3134608268737793,
"rewards/rejected": -3.8659446239471436,
"step": 1680
},
{
"epoch": 0.40547024952015354,
"grad_norm": 15.938147948338413,
"learning_rate": 3.708675402441882e-07,
"logits/chosen": -1.146555781364441,
"logits/rejected": -1.3221074342727661,
"logps/chosen": -438.2669372558594,
"logps/rejected": -592.5673217773438,
"loss": 0.4849,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6606773138046265,
"rewards/margins": 1.8763577938079834,
"rewards/rejected": -3.5370349884033203,
"step": 1690
},
{
"epoch": 0.40786948176583493,
"grad_norm": 15.397723594978256,
"learning_rate": 3.6903045978675775e-07,
"logits/chosen": -1.1809333562850952,
"logits/rejected": -1.2159626483917236,
"logps/chosen": -393.14300537109375,
"logps/rejected": -634.4575805664062,
"loss": 0.4445,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5431654453277588,
"rewards/margins": 2.6182663440704346,
"rewards/rejected": -4.161431789398193,
"step": 1700
},
{
"epoch": 0.4102687140115163,
"grad_norm": 12.520789981032921,
"learning_rate": 3.6718502981547474e-07,
"logits/chosen": -1.2585941553115845,
"logits/rejected": -1.2499480247497559,
"logps/chosen": -395.173828125,
"logps/rejected": -598.7399291992188,
"loss": 0.4262,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.478262186050415,
"rewards/margins": 1.6322393417358398,
"rewards/rejected": -3.110501527786255,
"step": 1710
},
{
"epoch": 0.4126679462571977,
"grad_norm": 14.28113094156497,
"learning_rate": 3.6533137977991986e-07,
"logits/chosen": -1.1053855419158936,
"logits/rejected": -1.1341431140899658,
"logps/chosen": -424.7064514160156,
"logps/rejected": -587.2310791015625,
"loss": 0.5011,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4273463487625122,
"rewards/margins": 1.3417994976043701,
"rewards/rejected": -2.769145965576172,
"step": 1720
},
{
"epoch": 0.41506717850287905,
"grad_norm": 11.265817004608927,
"learning_rate": 3.6346963970627865e-07,
"logits/chosen": -1.1037083864212036,
"logits/rejected": -1.0224764347076416,
"logps/chosen": -393.5522766113281,
"logps/rejected": -614.2374877929688,
"loss": 0.4456,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5364679098129272,
"rewards/margins": 2.105459690093994,
"rewards/rejected": -3.641927719116211,
"step": 1730
},
{
"epoch": 0.41746641074856045,
"grad_norm": 11.977913591571605,
"learning_rate": 3.615999401882207e-07,
"logits/chosen": -1.3552181720733643,
"logits/rejected": -1.3371044397354126,
"logps/chosen": -418.01397705078125,
"logps/rejected": -775.3851318359375,
"loss": 0.453,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.8135935068130493,
"rewards/margins": 3.5035622119903564,
"rewards/rejected": -5.317155361175537,
"step": 1740
},
{
"epoch": 0.41986564299424184,
"grad_norm": 11.503365691015834,
"learning_rate": 3.597224123777389e-07,
"logits/chosen": -1.2816386222839355,
"logits/rejected": -1.2878687381744385,
"logps/chosen": -511.79852294921875,
"logps/rejected": -887.9742431640625,
"loss": 0.447,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.595597743988037,
"rewards/margins": 3.664611339569092,
"rewards/rejected": -6.260209083557129,
"step": 1750
},
{
"epoch": 0.42226487523992323,
"grad_norm": 16.17377166072833,
"learning_rate": 3.5783718797595e-07,
"logits/chosen": -1.2984836101531982,
"logits/rejected": -1.399364709854126,
"logps/chosen": -505.53204345703125,
"logps/rejected": -702.7299194335938,
"loss": 0.4559,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.0332143306732178,
"rewards/margins": 2.416463851928711,
"rewards/rejected": -4.449678421020508,
"step": 1760
},
{
"epoch": 0.4246641074856046,
"grad_norm": 12.49839489285334,
"learning_rate": 3.559443992238558e-07,
"logits/chosen": -1.3506227731704712,
"logits/rejected": -1.409407615661621,
"logps/chosen": -397.6769714355469,
"logps/rejected": -840.98095703125,
"loss": 0.4406,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.526439905166626,
"rewards/margins": 4.196308135986328,
"rewards/rejected": -5.722747802734375,
"step": 1770
},
{
"epoch": 0.42706333973128596,
"grad_norm": 10.138858801029569,
"learning_rate": 3.540441788930673e-07,
"logits/chosen": -1.3410276174545288,
"logits/rejected": -1.414222002029419,
"logps/chosen": -491.10986328125,
"logps/rejected": -747.819091796875,
"loss": 0.4169,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.9301306009292603,
"rewards/margins": 2.9560461044311523,
"rewards/rejected": -4.886176109313965,
"step": 1780
},
{
"epoch": 0.42946257197696736,
"grad_norm": 14.446911222851618,
"learning_rate": 3.5213666027649123e-07,
"logits/chosen": -1.3879473209381104,
"logits/rejected": -1.5012261867523193,
"logps/chosen": -495.0065002441406,
"logps/rejected": -607.099853515625,
"loss": 0.4593,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.185396194458008,
"rewards/margins": 1.5175437927246094,
"rewards/rejected": -3.702939510345459,
"step": 1790
},
{
"epoch": 0.43186180422264875,
"grad_norm": 15.239290825165414,
"learning_rate": 3.5022197717898017e-07,
"logits/chosen": -1.2657089233398438,
"logits/rejected": -1.4074336290359497,
"logps/chosen": -394.63385009765625,
"logps/rejected": -730.5569458007812,
"loss": 0.3917,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7406047582626343,
"rewards/margins": 3.7480709552764893,
"rewards/rejected": -5.488675594329834,
"step": 1800
},
{
"epoch": 0.43426103646833014,
"grad_norm": 18.539564378032264,
"learning_rate": 3.4830026390794633e-07,
"logits/chosen": -1.373583436012268,
"logits/rejected": -1.449741244316101,
"logps/chosen": -525.2714233398438,
"logps/rejected": -705.874755859375,
"loss": 0.4023,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.3716514110565186,
"rewards/margins": 2.285885810852051,
"rewards/rejected": -4.657536506652832,
"step": 1810
},
{
"epoch": 0.43666026871401153,
"grad_norm": 15.392536337629252,
"learning_rate": 3.4637165526394104e-07,
"logits/chosen": -1.4993913173675537,
"logits/rejected": -1.5507056713104248,
"logps/chosen": -438.17315673828125,
"logps/rejected": -669.3710327148438,
"loss": 0.4354,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.0398783683776855,
"rewards/margins": 2.2368016242980957,
"rewards/rejected": -4.276679515838623,
"step": 1820
},
{
"epoch": 0.43905950095969287,
"grad_norm": 10.083282846912516,
"learning_rate": 3.4443628653119814e-07,
"logits/chosen": -1.2605036497116089,
"logits/rejected": -1.272882103919983,
"logps/chosen": -459.35870361328125,
"logps/rejected": -793.3179931640625,
"loss": 0.4781,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.9037319421768188,
"rewards/margins": 2.950524091720581,
"rewards/rejected": -4.854256629943848,
"step": 1830
},
{
"epoch": 0.44145873320537427,
"grad_norm": 17.777066285729536,
"learning_rate": 3.424942934681453e-07,
"logits/chosen": -1.2640819549560547,
"logits/rejected": -1.4133803844451904,
"logps/chosen": -365.7422790527344,
"logps/rejected": -581.9953002929688,
"loss": 0.4291,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3039973974227905,
"rewards/margins": 2.1755900382995605,
"rewards/rejected": -3.4795870780944824,
"step": 1840
},
{
"epoch": 0.44385796545105566,
"grad_norm": 21.190089730860404,
"learning_rate": 3.405458122978804e-07,
"logits/chosen": -1.2760121822357178,
"logits/rejected": -1.3037431240081787,
"logps/chosen": -427.647216796875,
"logps/rejected": -589.5700073242188,
"loss": 0.4056,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.554579496383667,
"rewards/margins": 1.9800221920013428,
"rewards/rejected": -3.5346016883850098,
"step": 1850
},
{
"epoch": 0.44625719769673705,
"grad_norm": 19.52879089362984,
"learning_rate": 3.3859097969861633e-07,
"logits/chosen": -1.2224397659301758,
"logits/rejected": -1.2577435970306396,
"logps/chosen": -464.634033203125,
"logps/rejected": -659.4881591796875,
"loss": 0.4462,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7432029247283936,
"rewards/margins": 2.287348508834839,
"rewards/rejected": -4.030551433563232,
"step": 1860
},
{
"epoch": 0.44865642994241844,
"grad_norm": 15.71172140020582,
"learning_rate": 3.366299327940936e-07,
"logits/chosen": -1.254504680633545,
"logits/rejected": -1.188957929611206,
"logps/chosen": -473.3033142089844,
"logps/rejected": -726.52392578125,
"loss": 0.4147,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.8812453746795654,
"rewards/margins": 2.339358329772949,
"rewards/rejected": -4.2206034660339355,
"step": 1870
},
{
"epoch": 0.4510556621880998,
"grad_norm": 13.22641361891775,
"learning_rate": 3.3466280914396117e-07,
"logits/chosen": -1.2382128238677979,
"logits/rejected": -1.2535500526428223,
"logps/chosen": -443.01007080078125,
"logps/rejected": -681.6619262695312,
"loss": 0.4319,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.062727212905884,
"rewards/margins": 2.200326919555664,
"rewards/rejected": -4.2630534172058105,
"step": 1880
},
{
"epoch": 0.4534548944337812,
"grad_norm": 13.21370692192793,
"learning_rate": 3.326897467341281e-07,
"logits/chosen": -1.234937310218811,
"logits/rejected": -1.3355581760406494,
"logps/chosen": -445.6819763183594,
"logps/rejected": -728.594970703125,
"loss": 0.4254,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.3737926483154297,
"rewards/margins": 2.7364139556884766,
"rewards/rejected": -5.110206127166748,
"step": 1890
},
{
"epoch": 0.45585412667946257,
"grad_norm": 13.78564096806611,
"learning_rate": 3.3071088396708335e-07,
"logits/chosen": -1.2770905494689941,
"logits/rejected": -1.240928292274475,
"logps/chosen": -391.79736328125,
"logps/rejected": -723.361083984375,
"loss": 0.4619,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.881150484085083,
"rewards/margins": 3.0526633262634277,
"rewards/rejected": -4.93381404876709,
"step": 1900
},
{
"epoch": 0.45825335892514396,
"grad_norm": 13.853814670591044,
"learning_rate": 3.2872635965218824e-07,
"logits/chosen": -1.2004590034484863,
"logits/rejected": -1.2370150089263916,
"logps/chosen": -512.8619995117188,
"logps/rejected": -702.7752685546875,
"loss": 0.4878,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.5109477043151855,
"rewards/margins": 1.8365228176116943,
"rewards/rejected": -4.347470283508301,
"step": 1910
},
{
"epoch": 0.46065259117082535,
"grad_norm": 10.692863270526209,
"learning_rate": 3.2673631299593905e-07,
"logits/chosen": -1.173227310180664,
"logits/rejected": -1.3409078121185303,
"logps/chosen": -474.7144470214844,
"logps/rejected": -680.1087036132812,
"loss": 0.4399,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.028782606124878,
"rewards/margins": 2.1353182792663574,
"rewards/rejected": -4.164100646972656,
"step": 1920
},
{
"epoch": 0.4630518234165067,
"grad_norm": 14.233116791502368,
"learning_rate": 3.247408835922024e-07,
"logits/chosen": -1.320565104484558,
"logits/rejected": -1.2968575954437256,
"logps/chosen": -573.8089599609375,
"logps/rejected": -786.044189453125,
"loss": 0.4368,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.6720101833343506,
"rewards/margins": 2.062215566635132,
"rewards/rejected": -4.734226226806641,
"step": 1930
},
{
"epoch": 0.4654510556621881,
"grad_norm": 15.685451696438584,
"learning_rate": 3.2274021141242306e-07,
"logits/chosen": -1.255118489265442,
"logits/rejected": -1.320111870765686,
"logps/chosen": -484.41583251953125,
"logps/rejected": -714.25244140625,
"loss": 0.4476,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.30792236328125,
"rewards/margins": 2.1702935695648193,
"rewards/rejected": -4.47821569442749,
"step": 1940
},
{
"epoch": 0.4678502879078695,
"grad_norm": 16.009709587203158,
"learning_rate": 3.2073443679580613e-07,
"logits/chosen": -1.112594723701477,
"logits/rejected": -1.2054228782653809,
"logps/chosen": -423.2367248535156,
"logps/rejected": -543.6708984375,
"loss": 0.4427,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6221225261688232,
"rewards/margins": 1.2181508541107178,
"rewards/rejected": -2.840273380279541,
"step": 1950
},
{
"epoch": 0.47024952015355087,
"grad_norm": 11.690186448497471,
"learning_rate": 3.1872370043947194e-07,
"logits/chosen": -1.2797791957855225,
"logits/rejected": -1.3246345520019531,
"logps/chosen": -414.15234375,
"logps/rejected": -695.158203125,
"loss": 0.4033,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.4832648038864136,
"rewards/margins": 2.9206230640411377,
"rewards/rejected": -4.40388822555542,
"step": 1960
},
{
"epoch": 0.47264875239923226,
"grad_norm": 20.104770187290267,
"learning_rate": 3.167081433885874e-07,
"logits/chosen": -1.030788779258728,
"logits/rejected": -1.1015684604644775,
"logps/chosen": -513.7273559570312,
"logps/rejected": -811.976318359375,
"loss": 0.3855,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.102285623550415,
"rewards/margins": 2.593528985977173,
"rewards/rejected": -4.695814609527588,
"step": 1970
},
{
"epoch": 0.4750479846449136,
"grad_norm": 22.35702689198335,
"learning_rate": 3.14687907026472e-07,
"logits/chosen": -1.1791191101074219,
"logits/rejected": -1.2959524393081665,
"logps/chosen": -441.61846923828125,
"logps/rejected": -704.8765869140625,
"loss": 0.409,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.2501838207244873,
"rewards/margins": 2.3681230545043945,
"rewards/rejected": -4.618307113647461,
"step": 1980
},
{
"epoch": 0.477447216890595,
"grad_norm": 16.66080453274856,
"learning_rate": 3.126631330646801e-07,
"logits/chosen": -1.2385426759719849,
"logits/rejected": -1.3393757343292236,
"logps/chosen": -572.3016357421875,
"logps/rejected": -789.0521850585938,
"loss": 0.4564,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.771270275115967,
"rewards/margins": 2.124401807785034,
"rewards/rejected": -4.895671844482422,
"step": 1990
},
{
"epoch": 0.4798464491362764,
"grad_norm": 13.605651174190195,
"learning_rate": 3.1063396353306097e-07,
"logits/chosen": -1.2395048141479492,
"logits/rejected": -1.3828445672988892,
"logps/chosen": -438.51312255859375,
"logps/rejected": -621.2369384765625,
"loss": 0.4486,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7979936599731445,
"rewards/margins": 2.1893625259399414,
"rewards/rejected": -3.987356185913086,
"step": 2000
},
{
"epoch": 0.4822456813819578,
"grad_norm": 13.924076058423626,
"learning_rate": 3.0860054076979535e-07,
"logits/chosen": -1.2428590059280396,
"logits/rejected": -1.236566185951233,
"logps/chosen": -467.281005859375,
"logps/rejected": -621.5033569335938,
"loss": 0.4229,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.916424036026001,
"rewards/margins": 1.8854621648788452,
"rewards/rejected": -3.8018863201141357,
"step": 2010
},
{
"epoch": 0.4846449136276392,
"grad_norm": 10.659549298550333,
"learning_rate": 3.065630074114115e-07,
"logits/chosen": -1.2107280492782593,
"logits/rejected": -1.2906858921051025,
"logps/chosen": -437.7901916503906,
"logps/rejected": -730.5772705078125,
"loss": 0.4458,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5543664693832397,
"rewards/margins": 3.3551418781280518,
"rewards/rejected": -4.90950870513916,
"step": 2020
},
{
"epoch": 0.4870441458733205,
"grad_norm": 11.89640175081092,
"learning_rate": 3.0452150638277947e-07,
"logits/chosen": -1.0864282846450806,
"logits/rejected": -1.0442813634872437,
"logps/chosen": -426.6461486816406,
"logps/rejected": -630.3248291015625,
"loss": 0.4587,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.0740158557891846,
"rewards/margins": 1.9162708520889282,
"rewards/rejected": -3.9902865886688232,
"step": 2030
},
{
"epoch": 0.4894433781190019,
"grad_norm": 11.776367189768084,
"learning_rate": 3.024761808870856e-07,
"logits/chosen": -1.307191014289856,
"logits/rejected": -1.312293291091919,
"logps/chosen": -370.92791748046875,
"logps/rejected": -650.8178100585938,
"loss": 0.3818,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.276861310005188,
"rewards/margins": 2.857835531234741,
"rewards/rejected": -4.1346964836120605,
"step": 2040
},
{
"epoch": 0.4918426103646833,
"grad_norm": 25.099913631834486,
"learning_rate": 3.004271743957875e-07,
"logits/chosen": -1.1278274059295654,
"logits/rejected": -1.1356306076049805,
"logps/chosen": -495.9942321777344,
"logps/rejected": -648.2416381835938,
"loss": 0.4424,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.38657808303833,
"rewards/margins": 1.207850694656372,
"rewards/rejected": -3.594428539276123,
"step": 2050
},
{
"epoch": 0.4942418426103647,
"grad_norm": 12.705340632883466,
"learning_rate": 2.983746306385499e-07,
"logits/chosen": -1.3075058460235596,
"logits/rejected": -1.2933059930801392,
"logps/chosen": -434.21820068359375,
"logps/rejected": -625.4400024414062,
"loss": 0.4253,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8787765502929688,
"rewards/margins": 1.7934147119522095,
"rewards/rejected": -3.6721911430358887,
"step": 2060
},
{
"epoch": 0.4966410748560461,
"grad_norm": 13.755685476664564,
"learning_rate": 2.963186935931628e-07,
"logits/chosen": -1.215529203414917,
"logits/rejected": -1.2133440971374512,
"logps/chosen": -403.08465576171875,
"logps/rejected": -577.1477661132812,
"loss": 0.3959,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.499050259590149,
"rewards/margins": 1.7851158380508423,
"rewards/rejected": -3.284165859222412,
"step": 2070
},
{
"epoch": 0.4990403071017274,
"grad_norm": 13.292089170044676,
"learning_rate": 2.9425950747544176e-07,
"logits/chosen": -1.1674937009811401,
"logits/rejected": -1.3074105978012085,
"logps/chosen": -525.9224243164062,
"logps/rejected": -763.0833740234375,
"loss": 0.423,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.349907398223877,
"rewards/margins": 2.648500919342041,
"rewards/rejected": -4.998408317565918,
"step": 2080
},
{
"epoch": 0.5014395393474088,
"grad_norm": 18.455895560701023,
"learning_rate": 2.921972167291119e-07,
"logits/chosen": -1.1788341999053955,
"logits/rejected": -1.2539576292037964,
"logps/chosen": -472.1956481933594,
"logps/rejected": -670.5008544921875,
"loss": 0.4338,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9172461032867432,
"rewards/margins": 2.0367343425750732,
"rewards/rejected": -3.9539802074432373,
"step": 2090
},
{
"epoch": 0.5038387715930902,
"grad_norm": 13.471771310006998,
"learning_rate": 2.9013196601567567e-07,
"logits/chosen": -1.1498690843582153,
"logits/rejected": -1.1755589246749878,
"logps/chosen": -407.24163818359375,
"logps/rejected": -575.2977905273438,
"loss": 0.5056,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5899887084960938,
"rewards/margins": 1.5084218978881836,
"rewards/rejected": -3.0984106063842773,
"step": 2100
},
{
"epoch": 0.5062380038387716,
"grad_norm": 15.761320703617217,
"learning_rate": 2.8806390020426555e-07,
"logits/chosen": -1.139108419418335,
"logits/rejected": -1.1348917484283447,
"logps/chosen": -422.84893798828125,
"logps/rejected": -555.7877807617188,
"loss": 0.4274,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4848568439483643,
"rewards/margins": 1.3276598453521729,
"rewards/rejected": -2.812516689300537,
"step": 2110
},
{
"epoch": 0.508637236084453,
"grad_norm": 17.997023645014245,
"learning_rate": 2.8599316436148187e-07,
"logits/chosen": -1.2534973621368408,
"logits/rejected": -1.2950940132141113,
"logps/chosen": -413.71490478515625,
"logps/rejected": -538.5707397460938,
"loss": 0.446,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.67266047000885,
"rewards/margins": 1.2678474187850952,
"rewards/rejected": -2.9405078887939453,
"step": 2120
},
{
"epoch": 0.5110364683301344,
"grad_norm": 12.638370779776375,
"learning_rate": 2.8391990374121723e-07,
"logits/chosen": -1.288747787475586,
"logits/rejected": -1.3166449069976807,
"logps/chosen": -454.18359375,
"logps/rejected": -724.0892333984375,
"loss": 0.4079,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.0871493816375732,
"rewards/margins": 2.4171881675720215,
"rewards/rejected": -4.504337310791016,
"step": 2130
},
{
"epoch": 0.5134357005758158,
"grad_norm": 19.25266010844361,
"learning_rate": 2.818442637744669e-07,
"logits/chosen": -1.332960844039917,
"logits/rejected": -1.3756061792373657,
"logps/chosen": -447.2140197753906,
"logps/rejected": -657.7930297851562,
"loss": 0.4256,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.023087978363037,
"rewards/margins": 2.0285849571228027,
"rewards/rejected": -4.05167293548584,
"step": 2140
},
{
"epoch": 0.5158349328214972,
"grad_norm": 17.434973094318284,
"learning_rate": 2.797663900591284e-07,
"logits/chosen": -1.320966124534607,
"logits/rejected": -1.4106026887893677,
"logps/chosen": -476.49267578125,
"logps/rejected": -632.2120361328125,
"loss": 0.3841,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.1607255935668945,
"rewards/margins": 1.7891597747802734,
"rewards/rejected": -3.949885129928589,
"step": 2150
},
{
"epoch": 0.5182341650671785,
"grad_norm": 17.474868196021465,
"learning_rate": 2.776864283497874e-07,
"logits/chosen": -1.2660505771636963,
"logits/rejected": -1.3750221729278564,
"logps/chosen": -442.905029296875,
"logps/rejected": -755.1177978515625,
"loss": 0.4058,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.1417970657348633,
"rewards/margins": 3.1892342567443848,
"rewards/rejected": -5.331031322479248,
"step": 2160
},
{
"epoch": 0.5206333973128598,
"grad_norm": 14.837534970558897,
"learning_rate": 2.756045245474943e-07,
"logits/chosen": -1.1648900508880615,
"logits/rejected": -1.1385093927383423,
"logps/chosen": -472.541015625,
"logps/rejected": -685.985107421875,
"loss": 0.4347,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.109835147857666,
"rewards/margins": 2.0320799350738525,
"rewards/rejected": -4.1419148445129395,
"step": 2170
},
{
"epoch": 0.5230326295585412,
"grad_norm": 12.732468692827648,
"learning_rate": 2.7352082468952977e-07,
"logits/chosen": -1.1894464492797852,
"logits/rejected": -1.2552679777145386,
"logps/chosen": -484.76409912109375,
"logps/rejected": -814.096435546875,
"loss": 0.4487,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.386613130569458,
"rewards/margins": 3.1459081172943115,
"rewards/rejected": -5.532520771026611,
"step": 2180
},
{
"epoch": 0.5254318618042226,
"grad_norm": 15.991158601320864,
"learning_rate": 2.7143547493916e-07,
"logits/chosen": -1.2773798704147339,
"logits/rejected": -1.2588646411895752,
"logps/chosen": -438.006591796875,
"logps/rejected": -779.8575439453125,
"loss": 0.4461,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.8076789379119873,
"rewards/margins": 3.383274793624878,
"rewards/rejected": -5.190953254699707,
"step": 2190
},
{
"epoch": 0.527831094049904,
"grad_norm": 15.0413189840848,
"learning_rate": 2.693486215753853e-07,
"logits/chosen": -1.2713805437088013,
"logits/rejected": -1.2931923866271973,
"logps/chosen": -474.3173828125,
"logps/rejected": -771.8619384765625,
"loss": 0.4418,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.2440552711486816,
"rewards/margins": 3.1775963306427,
"rewards/rejected": -5.421651840209961,
"step": 2200
},
{
"epoch": 0.5302303262955854,
"grad_norm": 12.016086648025393,
"learning_rate": 2.6726041098267805e-07,
"logits/chosen": -1.1932638883590698,
"logits/rejected": -1.2385377883911133,
"logps/chosen": -468.5335998535156,
"logps/rejected": -564.7030029296875,
"loss": 0.4914,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.7891786098480225,
"rewards/margins": 1.3260656595230103,
"rewards/rejected": -3.1152443885803223,
"step": 2210
},
{
"epoch": 0.5326295585412668,
"grad_norm": 15.752188949724715,
"learning_rate": 2.6517098964071507e-07,
"logits/chosen": -1.2558810710906982,
"logits/rejected": -1.3106260299682617,
"logps/chosen": -388.71038818359375,
"logps/rejected": -497.370361328125,
"loss": 0.4766,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.3813974857330322,
"rewards/margins": 0.8968814015388489,
"rewards/rejected": -2.2782788276672363,
"step": 2220
},
{
"epoch": 0.5350287907869482,
"grad_norm": 17.371358487239537,
"learning_rate": 2.630805041141023e-07,
"logits/chosen": -1.3759686946868896,
"logits/rejected": -1.3909227848052979,
"logps/chosen": -353.22833251953125,
"logps/rejected": -674.2232666015625,
"loss": 0.4387,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.275880217552185,
"rewards/margins": 2.9993698596954346,
"rewards/rejected": -4.275249481201172,
"step": 2230
},
{
"epoch": 0.5374280230326296,
"grad_norm": 15.68257354415993,
"learning_rate": 2.609891010420941e-07,
"logits/chosen": -1.3854516744613647,
"logits/rejected": -1.3968112468719482,
"logps/chosen": -449.6177673339844,
"logps/rejected": -670.1103515625,
"loss": 0.4195,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7667739391326904,
"rewards/margins": 2.2145369052886963,
"rewards/rejected": -3.981311082839966,
"step": 2240
},
{
"epoch": 0.539827255278311,
"grad_norm": 17.255086501798605,
"learning_rate": 2.5889692712830674e-07,
"logits/chosen": -1.211531400680542,
"logits/rejected": -1.283849835395813,
"logps/chosen": -389.5179138183594,
"logps/rejected": -602.1033325195312,
"loss": 0.3883,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6021864414215088,
"rewards/margins": 2.1839447021484375,
"rewards/rejected": -3.786130905151367,
"step": 2250
},
{
"epoch": 0.5422264875239923,
"grad_norm": 23.697806480913663,
"learning_rate": 2.5680412913042843e-07,
"logits/chosen": -1.4293580055236816,
"logits/rejected": -1.4167674779891968,
"logps/chosen": -482.10028076171875,
"logps/rejected": -786.3612060546875,
"loss": 0.4341,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.2943575382232666,
"rewards/margins": 3.070589065551758,
"rewards/rejected": -5.3649468421936035,
"step": 2260
},
{
"epoch": 0.5446257197696737,
"grad_norm": 20.5376691734358,
"learning_rate": 2.5471085384992404e-07,
"logits/chosen": -1.3646225929260254,
"logits/rejected": -1.3295977115631104,
"logps/chosen": -490.397705078125,
"logps/rejected": -880.2039184570312,
"loss": 0.4117,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.3713932037353516,
"rewards/margins": 3.905015468597412,
"rewards/rejected": -6.2764081954956055,
"step": 2270
},
{
"epoch": 0.5470249520153551,
"grad_norm": 33.70464819485353,
"learning_rate": 2.526172481217381e-07,
"logits/chosen": -1.345577597618103,
"logits/rejected": -1.318164587020874,
"logps/chosen": -443.73492431640625,
"logps/rejected": -621.4215698242188,
"loss": 0.4513,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.3964602947235107,
"rewards/margins": 1.5979655981063843,
"rewards/rejected": -3.9944260120391846,
"step": 2280
},
{
"epoch": 0.5494241842610365,
"grad_norm": 14.933194013231358,
"learning_rate": 2.5052345880399456e-07,
"logits/chosen": -1.3429863452911377,
"logits/rejected": -1.419154405593872,
"logps/chosen": -432.6609802246094,
"logps/rejected": -600.9601440429688,
"loss": 0.423,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.0265350341796875,
"rewards/margins": 1.6890513896942139,
"rewards/rejected": -3.7155869007110596,
"step": 2290
},
{
"epoch": 0.5518234165067178,
"grad_norm": 16.16343931231014,
"learning_rate": 2.4842963276769555e-07,
"logits/chosen": -1.3177053928375244,
"logits/rejected": -1.2920299768447876,
"logps/chosen": -412.7960510253906,
"logps/rejected": -659.1737060546875,
"loss": 0.4355,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.99508535861969,
"rewards/margins": 2.033306360244751,
"rewards/rejected": -4.0283918380737305,
"step": 2300
},
{
"epoch": 0.5542226487523992,
"grad_norm": 15.4620970327947,
"learning_rate": 2.463359168864189e-07,
"logits/chosen": -1.1979784965515137,
"logits/rejected": -1.3762614727020264,
"logps/chosen": -500.4812927246094,
"logps/rejected": -641.75537109375,
"loss": 0.4751,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.1022884845733643,
"rewards/margins": 1.735335111618042,
"rewards/rejected": -3.8376235961914062,
"step": 2310
},
{
"epoch": 0.5566218809980806,
"grad_norm": 17.931731909002746,
"learning_rate": 2.4424245802601555e-07,
"logits/chosen": -1.2692848443984985,
"logits/rejected": -1.2660033702850342,
"logps/chosen": -370.6279296875,
"logps/rejected": -573.5665283203125,
"loss": 0.4162,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5542004108428955,
"rewards/margins": 1.4760851860046387,
"rewards/rejected": -3.0302860736846924,
"step": 2320
},
{
"epoch": 0.559021113243762,
"grad_norm": 17.293463950337944,
"learning_rate": 2.421494030343072e-07,
"logits/chosen": -1.2170095443725586,
"logits/rejected": -1.3737263679504395,
"logps/chosen": -454.30224609375,
"logps/rejected": -541.3650512695312,
"loss": 0.5052,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.897562026977539,
"rewards/margins": 1.3505643606185913,
"rewards/rejected": -3.248126268386841,
"step": 2330
},
{
"epoch": 0.5614203454894434,
"grad_norm": 14.543185750157452,
"learning_rate": 2.400568987307861e-07,
"logits/chosen": -1.2438604831695557,
"logits/rejected": -1.3291311264038086,
"logps/chosen": -413.92144775390625,
"logps/rejected": -483.96240234375,
"loss": 0.3981,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.790085792541504,
"rewards/margins": 0.866573691368103,
"rewards/rejected": -2.6566593647003174,
"step": 2340
},
{
"epoch": 0.5638195777351248,
"grad_norm": 12.815153330284657,
"learning_rate": 2.379650918963156e-07,
"logits/chosen": -1.3083336353302002,
"logits/rejected": -1.3296959400177002,
"logps/chosen": -408.14068603515625,
"logps/rejected": -637.8969116210938,
"loss": 0.4107,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.137500524520874,
"rewards/margins": 2.198113203048706,
"rewards/rejected": -4.33561372756958,
"step": 2350
},
{
"epoch": 0.5662188099808061,
"grad_norm": 19.62558947066615,
"learning_rate": 2.3587412926283438e-07,
"logits/chosen": -1.3259559869766235,
"logits/rejected": -1.3641198873519897,
"logps/chosen": -511.7887268066406,
"logps/rejected": -747.3419189453125,
"loss": 0.4316,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.02677845954895,
"rewards/margins": 2.806626796722412,
"rewards/rejected": -4.833405017852783,
"step": 2360
},
{
"epoch": 0.5686180422264875,
"grad_norm": 21.00689689996569,
"learning_rate": 2.337841575030642e-07,
"logits/chosen": -1.1748692989349365,
"logits/rejected": -1.220413327217102,
"logps/chosen": -463.1639099121094,
"logps/rejected": -683.6637573242188,
"loss": 0.3981,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.8594157695770264,
"rewards/margins": 2.1025776863098145,
"rewards/rejected": -3.96199369430542,
"step": 2370
},
{
"epoch": 0.5710172744721689,
"grad_norm": 16.379604246595516,
"learning_rate": 2.316953232202206e-07,
"logits/chosen": -1.2951033115386963,
"logits/rejected": -1.4975831508636475,
"logps/chosen": -450.65155029296875,
"logps/rejected": -539.8964233398438,
"loss": 0.4165,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0984275341033936,
"rewards/margins": 1.536707878112793,
"rewards/rejected": -3.6351349353790283,
"step": 2380
},
{
"epoch": 0.5734165067178503,
"grad_norm": 13.015590564989903,
"learning_rate": 2.2960777293772958e-07,
"logits/chosen": -1.2712581157684326,
"logits/rejected": -1.3997230529785156,
"logps/chosen": -406.917236328125,
"logps/rejected": -671.5166625976562,
"loss": 0.4366,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.939073920249939,
"rewards/margins": 2.8481147289276123,
"rewards/rejected": -4.787188529968262,
"step": 2390
},
{
"epoch": 0.5758157389635317,
"grad_norm": 13.201413526903568,
"learning_rate": 2.2752165308894974e-07,
"logits/chosen": -1.2536894083023071,
"logits/rejected": -1.2781312465667725,
"logps/chosen": -368.1941833496094,
"logps/rejected": -593.6776123046875,
"loss": 0.4295,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.6818746328353882,
"rewards/margins": 2.3582541942596436,
"rewards/rejected": -4.040129661560059,
"step": 2400
},
{
"epoch": 0.5782149712092131,
"grad_norm": 12.990669417950757,
"learning_rate": 2.254371100069005e-07,
"logits/chosen": -1.1515527963638306,
"logits/rejected": -1.1078431606292725,
"logps/chosen": -375.81732177734375,
"logps/rejected": -583.7494506835938,
"loss": 0.3986,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.465872049331665,
"rewards/margins": 1.888287901878357,
"rewards/rejected": -3.3541598320007324,
"step": 2410
},
{
"epoch": 0.5806142034548945,
"grad_norm": 17.818623986164003,
"learning_rate": 2.2335428991399725e-07,
"logits/chosen": -1.242234468460083,
"logits/rejected": -1.2769407033920288,
"logps/chosen": -519.7703857421875,
"logps/rejected": -948.0362548828125,
"loss": 0.403,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.072312116622925,
"rewards/margins": 4.26052188873291,
"rewards/rejected": -7.332834720611572,
"step": 2420
},
{
"epoch": 0.5830134357005758,
"grad_norm": 14.48204764987916,
"learning_rate": 2.2127333891179458e-07,
"logits/chosen": -1.3385140895843506,
"logits/rejected": -1.3878307342529297,
"logps/chosen": -430.9832458496094,
"logps/rejected": -745.4675903320312,
"loss": 0.4398,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.236307144165039,
"rewards/margins": 2.929969310760498,
"rewards/rejected": -5.166275978088379,
"step": 2430
},
{
"epoch": 0.5854126679462572,
"grad_norm": 27.82308149849405,
"learning_rate": 2.1919440297073782e-07,
"logits/chosen": -1.271857500076294,
"logits/rejected": -1.3383657932281494,
"logps/chosen": -462.377685546875,
"logps/rejected": -775.3133544921875,
"loss": 0.4672,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.4519903659820557,
"rewards/margins": 3.0875110626220703,
"rewards/rejected": -5.539501190185547,
"step": 2440
},
{
"epoch": 0.5878119001919386,
"grad_norm": 13.320769149199382,
"learning_rate": 2.1711762791992368e-07,
"logits/chosen": -1.2566578388214111,
"logits/rejected": -1.29204261302948,
"logps/chosen": -504.62371826171875,
"logps/rejected": -647.8597412109375,
"loss": 0.4486,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.186826705932617,
"rewards/margins": 1.7466872930526733,
"rewards/rejected": -3.9335131645202637,
"step": 2450
},
{
"epoch": 0.5902111324376199,
"grad_norm": 15.89239004487952,
"learning_rate": 2.1504315943687114e-07,
"logits/chosen": -1.135602355003357,
"logits/rejected": -1.110710859298706,
"logps/chosen": -410.08441162109375,
"logps/rejected": -693.5352783203125,
"loss": 0.4104,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.8059812784194946,
"rewards/margins": 2.415012836456299,
"rewards/rejected": -4.220993995666504,
"step": 2460
},
{
"epoch": 0.5926103646833013,
"grad_norm": 19.370614206098328,
"learning_rate": 2.1297114303730248e-07,
"logits/chosen": -1.1299896240234375,
"logits/rejected": -1.0449109077453613,
"logps/chosen": -411.85040283203125,
"logps/rejected": -708.7200927734375,
"loss": 0.4912,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.996572494506836,
"rewards/margins": 2.44885516166687,
"rewards/rejected": -4.445427894592285,
"step": 2470
},
{
"epoch": 0.5950095969289827,
"grad_norm": 16.656657262376168,
"learning_rate": 2.1090172406493616e-07,
"logits/chosen": -1.0786526203155518,
"logits/rejected": -1.0406323671340942,
"logps/chosen": -351.46417236328125,
"logps/rejected": -575.1397705078125,
"loss": 0.3825,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2345200777053833,
"rewards/margins": 2.170752763748169,
"rewards/rejected": -3.4052727222442627,
"step": 2480
},
{
"epoch": 0.5974088291746641,
"grad_norm": 19.934123908947278,
"learning_rate": 2.0883504768129146e-07,
"logits/chosen": -1.242959976196289,
"logits/rejected": -1.2493782043457031,
"logps/chosen": -472.5721130371094,
"logps/rejected": -716.5457763671875,
"loss": 0.431,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.007760763168335,
"rewards/margins": 2.496676206588745,
"rewards/rejected": -4.504437446594238,
"step": 2490
},
{
"epoch": 0.5998080614203455,
"grad_norm": 15.746281719328787,
"learning_rate": 2.0677125885550571e-07,
"logits/chosen": -1.1213797330856323,
"logits/rejected": -1.3027660846710205,
"logps/chosen": -431.63641357421875,
"logps/rejected": -593.3057861328125,
"loss": 0.4323,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9971039295196533,
"rewards/margins": 1.9194910526275635,
"rewards/rejected": -3.9165947437286377,
"step": 2500
},
{
"epoch": 0.6022072936660269,
"grad_norm": 23.759422926525374,
"learning_rate": 2.0471050235416587e-07,
"logits/chosen": -1.0919904708862305,
"logits/rejected": -1.2857704162597656,
"logps/chosen": -511.79132080078125,
"logps/rejected": -708.4951171875,
"loss": 0.3853,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.4641222953796387,
"rewards/margins": 2.413705348968506,
"rewards/rejected": -4.8778276443481445,
"step": 2510
},
{
"epoch": 0.6046065259117083,
"grad_norm": 25.281497673378116,
"learning_rate": 2.026529227311532e-07,
"logits/chosen": -1.229898452758789,
"logits/rejected": -1.2374264001846313,
"logps/chosen": -431.0499572753906,
"logps/rejected": -689.706298828125,
"loss": 0.468,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.1411490440368652,
"rewards/margins": 2.5051817893981934,
"rewards/rejected": -4.6463303565979,
"step": 2520
},
{
"epoch": 0.6070057581573897,
"grad_norm": 14.693269652927464,
"learning_rate": 2.005986643175036e-07,
"logits/chosen": -1.187809944152832,
"logits/rejected": -1.1687113046646118,
"logps/chosen": -414.11407470703125,
"logps/rejected": -739.9503173828125,
"loss": 0.3685,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.609575867652893,
"rewards/margins": 3.377927780151367,
"rewards/rejected": -4.9875030517578125,
"step": 2530
},
{
"epoch": 0.6094049904030711,
"grad_norm": 18.59133617851427,
"learning_rate": 1.9854787121128328e-07,
"logits/chosen": -1.203471302986145,
"logits/rejected": -1.3611973524093628,
"logps/chosen": -412.1192321777344,
"logps/rejected": -546.4465942382812,
"loss": 0.4775,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.861853837966919,
"rewards/margins": 1.7769527435302734,
"rewards/rejected": -3.6388065814971924,
"step": 2540
},
{
"epoch": 0.6118042226487524,
"grad_norm": 13.920438386133542,
"learning_rate": 1.9650068726748106e-07,
"logits/chosen": -1.1817299127578735,
"logits/rejected": -1.3077366352081299,
"logps/chosen": -465.7212829589844,
"logps/rejected": -663.9733276367188,
"loss": 0.4559,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.955004334449768,
"rewards/margins": 2.0206496715545654,
"rewards/rejected": -3.975654125213623,
"step": 2550
},
{
"epoch": 0.6142034548944337,
"grad_norm": 13.161703470683147,
"learning_rate": 1.9445725608791718e-07,
"logits/chosen": -1.1682353019714355,
"logits/rejected": -1.2265560626983643,
"logps/chosen": -466.87713623046875,
"logps/rejected": -877.6561279296875,
"loss": 0.4242,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.123385190963745,
"rewards/margins": 4.002751350402832,
"rewards/rejected": -6.126136779785156,
"step": 2560
},
{
"epoch": 0.6166026871401151,
"grad_norm": 15.36104260148359,
"learning_rate": 1.924177210111705e-07,
"logits/chosen": -1.2761640548706055,
"logits/rejected": -1.3670276403427124,
"logps/chosen": -424.89581298828125,
"logps/rejected": -750.1470947265625,
"loss": 0.4409,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8831450939178467,
"rewards/margins": 3.1273887157440186,
"rewards/rejected": -5.010534286499023,
"step": 2570
},
{
"epoch": 0.6190019193857965,
"grad_norm": 10.821617135187617,
"learning_rate": 1.9038222510252364e-07,
"logits/chosen": -1.2255038022994995,
"logits/rejected": -1.249638319015503,
"logps/chosen": -406.5605773925781,
"logps/rejected": -569.8077392578125,
"loss": 0.4144,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.5970432758331299,
"rewards/margins": 1.7124197483062744,
"rewards/rejected": -3.3094630241394043,
"step": 2580
},
{
"epoch": 0.6214011516314779,
"grad_norm": 20.79834881298733,
"learning_rate": 1.883509111439277e-07,
"logits/chosen": -1.2082470655441284,
"logits/rejected": -1.2324997186660767,
"logps/chosen": -405.4540710449219,
"logps/rejected": -772.509765625,
"loss": 0.4118,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.863959550857544,
"rewards/margins": 3.0209383964538574,
"rewards/rejected": -4.8848981857299805,
"step": 2590
},
{
"epoch": 0.6238003838771593,
"grad_norm": 14.594348842540208,
"learning_rate": 1.8632392162398665e-07,
"logits/chosen": -1.1460466384887695,
"logits/rejected": -1.148008108139038,
"logps/chosen": -481.5874938964844,
"logps/rejected": -762.729736328125,
"loss": 0.3784,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.904550313949585,
"rewards/margins": 2.8737847805023193,
"rewards/rejected": -4.778334617614746,
"step": 2600
},
{
"epoch": 0.6261996161228407,
"grad_norm": 18.405220413400123,
"learning_rate": 1.84301398727962e-07,
"logits/chosen": -1.2919515371322632,
"logits/rejected": -1.226064682006836,
"logps/chosen": -384.24078369140625,
"logps/rejected": -750.8485107421875,
"loss": 0.4252,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.0034337043762207,
"rewards/margins": 3.35615611076355,
"rewards/rejected": -5.359589576721191,
"step": 2610
},
{
"epoch": 0.6285988483685221,
"grad_norm": 20.54137342064033,
"learning_rate": 1.8228348432779966e-07,
"logits/chosen": -1.2917633056640625,
"logits/rejected": -1.313946008682251,
"logps/chosen": -463.4065856933594,
"logps/rejected": -708.0303955078125,
"loss": 0.4268,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.310145854949951,
"rewards/margins": 2.4632420539855957,
"rewards/rejected": -4.773387432098389,
"step": 2620
},
{
"epoch": 0.6309980806142035,
"grad_norm": 12.212262130926057,
"learning_rate": 1.8027031997217773e-07,
"logits/chosen": -1.3770760297775269,
"logits/rejected": -1.3869761228561401,
"logps/chosen": -496.34136962890625,
"logps/rejected": -1004.2086791992188,
"loss": 0.3796,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.659231662750244,
"rewards/margins": 4.979175090789795,
"rewards/rejected": -7.638407230377197,
"step": 2630
},
{
"epoch": 0.6333973128598849,
"grad_norm": 16.20411120653905,
"learning_rate": 1.7826204687657758e-07,
"logits/chosen": -1.1182453632354736,
"logits/rejected": -1.1282155513763428,
"logps/chosen": -475.62335205078125,
"logps/rejected": -599.8681030273438,
"loss": 0.4083,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9853700399398804,
"rewards/margins": 1.6344798803329468,
"rewards/rejected": -3.619849681854248,
"step": 2640
},
{
"epoch": 0.6357965451055663,
"grad_norm": 22.936528003077683,
"learning_rate": 1.762588059133781e-07,
"logits/chosen": -1.1725223064422607,
"logits/rejected": -1.3193824291229248,
"logps/chosen": -507.3565368652344,
"logps/rejected": -717.995849609375,
"loss": 0.4443,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.1068882942199707,
"rewards/margins": 2.464982509613037,
"rewards/rejected": -4.57187032699585,
"step": 2650
},
{
"epoch": 0.6381957773512476,
"grad_norm": 18.6077043898828,
"learning_rate": 1.7426073760197406e-07,
"logits/chosen": -1.1054435968399048,
"logits/rejected": -1.0801939964294434,
"logps/chosen": -478.20074462890625,
"logps/rejected": -859.6195068359375,
"loss": 0.4259,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.2413601875305176,
"rewards/margins": 3.528954029083252,
"rewards/rejected": -5.7703142166137695,
"step": 2660
},
{
"epoch": 0.6405950095969289,
"grad_norm": 14.701850577255247,
"learning_rate": 1.7226798209891935e-07,
"logits/chosen": -1.1424671411514282,
"logits/rejected": -1.3594285249710083,
"logps/chosen": -486.80755615234375,
"logps/rejected": -663.4140625,
"loss": 0.3759,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.2926859855651855,
"rewards/margins": 2.290337085723877,
"rewards/rejected": -4.583022594451904,
"step": 2670
},
{
"epoch": 0.6429942418426103,
"grad_norm": 23.82001052972649,
"learning_rate": 1.7028067918809535e-07,
"logits/chosen": -1.220568299293518,
"logits/rejected": -1.2531940937042236,
"logps/chosen": -426.9730529785156,
"logps/rejected": -829.8860473632812,
"loss": 0.4311,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.0204787254333496,
"rewards/margins": 3.661693572998047,
"rewards/rejected": -5.6821722984313965,
"step": 2680
},
{
"epoch": 0.6453934740882917,
"grad_norm": 20.92197517678509,
"learning_rate": 1.6829896827090584e-07,
"logits/chosen": -1.3699915409088135,
"logits/rejected": -1.415359377861023,
"logps/chosen": -504.123291015625,
"logps/rejected": -585.438232421875,
"loss": 0.4529,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.390109062194824,
"rewards/margins": 1.1103547811508179,
"rewards/rejected": -3.5004639625549316,
"step": 2690
},
{
"epoch": 0.6477927063339731,
"grad_norm": 11.90165561763133,
"learning_rate": 1.6632298835649844e-07,
"logits/chosen": -1.2418677806854248,
"logits/rejected": -1.2179887294769287,
"logps/chosen": -498.97607421875,
"logps/rejected": -764.77587890625,
"loss": 0.3979,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.3297247886657715,
"rewards/margins": 2.4172799587249756,
"rewards/rejected": -4.747004508972168,
"step": 2700
},
{
"epoch": 0.6501919385796545,
"grad_norm": 13.166436756002133,
"learning_rate": 1.6435287805201364e-07,
"logits/chosen": -1.3559385538101196,
"logits/rejected": -1.3355977535247803,
"logps/chosen": -489.180419921875,
"logps/rejected": -652.1635131835938,
"loss": 0.4155,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.2472357749938965,
"rewards/margins": 1.647270917892456,
"rewards/rejected": -3.8945069313049316,
"step": 2710
},
{
"epoch": 0.6525911708253359,
"grad_norm": 19.18769450791739,
"learning_rate": 1.6238877555286207e-07,
"logits/chosen": -1.3151136636734009,
"logits/rejected": -1.3525760173797607,
"logps/chosen": -442.0126953125,
"logps/rejected": -712.2786865234375,
"loss": 0.3659,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7712196111679077,
"rewards/margins": 2.6433427333831787,
"rewards/rejected": -4.414562702178955,
"step": 2720
},
{
"epoch": 0.6549904030710173,
"grad_norm": 16.006816349741847,
"learning_rate": 1.60430818633031e-07,
"logits/chosen": -1.1541482210159302,
"logits/rejected": -1.1810917854309082,
"logps/chosen": -428.27117919921875,
"logps/rejected": -661.9808959960938,
"loss": 0.3728,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.8128198385238647,
"rewards/margins": 2.3960225582122803,
"rewards/rejected": -4.2088422775268555,
"step": 2730
},
{
"epoch": 0.6573896353166987,
"grad_norm": 15.843743360837514,
"learning_rate": 1.5847914463541939e-07,
"logits/chosen": -1.2659448385238647,
"logits/rejected": -1.3333221673965454,
"logps/chosen": -380.4425354003906,
"logps/rejected": -661.7493896484375,
"loss": 0.3829,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.8144347667694092,
"rewards/margins": 2.581631898880005,
"rewards/rejected": -4.396066665649414,
"step": 2740
},
{
"epoch": 0.6597888675623801,
"grad_norm": 12.902781532875105,
"learning_rate": 1.5653389046220427e-07,
"logits/chosen": -1.202580451965332,
"logits/rejected": -1.2428548336029053,
"logps/chosen": -393.21392822265625,
"logps/rejected": -585.6798095703125,
"loss": 0.4472,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5873596668243408,
"rewards/margins": 1.7750890254974365,
"rewards/rejected": -3.3624484539031982,
"step": 2750
},
{
"epoch": 0.6621880998080614,
"grad_norm": 15.497438468614629,
"learning_rate": 1.545951925652375e-07,
"logits/chosen": -1.1852418184280396,
"logits/rejected": -1.3214651346206665,
"logps/chosen": -513.821044921875,
"logps/rejected": -740.9768676757812,
"loss": 0.4159,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.0691330432891846,
"rewards/margins": 2.799996852874756,
"rewards/rejected": -4.8691301345825195,
"step": 2760
},
{
"epoch": 0.6645873320537428,
"grad_norm": 25.023616546810636,
"learning_rate": 1.5266318693647423e-07,
"logits/chosen": -1.2369322776794434,
"logits/rejected": -1.2678784132003784,
"logps/chosen": -495.04425048828125,
"logps/rejected": -637.2513427734375,
"loss": 0.4159,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.1835405826568604,
"rewards/margins": 1.4997859001159668,
"rewards/rejected": -3.6833267211914062,
"step": 2770
},
{
"epoch": 0.6669865642994242,
"grad_norm": 13.945033201288798,
"learning_rate": 1.5073800909843353e-07,
"logits/chosen": -1.1865065097808838,
"logits/rejected": -1.3370563983917236,
"logps/chosen": -465.9248046875,
"logps/rejected": -663.2384033203125,
"loss": 0.4064,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.0080044269561768,
"rewards/margins": 2.541308879852295,
"rewards/rejected": -4.549313545227051,
"step": 2780
},
{
"epoch": 0.6693857965451055,
"grad_norm": 16.335549570588938,
"learning_rate": 1.488197940946922e-07,
"logits/chosen": -1.090932846069336,
"logits/rejected": -1.1426491737365723,
"logps/chosen": -454.6915588378906,
"logps/rejected": -645.7820434570312,
"loss": 0.3909,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7976535558700562,
"rewards/margins": 2.447206735610962,
"rewards/rejected": -4.244860649108887,
"step": 2790
},
{
"epoch": 0.6717850287907869,
"grad_norm": 19.689368835083336,
"learning_rate": 1.4690867648041167e-07,
"logits/chosen": -1.0205479860305786,
"logits/rejected": -1.2015823125839233,
"logps/chosen": -444.68115234375,
"logps/rejected": -664.7821044921875,
"loss": 0.4133,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8803367614746094,
"rewards/margins": 2.4233222007751465,
"rewards/rejected": -4.303658485412598,
"step": 2800
},
{
"epoch": 0.6741842610364683,
"grad_norm": 15.922653221989023,
"learning_rate": 1.4500479031289987e-07,
"logits/chosen": -1.121628999710083,
"logits/rejected": -1.2732969522476196,
"logps/chosen": -426.40631103515625,
"logps/rejected": -628.9437255859375,
"loss": 0.4774,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.6420685052871704,
"rewards/margins": 2.0467495918273926,
"rewards/rejected": -3.6888179779052734,
"step": 2810
},
{
"epoch": 0.6765834932821497,
"grad_norm": 10.688957500109868,
"learning_rate": 1.4310826914220747e-07,
"logits/chosen": -1.1144278049468994,
"logits/rejected": -1.2004985809326172,
"logps/chosen": -465.1630859375,
"logps/rejected": -635.3046264648438,
"loss": 0.4528,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7992807626724243,
"rewards/margins": 1.7720779180526733,
"rewards/rejected": -3.5713589191436768,
"step": 2820
},
{
"epoch": 0.6789827255278311,
"grad_norm": 11.408686793770782,
"learning_rate": 1.412192460017597e-07,
"logits/chosen": -1.2038311958312988,
"logits/rejected": -1.162626028060913,
"logps/chosen": -476.5159606933594,
"logps/rejected": -705.89990234375,
"loss": 0.4191,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.387838840484619,
"rewards/margins": 2.2411282062530518,
"rewards/rejected": -4.628966331481934,
"step": 2830
},
{
"epoch": 0.6813819577735125,
"grad_norm": 12.118691511541517,
"learning_rate": 1.3933785339902504e-07,
"logits/chosen": -1.2565038204193115,
"logits/rejected": -1.167474389076233,
"logps/chosen": -400.8456115722656,
"logps/rejected": -641.7380981445312,
"loss": 0.4456,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9326709508895874,
"rewards/margins": 1.984291672706604,
"rewards/rejected": -3.9169623851776123,
"step": 2840
},
{
"epoch": 0.6837811900191939,
"grad_norm": 13.932235400427288,
"learning_rate": 1.374642233062197e-07,
"logits/chosen": -1.1590187549591064,
"logits/rejected": -1.3027136325836182,
"logps/chosen": -474.519287109375,
"logps/rejected": -681.8683471679688,
"loss": 0.4318,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.970078468322754,
"rewards/margins": 2.387824535369873,
"rewards/rejected": -4.357902526855469,
"step": 2850
},
{
"epoch": 0.6861804222648752,
"grad_norm": 16.723737190217008,
"learning_rate": 1.355984871510511e-07,
"logits/chosen": -1.1410120725631714,
"logits/rejected": -1.1267549991607666,
"logps/chosen": -505.823974609375,
"logps/rejected": -715.5396118164062,
"loss": 0.3923,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.1921565532684326,
"rewards/margins": 2.031604290008545,
"rewards/rejected": -4.223761081695557,
"step": 2860
},
{
"epoch": 0.6885796545105566,
"grad_norm": 21.50806215898013,
"learning_rate": 1.3374077580749783e-07,
"logits/chosen": -1.3120447397232056,
"logits/rejected": -1.281110405921936,
"logps/chosen": -368.84185791015625,
"logps/rejected": -601.4373168945312,
"loss": 0.4163,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7995599508285522,
"rewards/margins": 2.158607006072998,
"rewards/rejected": -3.958167314529419,
"step": 2870
},
{
"epoch": 0.690978886756238,
"grad_norm": 27.638375107613143,
"learning_rate": 1.3189121958663024e-07,
"logits/chosen": -1.1347416639328003,
"logits/rejected": -1.3342390060424805,
"logps/chosen": -551.9705810546875,
"logps/rejected": -637.0390625,
"loss": 0.4529,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.7998149394989014,
"rewards/margins": 1.0880420207977295,
"rewards/rejected": -3.8878567218780518,
"step": 2880
},
{
"epoch": 0.6933781190019194,
"grad_norm": 14.692498350185678,
"learning_rate": 1.3004994822746895e-07,
"logits/chosen": -1.2893893718719482,
"logits/rejected": -1.3418468236923218,
"logps/chosen": -442.1363220214844,
"logps/rejected": -626.1585083007812,
"loss": 0.4302,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9635156393051147,
"rewards/margins": 1.7742655277252197,
"rewards/rejected": -3.737781047821045,
"step": 2890
},
{
"epoch": 0.6957773512476008,
"grad_norm": 13.677081786503933,
"learning_rate": 1.2821709088788434e-07,
"logits/chosen": -1.0876823663711548,
"logits/rejected": -1.1584521532058716,
"logps/chosen": -408.7326965332031,
"logps/rejected": -630.5333862304688,
"loss": 0.4069,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.062255620956421,
"rewards/margins": 2.1946072578430176,
"rewards/rejected": -4.256862163543701,
"step": 2900
},
{
"epoch": 0.6981765834932822,
"grad_norm": 15.872787924761283,
"learning_rate": 1.2639277613553736e-07,
"logits/chosen": -1.3518760204315186,
"logits/rejected": -1.3280441761016846,
"logps/chosen": -398.5044860839844,
"logps/rejected": -600.9732666015625,
"loss": 0.4195,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9100825786590576,
"rewards/margins": 2.0270023345947266,
"rewards/rejected": -3.9370853900909424,
"step": 2910
},
{
"epoch": 0.7005758157389635,
"grad_norm": 13.346767559623201,
"learning_rate": 1.2457713193885975e-07,
"logits/chosen": -1.1873770952224731,
"logits/rejected": -1.1855942010879517,
"logps/chosen": -413.6412048339844,
"logps/rejected": -724.1486206054688,
"loss": 0.3688,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.3553316593170166,
"rewards/margins": 2.758777379989624,
"rewards/rejected": -5.114109516143799,
"step": 2920
},
{
"epoch": 0.7029750479846449,
"grad_norm": 22.448762027485316,
"learning_rate": 1.2277028565807838e-07,
"logits/chosen": -1.2834995985031128,
"logits/rejected": -1.3559261560440063,
"logps/chosen": -453.78985595703125,
"logps/rejected": -671.7575073242188,
"loss": 0.4267,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.049879550933838,
"rewards/margins": 2.2892394065856934,
"rewards/rejected": -4.339118957519531,
"step": 2930
},
{
"epoch": 0.7053742802303263,
"grad_norm": 16.129648043941792,
"learning_rate": 1.209723640362815e-07,
"logits/chosen": -1.1959068775177002,
"logits/rejected": -1.2405879497528076,
"logps/chosen": -505.00726318359375,
"logps/rejected": -860.0382690429688,
"loss": 0.4607,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.3982629776000977,
"rewards/margins": 3.6283717155456543,
"rewards/rejected": -6.026634693145752,
"step": 2940
},
{
"epoch": 0.7077735124760077,
"grad_norm": 14.433857430526624,
"learning_rate": 1.191834931905277e-07,
"logits/chosen": -1.1443761587142944,
"logits/rejected": -1.1771003007888794,
"logps/chosen": -516.5400390625,
"logps/rejected": -709.7727661132812,
"loss": 0.4379,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.209808826446533,
"rewards/margins": 1.9394118785858154,
"rewards/rejected": -4.1492204666137695,
"step": 2950
},
{
"epoch": 0.710172744721689,
"grad_norm": 14.063698311506704,
"learning_rate": 1.1740379860299988e-07,
"logits/chosen": -1.1812469959259033,
"logits/rejected": -1.1932779550552368,
"logps/chosen": -486.75848388671875,
"logps/rejected": -688.697509765625,
"loss": 0.4358,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0185728073120117,
"rewards/margins": 1.8188155889511108,
"rewards/rejected": -3.837387800216675,
"step": 2960
},
{
"epoch": 0.7125719769673704,
"grad_norm": 12.25394441206106,
"learning_rate": 1.1563340511220254e-07,
"logits/chosen": -1.1238670349121094,
"logits/rejected": -1.2288951873779297,
"logps/chosen": -479.03082275390625,
"logps/rejected": -721.0484008789062,
"loss": 0.4231,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9672508239746094,
"rewards/margins": 2.5986568927764893,
"rewards/rejected": -4.5659074783325195,
"step": 2970
},
{
"epoch": 0.7149712092130518,
"grad_norm": 14.233686065176078,
"learning_rate": 1.1387243690420556e-07,
"logits/chosen": -1.1306841373443604,
"logits/rejected": -1.2058961391448975,
"logps/chosen": -530.5966796875,
"logps/rejected": -791.8387451171875,
"loss": 0.4661,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.097524881362915,
"rewards/margins": 2.7319130897521973,
"rewards/rejected": -4.829438209533691,
"step": 2980
},
{
"epoch": 0.7173704414587332,
"grad_norm": 19.693131800311516,
"learning_rate": 1.1212101750393235e-07,
"logits/chosen": -1.255438208580017,
"logits/rejected": -1.3434498310089111,
"logps/chosen": -474.38916015625,
"logps/rejected": -754.1934814453125,
"loss": 0.4109,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.2069497108459473,
"rewards/margins": 2.954502582550049,
"rewards/rejected": -5.161452293395996,
"step": 2990
},
{
"epoch": 0.7197696737044146,
"grad_norm": 16.97782338199475,
"learning_rate": 1.1037926976649562e-07,
"logits/chosen": -1.1937129497528076,
"logits/rejected": -1.2495759725570679,
"logps/chosen": -488.2049865722656,
"logps/rejected": -799.8907470703125,
"loss": 0.4527,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.2300727367401123,
"rewards/margins": 2.9182958602905273,
"rewards/rejected": -5.148368835449219,
"step": 3000
},
{
"epoch": 0.722168905950096,
"grad_norm": 18.393491636386905,
"learning_rate": 1.0864731586857936e-07,
"logits/chosen": -1.1326544284820557,
"logits/rejected": -1.2836530208587646,
"logps/chosen": -472.63739013671875,
"logps/rejected": -724.0669555664062,
"loss": 0.3849,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8869655132293701,
"rewards/margins": 2.8039302825927734,
"rewards/rejected": -4.6908955574035645,
"step": 3010
},
{
"epoch": 0.7245681381957774,
"grad_norm": 20.305303573951132,
"learning_rate": 1.0692527729986839e-07,
"logits/chosen": -1.117078423500061,
"logits/rejected": -1.2159783840179443,
"logps/chosen": -495.0431213378906,
"logps/rejected": -729.7164306640625,
"loss": 0.3795,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.3929858207702637,
"rewards/margins": 2.559861660003662,
"rewards/rejected": -4.952847003936768,
"step": 3020
},
{
"epoch": 0.7269673704414588,
"grad_norm": 18.149863459319363,
"learning_rate": 1.0521327485452692e-07,
"logits/chosen": -1.2258936166763306,
"logits/rejected": -1.2951385974884033,
"logps/chosen": -507.068359375,
"logps/rejected": -806.07421875,
"loss": 0.4145,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.737776279449463,
"rewards/margins": 3.1096084117889404,
"rewards/rejected": -5.847384452819824,
"step": 3030
},
{
"epoch": 0.7293666026871402,
"grad_norm": 22.723618390263812,
"learning_rate": 1.0351142862272468e-07,
"logits/chosen": -1.1216206550598145,
"logits/rejected": -1.2867326736450195,
"logps/chosen": -473.12408447265625,
"logps/rejected": -896.2477416992188,
"loss": 0.4208,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.6412878036499023,
"rewards/margins": 4.341336727142334,
"rewards/rejected": -6.982624053955078,
"step": 3040
},
{
"epoch": 0.7317658349328215,
"grad_norm": 19.527492956725595,
"learning_rate": 1.0181985798221343e-07,
"logits/chosen": -1.1790878772735596,
"logits/rejected": -1.1905752420425415,
"logps/chosen": -470.6045837402344,
"logps/rejected": -782.1622924804688,
"loss": 0.4267,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.268146514892578,
"rewards/margins": 3.100985527038574,
"rewards/rejected": -5.369132041931152,
"step": 3050
},
{
"epoch": 0.7341650671785028,
"grad_norm": 17.486447949626083,
"learning_rate": 1.0013868158995329e-07,
"logits/chosen": -1.1854205131530762,
"logits/rejected": -1.268317461013794,
"logps/chosen": -508.628173828125,
"logps/rejected": -689.4181518554688,
"loss": 0.4241,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.384272575378418,
"rewards/margins": 2.0019237995147705,
"rewards/rejected": -4.386197090148926,
"step": 3060
},
{
"epoch": 0.7365642994241842,
"grad_norm": 11.876302215183998,
"learning_rate": 9.84680173737887e-08,
"logits/chosen": -1.2807575464248657,
"logits/rejected": -1.3796226978302002,
"logps/chosen": -456.07244873046875,
"logps/rejected": -617.9744873046875,
"loss": 0.4315,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.917088508605957,
"rewards/margins": 2.016084909439087,
"rewards/rejected": -3.933173418045044,
"step": 3070
},
{
"epoch": 0.7389635316698656,
"grad_norm": 13.157616210751735,
"learning_rate": 9.680798252417713e-08,
"logits/chosen": -1.3769404888153076,
"logits/rejected": -1.4445879459381104,
"logps/chosen": -420.9883728027344,
"logps/rejected": -629.974853515625,
"loss": 0.4059,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.0882909297943115,
"rewards/margins": 1.72428297996521,
"rewards/rejected": -3.8125743865966797,
"step": 3080
},
{
"epoch": 0.741362763915547,
"grad_norm": 15.730225571388548,
"learning_rate": 9.515869348596808e-08,
"logits/chosen": -1.1365947723388672,
"logits/rejected": -1.275075912475586,
"logps/chosen": -472.18408203125,
"logps/rejected": -672.9304809570312,
"loss": 0.4284,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.8900762796401978,
"rewards/margins": 2.209829807281494,
"rewards/rejected": -4.099905967712402,
"step": 3090
},
{
"epoch": 0.7437619961612284,
"grad_norm": 35.145335130920884,
"learning_rate": 9.352026595023493e-08,
"logits/chosen": -1.1994316577911377,
"logits/rejected": -1.2499196529388428,
"logps/chosen": -472.410888671875,
"logps/rejected": -594.9708251953125,
"loss": 0.4289,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9588143825531006,
"rewards/margins": 1.4960181713104248,
"rewards/rejected": -3.4548325538635254,
"step": 3100
},
{
"epoch": 0.7461612284069098,
"grad_norm": 15.811033639076967,
"learning_rate": 9.189281484616004e-08,
"logits/chosen": -1.2232351303100586,
"logits/rejected": -1.2345741987228394,
"logps/chosen": -410.38848876953125,
"logps/rejected": -693.65673828125,
"loss": 0.4377,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.075925588607788,
"rewards/margins": 2.4454538822174072,
"rewards/rejected": -4.521379470825195,
"step": 3110
},
{
"epoch": 0.7485604606525912,
"grad_norm": 22.73378264594113,
"learning_rate": 9.027645433297249e-08,
"logits/chosen": -1.103428602218628,
"logits/rejected": -1.1875020265579224,
"logps/chosen": -571.9773559570312,
"logps/rejected": -778.9340209960938,
"loss": 0.4763,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.662141799926758,
"rewards/margins": 2.4210867881774902,
"rewards/rejected": -5.083228588104248,
"step": 3120
},
{
"epoch": 0.7509596928982726,
"grad_norm": 25.696268435535774,
"learning_rate": 8.867129779194066e-08,
"logits/chosen": -1.244616150856018,
"logits/rejected": -1.353212594985962,
"logps/chosen": -394.8204040527344,
"logps/rejected": -692.91357421875,
"loss": 0.4442,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.8012546300888062,
"rewards/margins": 2.963744640350342,
"rewards/rejected": -4.764999866485596,
"step": 3130
},
{
"epoch": 0.753358925143954,
"grad_norm": 18.050945946975368,
"learning_rate": 8.707745781841866e-08,
"logits/chosen": -1.1310231685638428,
"logits/rejected": -1.2564256191253662,
"logps/chosen": -449.65362548828125,
"logps/rejected": -771.5709838867188,
"loss": 0.4261,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.2246077060699463,
"rewards/margins": 3.2700328826904297,
"rewards/rejected": -5.494640350341797,
"step": 3140
},
{
"epoch": 0.7557581573896354,
"grad_norm": 9.799407812333103,
"learning_rate": 8.549504621394831e-08,
"logits/chosen": -1.2643756866455078,
"logits/rejected": -1.2907123565673828,
"logps/chosen": -408.44683837890625,
"logps/rejected": -748.5787353515625,
"loss": 0.3526,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.775541067123413,
"rewards/margins": 3.4335663318634033,
"rewards/rejected": -5.209107398986816,
"step": 3150
},
{
"epoch": 0.7581573896353166,
"grad_norm": 19.977109925350412,
"learning_rate": 8.392417397841703e-08,
"logits/chosen": -1.216590166091919,
"logits/rejected": -1.3006147146224976,
"logps/chosen": -467.8597717285156,
"logps/rejected": -650.8781127929688,
"loss": 0.4394,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.0549581050872803,
"rewards/margins": 1.685307502746582,
"rewards/rejected": -3.7402656078338623,
"step": 3160
},
{
"epoch": 0.760556621880998,
"grad_norm": 17.465911639708498,
"learning_rate": 8.236495130227083e-08,
"logits/chosen": -1.2152674198150635,
"logits/rejected": -1.3853117227554321,
"logps/chosen": -531.36865234375,
"logps/rejected": -838.9884033203125,
"loss": 0.4623,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.4217429161071777,
"rewards/margins": 3.4808337688446045,
"rewards/rejected": -5.9025774002075195,
"step": 3170
},
{
"epoch": 0.7629558541266794,
"grad_norm": 18.484754479900506,
"learning_rate": 8.081748755878612e-08,
"logits/chosen": -1.2535191774368286,
"logits/rejected": -1.3943572044372559,
"logps/chosen": -503.76898193359375,
"logps/rejected": -699.9238891601562,
"loss": 0.4074,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.2457404136657715,
"rewards/margins": 2.4922969341278076,
"rewards/rejected": -4.738037109375,
"step": 3180
},
{
"epoch": 0.7653550863723608,
"grad_norm": 13.480118265934307,
"learning_rate": 7.928189129639632e-08,
"logits/chosen": -1.1623866558074951,
"logits/rejected": -1.1990493535995483,
"logps/chosen": -440.8330993652344,
"logps/rejected": -671.6934814453125,
"loss": 0.4079,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.2158052921295166,
"rewards/margins": 2.1327455043792725,
"rewards/rejected": -4.348550796508789,
"step": 3190
},
{
"epoch": 0.7677543186180422,
"grad_norm": 18.218781531194935,
"learning_rate": 7.775827023107834e-08,
"logits/chosen": -1.2353599071502686,
"logits/rejected": -1.2978723049163818,
"logps/chosen": -485.01837158203125,
"logps/rejected": -695.3224487304688,
"loss": 0.4193,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.558899164199829,
"rewards/margins": 1.8993895053863525,
"rewards/rejected": -4.458288669586182,
"step": 3200
},
{
"epoch": 0.7701535508637236,
"grad_norm": 23.87409903985215,
"learning_rate": 7.624673123879682e-08,
"logits/chosen": -1.1395564079284668,
"logits/rejected": -1.2785004377365112,
"logps/chosen": -447.79144287109375,
"logps/rejected": -616.0286254882812,
"loss": 0.435,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.080073356628418,
"rewards/margins": 1.8376333713531494,
"rewards/rejected": -3.917706251144409,
"step": 3210
},
{
"epoch": 0.772552783109405,
"grad_norm": 20.86151843851289,
"learning_rate": 7.474738034800663e-08,
"logits/chosen": -1.2566897869110107,
"logits/rejected": -1.2571423053741455,
"logps/chosen": -419.4219665527344,
"logps/rejected": -851.1385498046875,
"loss": 0.4659,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.008873701095581,
"rewards/margins": 4.522359371185303,
"rewards/rejected": -6.5312323570251465,
"step": 3220
},
{
"epoch": 0.7749520153550864,
"grad_norm": 13.169040672352676,
"learning_rate": 7.326032273221606e-08,
"logits/chosen": -1.3603243827819824,
"logits/rejected": -1.3747715950012207,
"logps/chosen": -500.2078552246094,
"logps/rejected": -785.7582397460938,
"loss": 0.4021,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.25557279586792,
"rewards/margins": 3.053633213043213,
"rewards/rejected": -5.309205532073975,
"step": 3230
},
{
"epoch": 0.7773512476007678,
"grad_norm": 16.05314488934455,
"learning_rate": 7.178566270260872e-08,
"logits/chosen": -1.318904995918274,
"logits/rejected": -1.3817777633666992,
"logps/chosen": -521.3937377929688,
"logps/rejected": -800.31103515625,
"loss": 0.4506,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.6105079650878906,
"rewards/margins": 2.589186668395996,
"rewards/rejected": -5.199694633483887,
"step": 3240
},
{
"epoch": 0.7797504798464492,
"grad_norm": 14.261695457548557,
"learning_rate": 7.032350370072709e-08,
"logits/chosen": -1.2088521718978882,
"logits/rejected": -1.2967567443847656,
"logps/chosen": -463.45794677734375,
"logps/rejected": -668.6795654296875,
"loss": 0.4028,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.936281442642212,
"rewards/margins": 2.147418975830078,
"rewards/rejected": -4.083700180053711,
"step": 3250
},
{
"epoch": 0.7821497120921305,
"grad_norm": 12.79782870680645,
"learning_rate": 6.887394829121596e-08,
"logits/chosen": -1.2623844146728516,
"logits/rejected": -1.3965818881988525,
"logps/chosen": -520.1638793945312,
"logps/rejected": -901.8109130859375,
"loss": 0.3906,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.5400707721710205,
"rewards/margins": 4.0481061935424805,
"rewards/rejected": -6.588177680969238,
"step": 3260
},
{
"epoch": 0.7845489443378119,
"grad_norm": 16.59225411377178,
"learning_rate": 6.743709815462833e-08,
"logits/chosen": -1.2642148733139038,
"logits/rejected": -1.3561595678329468,
"logps/chosen": -521.087890625,
"logps/rejected": -813.130615234375,
"loss": 0.4099,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.508223056793213,
"rewards/margins": 3.2700836658477783,
"rewards/rejected": -5.7783074378967285,
"step": 3270
},
{
"epoch": 0.7869481765834933,
"grad_norm": 14.993047424273575,
"learning_rate": 6.601305408029287e-08,
"logits/chosen": -1.2957048416137695,
"logits/rejected": -1.418428659439087,
"logps/chosen": -458.10076904296875,
"logps/rejected": -786.9931640625,
"loss": 0.3831,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.213374137878418,
"rewards/margins": 3.280181407928467,
"rewards/rejected": -5.493556022644043,
"step": 3280
},
{
"epoch": 0.7893474088291746,
"grad_norm": 15.085926001703353,
"learning_rate": 6.460191595924366e-08,
"logits/chosen": -1.2087162733078003,
"logits/rejected": -1.2654608488082886,
"logps/chosen": -449.6219177246094,
"logps/rejected": -705.5459594726562,
"loss": 0.385,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.1014938354492188,
"rewards/margins": 2.5265586376190186,
"rewards/rejected": -4.628052711486816,
"step": 3290
},
{
"epoch": 0.791746641074856,
"grad_norm": 13.169922978355546,
"learning_rate": 6.320378277721342e-08,
"logits/chosen": -1.3274714946746826,
"logits/rejected": -1.3772881031036377,
"logps/chosen": -465.1910095214844,
"logps/rejected": -624.9302368164062,
"loss": 0.4013,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.1250336170196533,
"rewards/margins": 1.715985894203186,
"rewards/rejected": -3.841019868850708,
"step": 3300
},
{
"epoch": 0.7941458733205374,
"grad_norm": 27.82691688189915,
"learning_rate": 6.181875260769032e-08,
"logits/chosen": -1.2468100786209106,
"logits/rejected": -1.4290482997894287,
"logps/chosen": -521.2555541992188,
"logps/rejected": -781.6026000976562,
"loss": 0.416,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.3911871910095215,
"rewards/margins": 3.2505409717559814,
"rewards/rejected": -5.641728401184082,
"step": 3310
},
{
"epoch": 0.7965451055662188,
"grad_norm": 15.737223970644676,
"learning_rate": 6.044692260503797e-08,
"logits/chosen": -1.1637942790985107,
"logits/rejected": -1.2922910451889038,
"logps/chosen": -529.7451171875,
"logps/rejected": -853.4421997070312,
"loss": 0.3675,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.4741718769073486,
"rewards/margins": 3.4323413372039795,
"rewards/rejected": -5.906513214111328,
"step": 3320
},
{
"epoch": 0.7989443378119002,
"grad_norm": 13.061674320090848,
"learning_rate": 5.9088388997680984e-08,
"logits/chosen": -1.1912027597427368,
"logits/rejected": -1.3324553966522217,
"logps/chosen": -545.3333740234375,
"logps/rejected": -860.1539306640625,
"loss": 0.394,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.197711944580078,
"rewards/margins": 3.962009906768799,
"rewards/rejected": -6.159722328186035,
"step": 3330
},
{
"epoch": 0.8013435700575816,
"grad_norm": 19.85934874401157,
"learning_rate": 5.774324708135439e-08,
"logits/chosen": -1.3464608192443848,
"logits/rejected": -1.4455270767211914,
"logps/chosen": -404.1613464355469,
"logps/rejected": -649.46044921875,
"loss": 0.4296,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.9079539775848389,
"rewards/margins": 2.655972480773926,
"rewards/rejected": -4.563926696777344,
"step": 3340
},
{
"epoch": 0.803742802303263,
"grad_norm": 10.622344250001166,
"learning_rate": 5.641159121241953e-08,
"logits/chosen": -1.340012788772583,
"logits/rejected": -1.3097373247146606,
"logps/chosen": -495.234375,
"logps/rejected": -826.9255981445312,
"loss": 0.4075,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.605710744857788,
"rewards/margins": 3.028745174407959,
"rewards/rejected": -5.634456157684326,
"step": 3350
},
{
"epoch": 0.8061420345489443,
"grad_norm": 15.132572988644833,
"learning_rate": 5.5093514801245106e-08,
"logits/chosen": -1.2543226480484009,
"logits/rejected": -1.3029847145080566,
"logps/chosen": -481.427734375,
"logps/rejected": -770.0972900390625,
"loss": 0.4025,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.3580222129821777,
"rewards/margins": 2.690171241760254,
"rewards/rejected": -5.04819393157959,
"step": 3360
},
{
"epoch": 0.8085412667946257,
"grad_norm": 14.038257828539548,
"learning_rate": 5.378911030565453e-08,
"logits/chosen": -1.1603384017944336,
"logits/rejected": -1.213220238685608,
"logps/chosen": -554.8784790039062,
"logps/rejected": -800.203369140625,
"loss": 0.4264,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.7545363903045654,
"rewards/margins": 2.177790403366089,
"rewards/rejected": -4.932326793670654,
"step": 3370
},
{
"epoch": 0.8109404990403071,
"grad_norm": 14.786725967482855,
"learning_rate": 5.249846922444101e-08,
"logits/chosen": -1.3400354385375977,
"logits/rejected": -1.4116084575653076,
"logps/chosen": -469.63165283203125,
"logps/rejected": -946.8123779296875,
"loss": 0.3953,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.487299680709839,
"rewards/margins": 4.986563682556152,
"rewards/rejected": -7.473863124847412,
"step": 3380
},
{
"epoch": 0.8133397312859885,
"grad_norm": 21.495135238551697,
"learning_rate": 5.122168209094865e-08,
"logits/chosen": -1.2585008144378662,
"logits/rejected": -1.3433778285980225,
"logps/chosen": -421.2335510253906,
"logps/rejected": -548.8929443359375,
"loss": 0.4028,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.097749948501587,
"rewards/margins": 1.2572873830795288,
"rewards/rejected": -3.355037212371826,
"step": 3390
},
{
"epoch": 0.8157389635316699,
"grad_norm": 15.584784008274491,
"learning_rate": 4.995883846672222e-08,
"logits/chosen": -1.1034057140350342,
"logits/rejected": -1.2848924398422241,
"logps/chosen": -590.9141845703125,
"logps/rejected": -737.7780151367188,
"loss": 0.4146,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.305182933807373,
"rewards/margins": 2.196429491043091,
"rewards/rejected": -4.501612663269043,
"step": 3400
},
{
"epoch": 0.8181381957773513,
"grad_norm": 12.577138913457466,
"learning_rate": 4.871002693522486e-08,
"logits/chosen": -1.2216746807098389,
"logits/rejected": -1.2335078716278076,
"logps/chosen": -490.0227966308594,
"logps/rejected": -685.375732421875,
"loss": 0.4083,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.2040867805480957,
"rewards/margins": 2.2901692390441895,
"rewards/rejected": -4.494256496429443,
"step": 3410
},
{
"epoch": 0.8205374280230326,
"grad_norm": 12.780423712977628,
"learning_rate": 4.7475335095623956e-08,
"logits/chosen": -1.2985970973968506,
"logits/rejected": -1.2781016826629639,
"logps/chosen": -527.1070556640625,
"logps/rejected": -797.3029174804688,
"loss": 0.4308,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.7410807609558105,
"rewards/margins": 2.8199234008789062,
"rewards/rejected": -5.561004638671875,
"step": 3420
},
{
"epoch": 0.822936660268714,
"grad_norm": 25.453128411840094,
"learning_rate": 4.6254849556646714e-08,
"logits/chosen": -1.1118319034576416,
"logits/rejected": -1.207648515701294,
"logps/chosen": -549.860595703125,
"logps/rejected": -970.7677612304688,
"loss": 0.4555,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.535156488418579,
"rewards/margins": 4.580714702606201,
"rewards/rejected": -7.115871429443359,
"step": 3430
},
{
"epoch": 0.8253358925143954,
"grad_norm": 14.697372336337441,
"learning_rate": 4.504865593050483e-08,
"logits/chosen": -1.2074692249298096,
"logits/rejected": -1.22970449924469,
"logps/chosen": -493.60443115234375,
"logps/rejected": -733.1605224609375,
"loss": 0.4486,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2459022998809814,
"rewards/margins": 2.387206792831421,
"rewards/rejected": -4.633109092712402,
"step": 3440
},
{
"epoch": 0.8277351247600768,
"grad_norm": 19.145579102229355,
"learning_rate": 4.385683882688895e-08,
"logits/chosen": -1.0796090364456177,
"logits/rejected": -1.2019519805908203,
"logps/chosen": -480.3756408691406,
"logps/rejected": -558.4412841796875,
"loss": 0.4614,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.0972506999969482,
"rewards/margins": 1.18562912940979,
"rewards/rejected": -3.282879590988159,
"step": 3450
},
{
"epoch": 0.8301343570057581,
"grad_norm": 17.204779447658726,
"learning_rate": 4.2679481847033985e-08,
"logits/chosen": -1.2189116477966309,
"logits/rejected": -1.2864820957183838,
"logps/chosen": -446.82867431640625,
"logps/rejected": -702.0382080078125,
"loss": 0.4466,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8683035373687744,
"rewards/margins": 2.547211170196533,
"rewards/rejected": -4.415513515472412,
"step": 3460
},
{
"epoch": 0.8325335892514395,
"grad_norm": 14.516749938972369,
"learning_rate": 4.151666757785435e-08,
"logits/chosen": -1.1446878910064697,
"logits/rejected": -1.2078421115875244,
"logps/chosen": -391.42791748046875,
"logps/rejected": -725.2740478515625,
"loss": 0.397,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.532698631286621,
"rewards/margins": 3.396210193634033,
"rewards/rejected": -4.928908348083496,
"step": 3470
},
{
"epoch": 0.8349328214971209,
"grad_norm": 13.196683844151892,
"learning_rate": 4.036847758615136e-08,
"logits/chosen": -1.091759443283081,
"logits/rejected": -1.2269001007080078,
"logps/chosen": -532.867431640625,
"logps/rejected": -730.4796142578125,
"loss": 0.4464,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.9313502311706543,
"rewards/margins": 1.9276530742645264,
"rewards/rejected": -4.859004020690918,
"step": 3480
},
{
"epoch": 0.8373320537428023,
"grad_norm": 12.89862131476654,
"learning_rate": 3.923499241289113e-08,
"logits/chosen": -1.1648555994033813,
"logits/rejected": -1.3147923946380615,
"logps/chosen": -521.7337646484375,
"logps/rejected": -674.8948364257812,
"loss": 0.4302,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.260921001434326,
"rewards/margins": 2.081796646118164,
"rewards/rejected": -4.34271764755249,
"step": 3490
},
{
"epoch": 0.8397312859884837,
"grad_norm": 14.173962004603878,
"learning_rate": 3.811629156755541e-08,
"logits/chosen": -1.160355567932129,
"logits/rejected": -1.185987949371338,
"logps/chosen": -477.21728515625,
"logps/rejected": -679.4395751953125,
"loss": 0.4274,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.9270728826522827,
"rewards/margins": 2.1360950469970703,
"rewards/rejected": -4.063167572021484,
"step": 3500
},
{
"epoch": 0.8421305182341651,
"grad_norm": 11.942462848386326,
"learning_rate": 3.701245352256391e-08,
"logits/chosen": -1.2002038955688477,
"logits/rejected": -1.324733018875122,
"logps/chosen": -478.535400390625,
"logps/rejected": -557.5496826171875,
"loss": 0.4283,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.8901561498641968,
"rewards/margins": 1.0154350996017456,
"rewards/rejected": -2.9055914878845215,
"step": 3510
},
{
"epoch": 0.8445297504798465,
"grad_norm": 20.5827327935098,
"learning_rate": 3.592355570776984e-08,
"logits/chosen": -1.173332929611206,
"logits/rejected": -1.2609224319458008,
"logps/chosen": -360.0851745605469,
"logps/rejected": -583.3133544921875,
"loss": 0.4195,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4041774272918701,
"rewards/margins": 2.1920089721679688,
"rewards/rejected": -3.5961861610412598,
"step": 3520
},
{
"epoch": 0.8469289827255279,
"grad_norm": 10.471012375510664,
"learning_rate": 3.484967450502904e-08,
"logits/chosen": -1.1107840538024902,
"logits/rejected": -1.2250497341156006,
"logps/chosen": -374.8684997558594,
"logps/rejected": -664.1340942382812,
"loss": 0.4038,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.7912622690200806,
"rewards/margins": 2.5201168060302734,
"rewards/rejected": -4.3113789558410645,
"step": 3530
},
{
"epoch": 0.8493282149712092,
"grad_norm": 20.111970761124727,
"learning_rate": 3.3790885242841296e-08,
"logits/chosen": -1.0951917171478271,
"logits/rejected": -1.1834380626678467,
"logps/chosen": -451.64190673828125,
"logps/rejected": -770.8523559570312,
"loss": 0.3789,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.0603652000427246,
"rewards/margins": 3.2534384727478027,
"rewards/rejected": -5.313803672790527,
"step": 3540
},
{
"epoch": 0.8517274472168906,
"grad_norm": 15.121338349842414,
"learning_rate": 3.274726219106677e-08,
"logits/chosen": -1.051578402519226,
"logits/rejected": -1.135506272315979,
"logps/chosen": -485.54840087890625,
"logps/rejected": -721.75439453125,
"loss": 0.4442,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.1253767013549805,
"rewards/margins": 2.3960883617401123,
"rewards/rejected": -4.521464824676514,
"step": 3550
},
{
"epoch": 0.8541266794625719,
"grad_norm": 15.152890802383466,
"learning_rate": 3.171887855571642e-08,
"logits/chosen": -1.2348651885986328,
"logits/rejected": -1.2165499925613403,
"logps/chosen": -395.697509765625,
"logps/rejected": -543.8568115234375,
"loss": 0.3826,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7078378200531006,
"rewards/margins": 1.5240461826324463,
"rewards/rejected": -3.231884002685547,
"step": 3560
},
{
"epoch": 0.8565259117082533,
"grad_norm": 24.44597593565566,
"learning_rate": 3.070580647381643e-08,
"logits/chosen": -1.1522376537322998,
"logits/rejected": -1.2483961582183838,
"logps/chosen": -406.069091796875,
"logps/rejected": -749.8062133789062,
"loss": 0.4548,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.753334403038025,
"rewards/margins": 3.3889122009277344,
"rewards/rejected": -5.142246246337891,
"step": 3570
},
{
"epoch": 0.8589251439539347,
"grad_norm": 15.075419291770242,
"learning_rate": 2.9708117008348576e-08,
"logits/chosen": -1.2388461828231812,
"logits/rejected": -1.3630428314208984,
"logps/chosen": -477.43585205078125,
"logps/rejected": -610.2297973632812,
"loss": 0.3969,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.715201735496521,
"rewards/margins": 1.8291780948638916,
"rewards/rejected": -3.544379711151123,
"step": 3580
},
{
"epoch": 0.8613243761996161,
"grad_norm": 11.602973232421764,
"learning_rate": 2.8725880143264992e-08,
"logits/chosen": -1.19898521900177,
"logits/rejected": -1.2185986042022705,
"logps/chosen": -449.32598876953125,
"logps/rejected": -633.5596923828125,
"loss": 0.465,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.1087489128112793,
"rewards/margins": 1.4358699321746826,
"rewards/rejected": -3.544618606567383,
"step": 3590
},
{
"epoch": 0.8637236084452975,
"grad_norm": 21.3005453219283,
"learning_rate": 2.775916477857948e-08,
"logits/chosen": -1.1370158195495605,
"logits/rejected": -1.1749649047851562,
"logps/chosen": -402.0398254394531,
"logps/rejected": -587.6260375976562,
"loss": 0.413,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9864225387573242,
"rewards/margins": 1.8402111530303955,
"rewards/rejected": -3.8266334533691406,
"step": 3600
},
{
"epoch": 0.8661228406909789,
"grad_norm": 15.387678499543219,
"learning_rate": 2.680803872553408e-08,
"logits/chosen": -1.2096471786499023,
"logits/rejected": -1.3197107315063477,
"logps/chosen": -445.0570373535156,
"logps/rejected": -827.9861450195312,
"loss": 0.4225,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.9052565097808838,
"rewards/margins": 4.059884071350098,
"rewards/rejected": -5.965140342712402,
"step": 3610
},
{
"epoch": 0.8685220729366603,
"grad_norm": 24.644670995274364,
"learning_rate": 2.5872568701842706e-08,
"logits/chosen": -1.2481223344802856,
"logits/rejected": -1.3282666206359863,
"logps/chosen": -388.14715576171875,
"logps/rejected": -630.1165161132812,
"loss": 0.4617,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7502208948135376,
"rewards/margins": 2.1968894004821777,
"rewards/rejected": -3.9471099376678467,
"step": 3620
},
{
"epoch": 0.8709213051823417,
"grad_norm": 20.103907535930773,
"learning_rate": 2.495282032701096e-08,
"logits/chosen": -1.1463677883148193,
"logits/rejected": -1.3338615894317627,
"logps/chosen": -351.9326477050781,
"logps/rejected": -538.684814453125,
"loss": 0.4117,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.522869348526001,
"rewards/margins": 2.161909580230713,
"rewards/rejected": -3.684778928756714,
"step": 3630
},
{
"epoch": 0.8733205374280231,
"grad_norm": 14.949192367966893,
"learning_rate": 2.4048858117733133e-08,
"logits/chosen": -1.2300177812576294,
"logits/rejected": -1.345139741897583,
"logps/chosen": -454.37628173828125,
"logps/rejected": -743.2384033203125,
"loss": 0.3729,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8625742197036743,
"rewards/margins": 3.337106227874756,
"rewards/rejected": -5.199681282043457,
"step": 3640
},
{
"epoch": 0.8757197696737045,
"grad_norm": 17.259398265317675,
"learning_rate": 2.3160745483366938e-08,
"logits/chosen": -1.2201354503631592,
"logits/rejected": -1.259948492050171,
"logps/chosen": -431.505126953125,
"logps/rejected": -647.8377685546875,
"loss": 0.4208,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.0006277561187744,
"rewards/margins": 1.8061168193817139,
"rewards/rejected": -3.806744337081909,
"step": 3650
},
{
"epoch": 0.8781190019193857,
"grad_norm": 26.593038878856742,
"learning_rate": 2.2288544721485197e-08,
"logits/chosen": -1.1473274230957031,
"logits/rejected": -1.1531964540481567,
"logps/chosen": -367.77691650390625,
"logps/rejected": -670.8995361328125,
"loss": 0.4029,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.509234070777893,
"rewards/margins": 2.8308663368225098,
"rewards/rejected": -4.340100288391113,
"step": 3660
},
{
"epoch": 0.8805182341650671,
"grad_norm": 17.09126226618081,
"learning_rate": 2.1432317013506117e-08,
"logits/chosen": -1.2680007219314575,
"logits/rejected": -1.3855565786361694,
"logps/chosen": -459.4864807128906,
"logps/rejected": -617.2786254882812,
"loss": 0.4376,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.9778245687484741,
"rewards/margins": 1.9757041931152344,
"rewards/rejected": -3.953528881072998,
"step": 3670
},
{
"epoch": 0.8829174664107485,
"grad_norm": 22.964518655362124,
"learning_rate": 2.0592122420401704e-08,
"logits/chosen": -1.0826637744903564,
"logits/rejected": -1.218787431716919,
"logps/chosen": -442.7049865722656,
"logps/rejected": -622.1304931640625,
"loss": 0.4426,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.082751750946045,
"rewards/margins": 1.7740720510482788,
"rewards/rejected": -3.8568243980407715,
"step": 3680
},
{
"epoch": 0.8853166986564299,
"grad_norm": 16.601339844348978,
"learning_rate": 1.976801987848459e-08,
"logits/chosen": -1.2238892316818237,
"logits/rejected": -1.2646934986114502,
"logps/chosen": -454.4891052246094,
"logps/rejected": -779.9896850585938,
"loss": 0.4293,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.8976598978042603,
"rewards/margins": 3.114412784576416,
"rewards/rejected": -5.012072563171387,
"step": 3690
},
{
"epoch": 0.8877159309021113,
"grad_norm": 17.11872715651324,
"learning_rate": 1.8960067195273987e-08,
"logits/chosen": -1.2767010927200317,
"logits/rejected": -1.3807860612869263,
"logps/chosen": -399.5985412597656,
"logps/rejected": -693.5151977539062,
"loss": 0.3976,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8493973016738892,
"rewards/margins": 2.9295060634613037,
"rewards/rejected": -4.778903484344482,
"step": 3700
},
{
"epoch": 0.8901151631477927,
"grad_norm": 16.6413432588174,
"learning_rate": 1.816832104544072e-08,
"logits/chosen": -1.098815679550171,
"logits/rejected": -1.1592333316802979,
"logps/chosen": -470.98553466796875,
"logps/rejected": -625.4569091796875,
"loss": 0.3886,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.982060432434082,
"rewards/margins": 1.7682058811187744,
"rewards/rejected": -3.7502663135528564,
"step": 3710
},
{
"epoch": 0.8925143953934741,
"grad_norm": 11.891797748013655,
"learning_rate": 1.7392836966831553e-08,
"logits/chosen": -1.0697181224822998,
"logits/rejected": -1.194319486618042,
"logps/chosen": -475.51898193359375,
"logps/rejected": -717.6954345703125,
"loss": 0.3918,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.0813956260681152,
"rewards/margins": 2.7703046798706055,
"rewards/rejected": -4.851700782775879,
"step": 3720
},
{
"epoch": 0.8949136276391555,
"grad_norm": 17.887300709912445,
"learning_rate": 1.663366935657373e-08,
"logits/chosen": -1.2668213844299316,
"logits/rejected": -1.4020304679870605,
"logps/chosen": -410.6656188964844,
"logps/rejected": -631.67041015625,
"loss": 0.4465,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.830963373184204,
"rewards/margins": 2.1420297622680664,
"rewards/rejected": -3.9729931354522705,
"step": 3730
},
{
"epoch": 0.8973128598848369,
"grad_norm": 21.650200131935442,
"learning_rate": 1.5890871467258898e-08,
"logits/chosen": -1.0380961894989014,
"logits/rejected": -1.1259255409240723,
"logps/chosen": -538.32568359375,
"logps/rejected": -709.1238403320312,
"loss": 0.4203,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.206559896469116,
"rewards/margins": 2.0453429222106934,
"rewards/rejected": -4.2519025802612305,
"step": 3740
},
{
"epoch": 0.8997120921305183,
"grad_norm": 12.405877408803793,
"learning_rate": 1.5164495403207967e-08,
"logits/chosen": -1.2166404724121094,
"logits/rejected": -1.2211335897445679,
"logps/chosen": -480.22528076171875,
"logps/rejected": -792.5941162109375,
"loss": 0.3954,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.314863443374634,
"rewards/margins": 2.8861892223358154,
"rewards/rejected": -5.201052665710449,
"step": 3750
},
{
"epoch": 0.9021113243761996,
"grad_norm": 12.728466226110546,
"learning_rate": 1.4454592116815962e-08,
"logits/chosen": -1.1239063739776611,
"logits/rejected": -1.1524550914764404,
"logps/chosen": -412.11602783203125,
"logps/rejected": -646.6449584960938,
"loss": 0.3604,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.66254460811615,
"rewards/margins": 2.2538654804229736,
"rewards/rejected": -3.916410446166992,
"step": 3760
},
{
"epoch": 0.904510556621881,
"grad_norm": 11.308755733612493,
"learning_rate": 1.3761211404977934e-08,
"logits/chosen": -1.2462382316589355,
"logits/rejected": -1.2617241144180298,
"logps/chosen": -481.52642822265625,
"logps/rejected": -763.2169189453125,
"loss": 0.3466,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.5226385593414307,
"rewards/margins": 2.925380229949951,
"rewards/rejected": -5.448019504547119,
"step": 3770
},
{
"epoch": 0.9069097888675623,
"grad_norm": 20.409261229736146,
"learning_rate": 1.3084401905596177e-08,
"logits/chosen": -1.1374547481536865,
"logits/rejected": -1.3039876222610474,
"logps/chosen": -499.5015563964844,
"logps/rejected": -709.859375,
"loss": 0.4434,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.0284790992736816,
"rewards/margins": 2.706104040145874,
"rewards/rejected": -4.734583377838135,
"step": 3780
},
{
"epoch": 0.9093090211132437,
"grad_norm": 17.1782781112593,
"learning_rate": 1.2424211094168053e-08,
"logits/chosen": -1.1101362705230713,
"logits/rejected": -1.2460038661956787,
"logps/chosen": -517.1344604492188,
"logps/rejected": -742.2302856445312,
"loss": 0.4041,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.067178726196289,
"rewards/margins": 2.329007625579834,
"rewards/rejected": -4.396185874938965,
"step": 3790
},
{
"epoch": 0.9117082533589251,
"grad_norm": 30.065326876665342,
"learning_rate": 1.1780685280456143e-08,
"logits/chosen": -1.237917184829712,
"logits/rejected": -1.3036408424377441,
"logps/chosen": -539.2681274414062,
"logps/rejected": -925.9050903320312,
"loss": 0.4439,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.540894031524658,
"rewards/margins": 3.8281638622283936,
"rewards/rejected": -6.369057655334473,
"step": 3800
},
{
"epoch": 0.9141074856046065,
"grad_norm": 20.319800037171195,
"learning_rate": 1.1153869605239564e-08,
"logits/chosen": -1.231994390487671,
"logits/rejected": -1.3546103239059448,
"logps/chosen": -441.52520751953125,
"logps/rejected": -568.5699462890625,
"loss": 0.4091,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.8595590591430664,
"rewards/margins": 1.6289126873016357,
"rewards/rejected": -3.488471508026123,
"step": 3810
},
{
"epoch": 0.9165067178502879,
"grad_norm": 17.420902765437226,
"learning_rate": 1.0543808037147606e-08,
"logits/chosen": -1.2463701963424683,
"logits/rejected": -1.2932254076004028,
"logps/chosen": -475.19195556640625,
"logps/rejected": -830.6253051757812,
"loss": 0.394,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.176619052886963,
"rewards/margins": 3.5470027923583984,
"rewards/rejected": -5.7236223220825195,
"step": 3820
},
{
"epoch": 0.9189059500959693,
"grad_norm": 13.877437771957279,
"learning_rate": 9.95054336957557e-09,
"logits/chosen": -1.245715618133545,
"logits/rejected": -1.245986819267273,
"logps/chosen": -431.06024169921875,
"logps/rejected": -607.4310302734375,
"loss": 0.4043,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.873225450515747,
"rewards/margins": 1.7056442499160767,
"rewards/rejected": -3.578869581222534,
"step": 3830
},
{
"epoch": 0.9213051823416507,
"grad_norm": 22.780099057725195,
"learning_rate": 9.37411721768286e-09,
"logits/chosen": -1.3027960062026978,
"logits/rejected": -1.3543469905853271,
"logps/chosen": -500.34527587890625,
"logps/rejected": -797.4141845703125,
"loss": 0.3995,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.248213291168213,
"rewards/margins": 2.695244312286377,
"rewards/rejected": -4.943457126617432,
"step": 3840
},
{
"epoch": 0.9237044145873321,
"grad_norm": 18.879115732312695,
"learning_rate": 8.81457001547392e-09,
"logits/chosen": -1.179421067237854,
"logits/rejected": -1.2008545398712158,
"logps/chosen": -445.45611572265625,
"logps/rejected": -636.3636474609375,
"loss": 0.3706,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.106031894683838,
"rewards/margins": 1.8034794330596924,
"rewards/rejected": -3.909511089324951,
"step": 3850
},
{
"epoch": 0.9261036468330134,
"grad_norm": 13.946836764722176,
"learning_rate": 8.271941012961942e-09,
"logits/chosen": -1.1962236166000366,
"logits/rejected": -1.1757996082305908,
"logps/chosen": -434.50238037109375,
"logps/rejected": -851.7151489257812,
"loss": 0.4029,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.2081894874572754,
"rewards/margins": 3.6091766357421875,
"rewards/rejected": -5.817366600036621,
"step": 3860
},
{
"epoch": 0.9285028790786948,
"grad_norm": 21.809897538914896,
"learning_rate": 7.746268273415568e-09,
"logits/chosen": -1.3298779726028442,
"logits/rejected": -1.2550171613693237,
"logps/chosen": -448.08612060546875,
"logps/rejected": -611.9649658203125,
"loss": 0.3966,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8825676441192627,
"rewards/margins": 1.1555382013320923,
"rewards/rejected": -3.0381054878234863,
"step": 3870
},
{
"epoch": 0.9309021113243762,
"grad_norm": 13.467439618498904,
"learning_rate": 7.237588670689076e-09,
"logits/chosen": -1.2014961242675781,
"logits/rejected": -1.348229169845581,
"logps/chosen": -491.67352294921875,
"logps/rejected": -760.8187866210938,
"loss": 0.3867,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.231884479522705,
"rewards/margins": 3.0948691368103027,
"rewards/rejected": -5.32675313949585,
"step": 3880
},
{
"epoch": 0.9333013435700576,
"grad_norm": 17.106585504517614,
"learning_rate": 6.745937886635606e-09,
"logits/chosen": -1.239768624305725,
"logits/rejected": -1.2870354652404785,
"logps/chosen": -483.9998474121094,
"logps/rejected": -888.3824462890625,
"loss": 0.4068,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.029411554336548,
"rewards/margins": 4.014785289764404,
"rewards/rejected": -6.044196128845215,
"step": 3890
},
{
"epoch": 0.935700575815739,
"grad_norm": 17.65443575752541,
"learning_rate": 6.271350408604409e-09,
"logits/chosen": -1.2722991704940796,
"logits/rejected": -1.2935806512832642,
"logps/chosen": -389.75628662109375,
"logps/rejected": -633.9022216796875,
"loss": 0.3956,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.7770601511001587,
"rewards/margins": 2.264246702194214,
"rewards/rejected": -4.041306495666504,
"step": 3900
},
{
"epoch": 0.9380998080614203,
"grad_norm": 12.612133126164304,
"learning_rate": 5.813859527021487e-09,
"logits/chosen": -1.22406804561615,
"logits/rejected": -1.323305606842041,
"logps/chosen": -508.7425842285156,
"logps/rejected": -782.00830078125,
"loss": 0.338,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.409405469894409,
"rewards/margins": 3.0094313621520996,
"rewards/rejected": -5.418837070465088,
"step": 3910
},
{
"epoch": 0.9404990403071017,
"grad_norm": 13.294898316790011,
"learning_rate": 5.373497333054616e-09,
"logits/chosen": -1.2985012531280518,
"logits/rejected": -1.3380292654037476,
"logps/chosen": -493.2757873535156,
"logps/rejected": -623.1602783203125,
"loss": 0.4384,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.2983577251434326,
"rewards/margins": 1.3695790767669678,
"rewards/rejected": -3.6679370403289795,
"step": 3920
},
{
"epoch": 0.9428982725527831,
"grad_norm": 15.284137135135785,
"learning_rate": 4.950294716362213e-09,
"logits/chosen": -1.2158329486846924,
"logits/rejected": -1.3253077268600464,
"logps/chosen": -508.41424560546875,
"logps/rejected": -638.3739624023438,
"loss": 0.4209,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.2684988975524902,
"rewards/margins": 1.3235927820205688,
"rewards/rejected": -3.5920920372009277,
"step": 3930
},
{
"epoch": 0.9452975047984645,
"grad_norm": 15.219945748745712,
"learning_rate": 4.544281362926422e-09,
"logits/chosen": -1.1814640760421753,
"logits/rejected": -1.213180422782898,
"logps/chosen": -480.6612854003906,
"logps/rejected": -714.3478393554688,
"loss": 0.443,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.8659789562225342,
"rewards/margins": 2.4482996463775635,
"rewards/rejected": -4.314279079437256,
"step": 3940
},
{
"epoch": 0.9476967370441459,
"grad_norm": 12.199705253251745,
"learning_rate": 4.15548575297095e-09,
"logits/chosen": -1.1539068222045898,
"logits/rejected": -1.2610228061676025,
"logps/chosen": -470.3968811035156,
"logps/rejected": -807.7662353515625,
"loss": 0.3508,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.2147834300994873,
"rewards/margins": 3.456162214279175,
"rewards/rejected": -5.6709465980529785,
"step": 3950
},
{
"epoch": 0.9500959692898272,
"grad_norm": 9.529613600157132,
"learning_rate": 3.7839351589631366e-09,
"logits/chosen": -1.2287517786026,
"logits/rejected": -1.1175611019134521,
"logps/chosen": -470.9747009277344,
"logps/rejected": -738.1583251953125,
"loss": 0.4044,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.675825595855713,
"rewards/margins": 2.222598075866699,
"rewards/rejected": -4.89842414855957,
"step": 3960
},
{
"epoch": 0.9524952015355086,
"grad_norm": 18.605428038684874,
"learning_rate": 3.4296556437010405e-09,
"logits/chosen": -1.2765130996704102,
"logits/rejected": -1.2980254888534546,
"logps/chosen": -460.11541748046875,
"logps/rejected": -656.4251098632812,
"loss": 0.4182,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.518393039703369,
"rewards/margins": 2.0247464179992676,
"rewards/rejected": -4.543139457702637,
"step": 3970
},
{
"epoch": 0.95489443378119,
"grad_norm": 14.363883415576543,
"learning_rate": 3.092672058485124e-09,
"logits/chosen": -1.3632985353469849,
"logits/rejected": -1.3597389459609985,
"logps/chosen": -513.4880981445312,
"logps/rejected": -859.6558837890625,
"loss": 0.4086,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.856234550476074,
"rewards/margins": 3.326939821243286,
"rewards/rejected": -6.183174133300781,
"step": 3980
},
{
"epoch": 0.9572936660268714,
"grad_norm": 24.170474346453393,
"learning_rate": 2.7730080413750356e-09,
"logits/chosen": -1.1727163791656494,
"logits/rejected": -1.296337604522705,
"logps/chosen": -467.4671325683594,
"logps/rejected": -634.9149169921875,
"loss": 0.4019,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.021527051925659,
"rewards/margins": 1.6675231456756592,
"rewards/rejected": -3.6890506744384766,
"step": 3990
},
{
"epoch": 0.9596928982725528,
"grad_norm": 15.145186840475311,
"learning_rate": 2.4706860155316033e-09,
"logits/chosen": -1.1688404083251953,
"logits/rejected": -1.2660518884658813,
"logps/chosen": -569.9762573242188,
"logps/rejected": -844.1318359375,
"loss": 0.3981,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.3475515842437744,
"rewards/margins": 2.774146556854248,
"rewards/rejected": -5.121697902679443,
"step": 4000
},
{
"epoch": 0.9620921305182342,
"grad_norm": 20.980446798559257,
"learning_rate": 2.185727187643843e-09,
"logits/chosen": -1.2156826257705688,
"logits/rejected": -1.2687807083129883,
"logps/chosen": -427.87774658203125,
"logps/rejected": -791.0867309570312,
"loss": 0.4573,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.1100106239318848,
"rewards/margins": 3.6505370140075684,
"rewards/rejected": -5.7605485916137695,
"step": 4010
},
{
"epoch": 0.9644913627639156,
"grad_norm": 26.09657808054691,
"learning_rate": 1.9181515464413434e-09,
"logits/chosen": -1.0822070837020874,
"logits/rejected": -1.1274657249450684,
"logps/chosen": -565.4129028320312,
"logps/rejected": -877.5753784179688,
"loss": 0.3914,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.3460755348205566,
"rewards/margins": 3.1658713817596436,
"rewards/rejected": -5.511946678161621,
"step": 4020
},
{
"epoch": 0.966890595009597,
"grad_norm": 20.358263015517405,
"learning_rate": 1.6679778612923302e-09,
"logits/chosen": -1.2023911476135254,
"logits/rejected": -1.3535006046295166,
"logps/chosen": -511.30792236328125,
"logps/rejected": -691.3760375976562,
"loss": 0.3992,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.373955011367798,
"rewards/margins": 1.693084716796875,
"rewards/rejected": -4.0670390129089355,
"step": 4030
},
{
"epoch": 0.9692898272552783,
"grad_norm": 17.09466339113468,
"learning_rate": 1.43522368088686e-09,
"logits/chosen": -1.2254010438919067,
"logits/rejected": -1.3339544534683228,
"logps/chosen": -497.1849670410156,
"logps/rejected": -889.703125,
"loss": 0.48,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.4439101219177246,
"rewards/margins": 3.9405925273895264,
"rewards/rejected": -6.384502410888672,
"step": 4040
},
{
"epoch": 0.9716890595009597,
"grad_norm": 17.680515818931642,
"learning_rate": 1.2199053320059993e-09,
"logits/chosen": -1.2316696643829346,
"logits/rejected": -1.2435214519500732,
"logps/chosen": -472.69000244140625,
"logps/rejected": -706.5465087890625,
"loss": 0.3977,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.054659605026245,
"rewards/margins": 2.194859504699707,
"rewards/rejected": -4.249519348144531,
"step": 4050
},
{
"epoch": 0.974088291746641,
"grad_norm": 19.737454721261717,
"learning_rate": 1.0220379183764338e-09,
"logits/chosen": -1.2599509954452515,
"logits/rejected": -1.2703198194503784,
"logps/chosen": -364.81658935546875,
"logps/rejected": -650.0493774414062,
"loss": 0.3815,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5390068292617798,
"rewards/margins": 2.7725987434387207,
"rewards/rejected": -4.311606407165527,
"step": 4060
},
{
"epoch": 0.9764875239923224,
"grad_norm": 20.629402096960852,
"learning_rate": 8.416353196111503e-10,
"logits/chosen": -1.2555500268936157,
"logits/rejected": -1.2429850101470947,
"logps/chosen": -496.2998046875,
"logps/rejected": -772.2160034179688,
"loss": 0.4317,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.5762507915496826,
"rewards/margins": 2.843451499938965,
"rewards/rejected": -5.419702053070068,
"step": 4070
},
{
"epoch": 0.9788867562380038,
"grad_norm": 21.54194458199129,
"learning_rate": 6.787101902356873e-10,
"logits/chosen": -1.3214588165283203,
"logits/rejected": -1.3182449340820312,
"logps/chosen": -515.1784057617188,
"logps/rejected": -786.1739501953125,
"loss": 0.4275,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.5369973182678223,
"rewards/margins": 2.521944999694824,
"rewards/rejected": -5.058941841125488,
"step": 4080
},
{
"epoch": 0.9812859884836852,
"grad_norm": 22.64402950848799,
"learning_rate": 5.332739588005953e-10,
"logits/chosen": -1.2550867795944214,
"logits/rejected": -1.322644591331482,
"logps/chosen": -376.3187561035156,
"logps/rejected": -727.2735595703125,
"loss": 0.4059,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7436511516571045,
"rewards/margins": 3.295630693435669,
"rewards/rejected": -5.039282321929932,
"step": 4090
},
{
"epoch": 0.9836852207293666,
"grad_norm": 22.391181670910232,
"learning_rate": 4.053368270797164e-10,
"logits/chosen": -1.2337408065795898,
"logits/rejected": -1.2769973278045654,
"logps/chosen": -468.5732421875,
"logps/rejected": -767.9414672851562,
"loss": 0.4017,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.362514019012451,
"rewards/margins": 3.0355606079101562,
"rewards/rejected": -5.398074150085449,
"step": 4100
},
{
"epoch": 0.986084452975048,
"grad_norm": 14.32266939448474,
"learning_rate": 2.949077693545354e-10,
"logits/chosen": -1.1863398551940918,
"logits/rejected": -1.2951616048812866,
"logps/chosen": -504.52801513671875,
"logps/rejected": -719.0045166015625,
"loss": 0.4678,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.333911895751953,
"rewards/margins": 1.8691895008087158,
"rewards/rejected": -4.203102111816406,
"step": 4110
},
{
"epoch": 0.9884836852207294,
"grad_norm": 23.976635069300592,
"learning_rate": 2.0199453178471047e-10,
"logits/chosen": -1.126481294631958,
"logits/rejected": -1.2799094915390015,
"logps/chosen": -538.2355346679688,
"logps/rejected": -603.18994140625,
"loss": 0.4332,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.318129062652588,
"rewards/margins": 1.0544296503067017,
"rewards/rejected": -3.372559070587158,
"step": 4120
},
{
"epoch": 0.9908829174664108,
"grad_norm": 40.95175275330808,
"learning_rate": 1.266036318647301e-10,
"logits/chosen": -1.2519207000732422,
"logits/rejected": -1.3095006942749023,
"logps/chosen": -540.7717895507812,
"logps/rejected": -778.435791015625,
"loss": 0.4474,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.4916446208953857,
"rewards/margins": 2.5904107093811035,
"rewards/rejected": -5.08205509185791,
"step": 4130
},
{
"epoch": 0.9932821497120922,
"grad_norm": 17.066173818194596,
"learning_rate": 6.874035796672339e-11,
"logits/chosen": -1.2024726867675781,
"logits/rejected": -1.2957074642181396,
"logps/chosen": -512.9348754882812,
"logps/rejected": -842.3238525390625,
"loss": 0.4125,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.164402961730957,
"rewards/margins": 3.843069076538086,
"rewards/rejected": -6.007472515106201,
"step": 4140
},
{
"epoch": 0.9956813819577736,
"grad_norm": 20.618667625472924,
"learning_rate": 2.8408768969423458e-11,
"logits/chosen": -1.2141971588134766,
"logits/rejected": -1.2368704080581665,
"logps/chosen": -452.35345458984375,
"logps/rejected": -660.5663452148438,
"loss": 0.3909,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.867018699645996,
"rewards/margins": 2.0285303592681885,
"rewards/rejected": -3.8955492973327637,
"step": 4150
},
{
"epoch": 0.9980806142034548,
"grad_norm": 21.52118343490649,
"learning_rate": 5.611693973617271e-12,
"logits/chosen": -1.2614226341247559,
"logits/rejected": -1.27662193775177,
"logps/chosen": -454.05596923828125,
"logps/rejected": -670.5745239257812,
"loss": 0.4453,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.251619338989258,
"rewards/margins": 2.0149483680725098,
"rewards/rejected": -4.266567707061768,
"step": 4160
},
{
"epoch": 1.0,
"step": 4168,
"total_flos": 0.0,
"train_loss": 0.4679888197991303,
"train_runtime": 14179.2142,
"train_samples_per_second": 9.406,
"train_steps_per_second": 0.294
}
],
"logging_steps": 10,
"max_steps": 4168,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}