llama3_l5_best_entropy / trainer_state.json
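The JSON below is the raw checkpoint state written out by the Hugging Face Trainer. As a minimal sketch (assuming a local copy of this file and the standard trainer_state.json layout shown here; the paths, variable names, and use of matplotlib are illustrative, not part of the original repo), the logged loss and semantic_entropy curves in log_history could be inspected like this:

import json
import matplotlib.pyplot as plt

# Assumed local copy of the trainer_state.json shown below.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries log "loss"; evaluation entries use "eval_loss" instead,
# so filtering on "loss" keeps only the training-step records.
train_logs = [e for e in state["log_history"] if "loss" in e]

steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]
entropies = [e["semantic_entropy"] for e in train_logs]

fig, ax_loss = plt.subplots()
ax_loss.plot(steps, losses, label="train loss")
ax_loss.set_xlabel("global step")
ax_loss.set_ylabel("loss")

# Second y-axis for the semantic entropy logged alongside the loss.
ax_ent = ax_loss.twinx()
ax_ent.plot(steps, entropies, color="tab:orange", label="semantic_entropy")
ax_ent.set_ylabel("semantic_entropy")

fig.tight_layout()
fig.savefig("training_curves.png")  # illustrative output filename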
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9978142076502732,
"eval_steps": 400,
"global_step": 914,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01092896174863388,
"grad_norm": 47.87060782291424,
"learning_rate": 5.434782608695652e-08,
"logits/chosen": -1.0122432708740234,
"logits/rejected": -1.0073297023773193,
"logps/chosen": -0.28066128492355347,
"logps/rejected": -0.2858629524707794,
"loss": 3.1518,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.806612968444824,
"rewards/margins": 0.05201658606529236,
"rewards/rejected": -2.8586294651031494,
"semantic_entropy": 0.7517332434654236,
"step": 5
},
{
"epoch": 0.02185792349726776,
"grad_norm": 63.59519845931534,
"learning_rate": 1.0869565217391303e-07,
"logits/chosen": -1.0451396703720093,
"logits/rejected": -0.9949606657028198,
"logps/chosen": -0.25711697340011597,
"logps/rejected": -0.27150270342826843,
"loss": 3.1207,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.57116961479187,
"rewards/margins": 0.14385755360126495,
"rewards/rejected": -2.715027332305908,
"semantic_entropy": 0.7098506689071655,
"step": 10
},
{
"epoch": 0.03278688524590164,
"grad_norm": 52.932404991436066,
"learning_rate": 1.6304347826086955e-07,
"logits/chosen": -1.0101398229599,
"logits/rejected": -0.9632788896560669,
"logps/chosen": -0.2672443389892578,
"logps/rejected": -0.2731854319572449,
"loss": 3.1124,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.6724436283111572,
"rewards/margins": 0.05941082164645195,
"rewards/rejected": -2.731854200363159,
"semantic_entropy": 0.7272862195968628,
"step": 15
},
{
"epoch": 0.04371584699453552,
"grad_norm": 68.70297338794734,
"learning_rate": 2.1739130434782607e-07,
"logits/chosen": -0.946621298789978,
"logits/rejected": -0.8962594270706177,
"logps/chosen": -0.2722616195678711,
"logps/rejected": -0.2844754159450531,
"loss": 3.1543,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.722616195678711,
"rewards/margins": 0.12213809788227081,
"rewards/rejected": -2.844754219055176,
"semantic_entropy": 0.7445966601371765,
"step": 20
},
{
"epoch": 0.0546448087431694,
"grad_norm": 34.23797136353184,
"learning_rate": 2.717391304347826e-07,
"logits/chosen": -0.9447389841079712,
"logits/rejected": -0.8695358037948608,
"logps/chosen": -0.27488625049591064,
"logps/rejected": -0.29340118169784546,
"loss": 3.1248,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.7488627433776855,
"rewards/margins": 0.1851491630077362,
"rewards/rejected": -2.934011936187744,
"semantic_entropy": 0.753722071647644,
"step": 25
},
{
"epoch": 0.06557377049180328,
"grad_norm": 56.95442636508264,
"learning_rate": 3.260869565217391e-07,
"logits/chosen": -1.0504213571548462,
"logits/rejected": -0.9853544235229492,
"logps/chosen": -0.26506370306015015,
"logps/rejected": -0.2821282744407654,
"loss": 3.1282,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.650637149810791,
"rewards/margins": 0.1706458032131195,
"rewards/rejected": -2.8212831020355225,
"semantic_entropy": 0.7199792861938477,
"step": 30
},
{
"epoch": 0.07650273224043716,
"grad_norm": 54.514089612724746,
"learning_rate": 3.8043478260869567e-07,
"logits/chosen": -1.0058822631835938,
"logits/rejected": -0.9390825033187866,
"logps/chosen": -0.2544824182987213,
"logps/rejected": -0.2758719325065613,
"loss": 3.1,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.5448241233825684,
"rewards/margins": 0.2138955146074295,
"rewards/rejected": -2.7587194442749023,
"semantic_entropy": 0.714081346988678,
"step": 35
},
{
"epoch": 0.08743169398907104,
"grad_norm": 61.13897060157166,
"learning_rate": 4.3478260869565214e-07,
"logits/chosen": -0.9637517929077148,
"logits/rejected": -0.9011168479919434,
"logps/chosen": -0.28103750944137573,
"logps/rejected": -0.29354166984558105,
"loss": 3.1681,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.8103749752044678,
"rewards/margins": 0.12504148483276367,
"rewards/rejected": -2.9354166984558105,
"semantic_entropy": 0.7535971999168396,
"step": 40
},
{
"epoch": 0.09836065573770492,
"grad_norm": 29.50202425422368,
"learning_rate": 4.891304347826087e-07,
"logits/chosen": -1.011054515838623,
"logits/rejected": -0.9284116625785828,
"logps/chosen": -0.28203994035720825,
"logps/rejected": -0.3046588599681854,
"loss": 3.106,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -2.820399522781372,
"rewards/margins": 0.2261890470981598,
"rewards/rejected": -3.04658842086792,
"semantic_entropy": 0.7553126811981201,
"step": 45
},
{
"epoch": 0.1092896174863388,
"grad_norm": 60.818918802477036,
"learning_rate": 5.434782608695652e-07,
"logits/chosen": -0.9375956654548645,
"logits/rejected": -0.8574072122573853,
"logps/chosen": -0.2780763804912567,
"logps/rejected": -0.28224700689315796,
"loss": 3.1338,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -2.780763864517212,
"rewards/margins": 0.04170636087656021,
"rewards/rejected": -2.822470188140869,
"semantic_entropy": 0.7434889078140259,
"step": 50
},
{
"epoch": 0.12021857923497267,
"grad_norm": 34.29716426184461,
"learning_rate": 5.978260869565217e-07,
"logits/chosen": -0.9751367568969727,
"logits/rejected": -0.8606834411621094,
"logps/chosen": -0.2696499526500702,
"logps/rejected": -0.29947254061698914,
"loss": 3.0524,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.6964995861053467,
"rewards/margins": 0.2982260584831238,
"rewards/rejected": -2.9947257041931152,
"semantic_entropy": 0.7428679466247559,
"step": 55
},
{
"epoch": 0.13114754098360656,
"grad_norm": 32.36546820788893,
"learning_rate": 6.521739130434782e-07,
"logits/chosen": -1.0148303508758545,
"logits/rejected": -0.9685667157173157,
"logps/chosen": -0.25762075185775757,
"logps/rejected": -0.2997520864009857,
"loss": 3.0039,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.576206922531128,
"rewards/margins": 0.42131391167640686,
"rewards/rejected": -2.997521162033081,
"semantic_entropy": 0.7362821102142334,
"step": 60
},
{
"epoch": 0.14207650273224043,
"grad_norm": 47.86126164856308,
"learning_rate": 7.065217391304348e-07,
"logits/chosen": -1.002937912940979,
"logits/rejected": -0.9363768696784973,
"logps/chosen": -0.2962821125984192,
"logps/rejected": -0.3176509141921997,
"loss": 3.0992,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.9628207683563232,
"rewards/margins": 0.2136881798505783,
"rewards/rejected": -3.176509141921997,
"semantic_entropy": 0.7823900580406189,
"step": 65
},
{
"epoch": 0.15300546448087432,
"grad_norm": 83.46398772579433,
"learning_rate": 7.608695652173913e-07,
"logits/chosen": -0.9694533348083496,
"logits/rejected": -0.9480490684509277,
"logps/chosen": -0.2837492823600769,
"logps/rejected": -0.3052641451358795,
"loss": 3.0367,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.8374929428100586,
"rewards/margins": 0.2151484489440918,
"rewards/rejected": -3.0526413917541504,
"semantic_entropy": 0.7394664883613586,
"step": 70
},
{
"epoch": 0.16393442622950818,
"grad_norm": 35.83270782611293,
"learning_rate": 8.152173913043478e-07,
"logits/chosen": -0.9647692441940308,
"logits/rejected": -0.9482067227363586,
"logps/chosen": -0.2907211184501648,
"logps/rejected": -0.33229631185531616,
"loss": 3.0658,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.9072110652923584,
"rewards/margins": 0.41575226187705994,
"rewards/rejected": -3.322962999343872,
"semantic_entropy": 0.7694975733757019,
"step": 75
},
{
"epoch": 0.17486338797814208,
"grad_norm": 52.413564512749005,
"learning_rate": 8.695652173913043e-07,
"logits/chosen": -0.9714950323104858,
"logits/rejected": -0.9107065200805664,
"logps/chosen": -0.2882896065711975,
"logps/rejected": -0.3103812336921692,
"loss": 3.0244,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.8828964233398438,
"rewards/margins": 0.22091606259346008,
"rewards/rejected": -3.1038122177124023,
"semantic_entropy": 0.7423045039176941,
"step": 80
},
{
"epoch": 0.18579234972677597,
"grad_norm": 57.128124235325,
"learning_rate": 9.239130434782608e-07,
"logits/chosen": -0.9738727807998657,
"logits/rejected": -0.9262188076972961,
"logps/chosen": -0.29303327202796936,
"logps/rejected": -0.337748646736145,
"loss": 3.0267,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.9303324222564697,
"rewards/margins": 0.44715413451194763,
"rewards/rejected": -3.37748646736145,
"semantic_entropy": 0.7571176290512085,
"step": 85
},
{
"epoch": 0.19672131147540983,
"grad_norm": 39.74743242931724,
"learning_rate": 9.782608695652173e-07,
"logits/chosen": -1.046452283859253,
"logits/rejected": -0.9666553735733032,
"logps/chosen": -0.31861579418182373,
"logps/rejected": -0.34951895475387573,
"loss": 3.0463,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -3.186157703399658,
"rewards/margins": 0.3090316653251648,
"rewards/rejected": -3.495189666748047,
"semantic_entropy": 0.8055832982063293,
"step": 90
},
{
"epoch": 0.20765027322404372,
"grad_norm": 51.89832814789265,
"learning_rate": 9.999671349822886e-07,
"logits/chosen": -0.9848623275756836,
"logits/rejected": -0.9856392741203308,
"logps/chosen": -0.31298893690109253,
"logps/rejected": -0.3401663601398468,
"loss": 2.9541,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -3.1298892498016357,
"rewards/margins": 0.2717742323875427,
"rewards/rejected": -3.4016640186309814,
"semantic_entropy": 0.7869037389755249,
"step": 95
},
{
"epoch": 0.2185792349726776,
"grad_norm": 69.97139505648609,
"learning_rate": 9.997663088532014e-07,
"logits/chosen": -0.9892705678939819,
"logits/rejected": -0.943418025970459,
"logps/chosen": -0.35917508602142334,
"logps/rejected": -0.4198976159095764,
"loss": 2.9725,
"rewards/accuracies": 0.5625,
"rewards/chosen": -3.5917506217956543,
"rewards/margins": 0.6072250008583069,
"rewards/rejected": -4.198975563049316,
"semantic_entropy": 0.834593653678894,
"step": 100
},
{
"epoch": 0.22950819672131148,
"grad_norm": 43.67519297008509,
"learning_rate": 9.9938298818292e-07,
"logits/chosen": -1.0403445959091187,
"logits/rejected": -1.0104751586914062,
"logps/chosen": -0.32551589608192444,
"logps/rejected": -0.38466745615005493,
"loss": 2.9376,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -3.2551589012145996,
"rewards/margins": 0.5915151834487915,
"rewards/rejected": -3.8466744422912598,
"semantic_entropy": 0.8123003840446472,
"step": 105
},
{
"epoch": 0.24043715846994534,
"grad_norm": 54.4822346164963,
"learning_rate": 9.98817312944725e-07,
"logits/chosen": -1.0293775796890259,
"logits/rejected": -1.0085766315460205,
"logps/chosen": -0.34657078981399536,
"logps/rejected": -0.44877204298973083,
"loss": 2.9452,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -3.465707778930664,
"rewards/margins": 1.0220123529434204,
"rewards/rejected": -4.487720012664795,
"semantic_entropy": 0.8509441614151001,
"step": 110
},
{
"epoch": 0.25136612021857924,
"grad_norm": 53.517455700291855,
"learning_rate": 9.98069489700446e-07,
"logits/chosen": -1.0341802835464478,
"logits/rejected": -0.9952918887138367,
"logps/chosen": -0.3461839258670807,
"logps/rejected": -0.4705514907836914,
"loss": 2.8994,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -3.4618396759033203,
"rewards/margins": 1.2436755895614624,
"rewards/rejected": -4.705514907836914,
"semantic_entropy": 0.8380171656608582,
"step": 115
},
{
"epoch": 0.26229508196721313,
"grad_norm": 40.64481855809536,
"learning_rate": 9.971397915250336e-07,
"logits/chosen": -1.0739099979400635,
"logits/rejected": -1.0038702487945557,
"logps/chosen": -0.3547818958759308,
"logps/rejected": -0.4196414053440094,
"loss": 2.8774,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -3.547818660736084,
"rewards/margins": 0.6485950350761414,
"rewards/rejected": -4.196413993835449,
"semantic_entropy": 0.8623871803283691,
"step": 120
},
{
"epoch": 0.273224043715847,
"grad_norm": 144.95477211017723,
"learning_rate": 9.960285579068417e-07,
"logits/chosen": -0.9688740968704224,
"logits/rejected": -0.9354850053787231,
"logps/chosen": -0.383869469165802,
"logps/rejected": -0.47563114762306213,
"loss": 2.8716,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -3.8386943340301514,
"rewards/margins": 0.9176166653633118,
"rewards/rejected": -4.756311416625977,
"semantic_entropy": 0.8745672106742859,
"step": 125
},
{
"epoch": 0.28415300546448086,
"grad_norm": 47.745102969069876,
"learning_rate": 9.94736194623663e-07,
"logits/chosen": -0.9936184883117676,
"logits/rejected": -0.9872056841850281,
"logps/chosen": -0.4027808606624603,
"logps/rejected": -0.5585031509399414,
"loss": 2.8889,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -4.02780818939209,
"rewards/margins": 1.5572230815887451,
"rewards/rejected": -5.585031032562256,
"semantic_entropy": 0.8549701571464539,
"step": 130
},
{
"epoch": 0.29508196721311475,
"grad_norm": 43.036244798527335,
"learning_rate": 9.932631735945526e-07,
"logits/chosen": -1.018587350845337,
"logits/rejected": -0.9396653175354004,
"logps/chosen": -0.3934100568294525,
"logps/rejected": -0.5400375127792358,
"loss": 2.8008,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -3.934100389480591,
"rewards/margins": 1.4662750959396362,
"rewards/rejected": -5.4003753662109375,
"semantic_entropy": 0.8907697796821594,
"step": 135
},
{
"epoch": 0.30601092896174864,
"grad_norm": 51.334063125222045,
"learning_rate": 9.916100327075037e-07,
"logits/chosen": -1.0269070863723755,
"logits/rejected": -0.9736196398735046,
"logps/chosen": -0.43043556809425354,
"logps/rejected": -0.6303533911705017,
"loss": 2.5701,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -4.304355621337891,
"rewards/margins": 1.9991786479949951,
"rewards/rejected": -6.303534507751465,
"semantic_entropy": 0.9288080930709839,
"step": 140
},
{
"epoch": 0.31693989071038253,
"grad_norm": 69.99654341210723,
"learning_rate": 9.89777375623032e-07,
"logits/chosen": -0.9977472424507141,
"logits/rejected": -0.9811614751815796,
"logps/chosen": -0.44030895829200745,
"logps/rejected": -0.5321138501167297,
"loss": 2.7244,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -4.4030890464782715,
"rewards/margins": 0.9180487394332886,
"rewards/rejected": -5.321138381958008,
"semantic_entropy": 0.932425856590271,
"step": 145
},
{
"epoch": 0.32786885245901637,
"grad_norm": 51.74709430626173,
"learning_rate": 9.877658715537428e-07,
"logits/chosen": -1.0553128719329834,
"logits/rejected": -1.0262110233306885,
"logps/chosen": -0.5291231870651245,
"logps/rejected": -0.7928577661514282,
"loss": 2.6042,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -5.291232109069824,
"rewards/margins": 2.637345552444458,
"rewards/rejected": -7.928577423095703,
"semantic_entropy": 0.9483098983764648,
"step": 150
},
{
"epoch": 0.33879781420765026,
"grad_norm": 59.40984432787828,
"learning_rate": 9.85576255019963e-07,
"logits/chosen": -1.0320864915847778,
"logits/rejected": -0.9819043278694153,
"logps/chosen": -0.5477417707443237,
"logps/rejected": -0.7481231093406677,
"loss": 2.5957,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -5.477417945861816,
"rewards/margins": 2.0038137435913086,
"rewards/rejected": -7.481231689453125,
"semantic_entropy": 0.9526890516281128,
"step": 155
},
{
"epoch": 0.34972677595628415,
"grad_norm": 63.33344115210913,
"learning_rate": 9.832093255815216e-07,
"logits/chosen": -1.0814168453216553,
"logits/rejected": -1.0304033756256104,
"logps/chosen": -0.6954716444015503,
"logps/rejected": -0.8502774238586426,
"loss": 2.6238,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -6.954716682434082,
"rewards/margins": 1.5480577945709229,
"rewards/rejected": -8.502774238586426,
"semantic_entropy": 0.9549511671066284,
"step": 160
},
{
"epoch": 0.36065573770491804,
"grad_norm": 62.82535328280916,
"learning_rate": 9.806659475457849e-07,
"logits/chosen": -1.0839955806732178,
"logits/rejected": -1.031585931777954,
"logps/chosen": -0.7121194005012512,
"logps/rejected": -0.8951581716537476,
"loss": 2.5445,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -7.121194362640381,
"rewards/margins": 1.8303883075714111,
"rewards/rejected": -8.951581954956055,
"semantic_entropy": 0.9896249771118164,
"step": 165
},
{
"epoch": 0.37158469945355194,
"grad_norm": 65.61173370500529,
"learning_rate": 9.779470496520441e-07,
"logits/chosen": -1.0843085050582886,
"logits/rejected": -1.0285215377807617,
"logps/chosen": -0.7273966670036316,
"logps/rejected": -0.9349418878555298,
"loss": 2.5832,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -7.2739667892456055,
"rewards/margins": 2.0754518508911133,
"rewards/rejected": -9.349418640136719,
"semantic_entropy": 0.9762886762619019,
"step": 170
},
{
"epoch": 0.3825136612021858,
"grad_norm": 48.476659698357665,
"learning_rate": 9.750536247323789e-07,
"logits/chosen": -1.1571153402328491,
"logits/rejected": -1.131704330444336,
"logps/chosen": -0.8265604972839355,
"logps/rejected": -0.9824529886245728,
"loss": 2.4619,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -8.265604019165039,
"rewards/margins": 1.5589253902435303,
"rewards/rejected": -9.824529647827148,
"semantic_entropy": 0.9426374435424805,
"step": 175
},
{
"epoch": 0.39344262295081966,
"grad_norm": 55.305744786201686,
"learning_rate": 9.719867293491144e-07,
"logits/chosen": -1.1452279090881348,
"logits/rejected": -1.1399190425872803,
"logps/chosen": -0.8152974843978882,
"logps/rejected": -1.16525137424469,
"loss": 2.3679,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -8.152975082397461,
"rewards/margins": 3.4995384216308594,
"rewards/rejected": -11.65251350402832,
"semantic_entropy": 0.9442623257637024,
"step": 180
},
{
"epoch": 0.40437158469945356,
"grad_norm": 50.733966507742444,
"learning_rate": 9.687474834090067e-07,
"logits/chosen": -1.1547253131866455,
"logits/rejected": -1.1736373901367188,
"logps/chosen": -0.8491543531417847,
"logps/rejected": -1.1844466924667358,
"loss": 2.3318,
"rewards/accuracies": 0.8125,
"rewards/chosen": -8.491543769836426,
"rewards/margins": 3.35292387008667,
"rewards/rejected": -11.844468116760254,
"semantic_entropy": 0.9556644558906555,
"step": 185
},
{
"epoch": 0.41530054644808745,
"grad_norm": 62.277237758824675,
"learning_rate": 9.653370697542987e-07,
"logits/chosen": -1.162003755569458,
"logits/rejected": -1.121468186378479,
"logps/chosen": -0.8294251561164856,
"logps/rejected": -1.1698486804962158,
"loss": 2.3649,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -8.294252395629883,
"rewards/margins": 3.4042346477508545,
"rewards/rejected": -11.698487281799316,
"semantic_entropy": 0.9534858465194702,
"step": 190
},
{
"epoch": 0.4262295081967213,
"grad_norm": 62.09032006268862,
"learning_rate": 9.617567337307935e-07,
"logits/chosen": -1.1882003545761108,
"logits/rejected": -1.1697113513946533,
"logps/chosen": -0.9817994236946106,
"logps/rejected": -1.3722710609436035,
"loss": 2.4013,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -9.817992210388184,
"rewards/margins": 3.904717206954956,
"rewards/rejected": -13.722711563110352,
"semantic_entropy": 0.9071667790412903,
"step": 195
},
{
"epoch": 0.4371584699453552,
"grad_norm": 54.79531932098139,
"learning_rate": 9.580077827331037e-07,
"logits/chosen": -1.160315990447998,
"logits/rejected": -1.0766620635986328,
"logps/chosen": -0.8970209360122681,
"logps/rejected": -1.2237987518310547,
"loss": 2.3542,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -8.970209121704102,
"rewards/margins": 3.267777919769287,
"rewards/rejected": -12.237987518310547,
"semantic_entropy": 0.9425733685493469,
"step": 200
},
{
"epoch": 0.44808743169398907,
"grad_norm": 45.857415331803075,
"learning_rate": 9.540915857272445e-07,
"logits/chosen": -1.120792269706726,
"logits/rejected": -1.1374807357788086,
"logps/chosen": -0.7932685017585754,
"logps/rejected": -1.1045658588409424,
"loss": 2.2801,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -7.932684898376465,
"rewards/margins": 3.112973690032959,
"rewards/rejected": -11.045658111572266,
"semantic_entropy": 0.9677651524543762,
"step": 205
},
{
"epoch": 0.45901639344262296,
"grad_norm": 71.89691225680161,
"learning_rate": 9.500095727507419e-07,
"logits/chosen": -1.1540464162826538,
"logits/rejected": -1.1580009460449219,
"logps/chosen": -0.8536632657051086,
"logps/rejected": -1.2688827514648438,
"loss": 2.1643,
"rewards/accuracies": 0.8125,
"rewards/chosen": -8.536632537841797,
"rewards/margins": 4.152195453643799,
"rewards/rejected": -12.688827514648438,
"semantic_entropy": 0.9133696556091309,
"step": 210
},
{
"epoch": 0.46994535519125685,
"grad_norm": 61.886918880598415,
"learning_rate": 9.457632343904402e-07,
"logits/chosen": -1.1507601737976074,
"logits/rejected": -1.0994901657104492,
"logps/chosen": -0.891444981098175,
"logps/rejected": -1.3195106983184814,
"loss": 2.2496,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -8.914449691772461,
"rewards/margins": 4.2806572914123535,
"rewards/rejected": -13.195106506347656,
"semantic_entropy": 0.943720817565918,
"step": 215
},
{
"epoch": 0.4808743169398907,
"grad_norm": 48.894845818998725,
"learning_rate": 9.413541212382004e-07,
"logits/chosen": -1.2136586904525757,
"logits/rejected": -1.1905956268310547,
"logps/chosen": -0.9255884289741516,
"logps/rejected": -1.2389224767684937,
"loss": 2.2122,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -9.255884170532227,
"rewards/margins": 3.133340358734131,
"rewards/rejected": -12.389223098754883,
"semantic_entropy": 0.9290882349014282,
"step": 220
},
{
"epoch": 0.4918032786885246,
"grad_norm": 53.07969298601074,
"learning_rate": 9.367838433246857e-07,
"logits/chosen": -1.2239024639129639,
"logits/rejected": -1.1851261854171753,
"logps/chosen": -0.8761332631111145,
"logps/rejected": -1.2777061462402344,
"loss": 2.1765,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -8.761332511901855,
"rewards/margins": 4.0157294273376465,
"rewards/rejected": -12.777061462402344,
"semantic_entropy": 0.9319503903388977,
"step": 225
},
{
"epoch": 0.5027322404371585,
"grad_norm": 51.09299897041373,
"learning_rate": 9.320540695314438e-07,
"logits/chosen": -1.1558756828308105,
"logits/rejected": -1.1598188877105713,
"logps/chosen": -0.8811947703361511,
"logps/rejected": -1.2912404537200928,
"loss": 2.2098,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -8.811946868896484,
"rewards/margins": 4.100456714630127,
"rewards/rejected": -12.91240406036377,
"semantic_entropy": 0.9310176968574524,
"step": 230
},
{
"epoch": 0.5136612021857924,
"grad_norm": 63.259306143827835,
"learning_rate": 9.271665269814983e-07,
"logits/chosen": -1.188391923904419,
"logits/rejected": -1.1512023210525513,
"logps/chosen": -0.8918437957763672,
"logps/rejected": -1.2489241361618042,
"loss": 2.1333,
"rewards/accuracies": 0.84375,
"rewards/chosen": -8.918437957763672,
"rewards/margins": 3.5708038806915283,
"rewards/rejected": -12.489240646362305,
"semantic_entropy": 0.9315102696418762,
"step": 235
},
{
"epoch": 0.5245901639344263,
"grad_norm": 55.527350378129,
"learning_rate": 9.221230004086721e-07,
"logits/chosen": -1.2678356170654297,
"logits/rejected": -1.2772780656814575,
"logps/chosen": -0.8592067956924438,
"logps/rejected": -1.3196837902069092,
"loss": 2.0237,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -8.592068672180176,
"rewards/margins": 4.604770660400391,
"rewards/rejected": -13.19683837890625,
"semantic_entropy": 0.9410519599914551,
"step": 240
},
{
"epoch": 0.5355191256830601,
"grad_norm": 46.907821708328406,
"learning_rate": 9.169253315058763e-07,
"logits/chosen": -1.1692125797271729,
"logits/rejected": -1.125632405281067,
"logps/chosen": -0.905608057975769,
"logps/rejected": -1.3867673873901367,
"loss": 2.1096,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -9.056081771850586,
"rewards/margins": 4.811593055725098,
"rewards/rejected": -13.867673873901367,
"semantic_entropy": 0.921157717704773,
"step": 245
},
{
"epoch": 0.546448087431694,
"grad_norm": 47.861862507896085,
"learning_rate": 9.11575418252596e-07,
"logits/chosen": -1.232251763343811,
"logits/rejected": -1.1941629648208618,
"logps/chosen": -0.8441025614738464,
"logps/rejected": -1.2240302562713623,
"loss": 2.1618,
"rewards/accuracies": 0.78125,
"rewards/chosen": -8.441025733947754,
"rewards/margins": 3.799276828765869,
"rewards/rejected": -12.240303039550781,
"semantic_entropy": 0.9252967834472656,
"step": 250
},
{
"epoch": 0.5573770491803278,
"grad_norm": 54.40105116628037,
"learning_rate": 9.060752142218257e-07,
"logits/chosen": -1.213555932044983,
"logits/rejected": -1.1773382425308228,
"logps/chosen": -0.8959819078445435,
"logps/rejected": -1.3679741621017456,
"loss": 2.0365,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -8.959817886352539,
"rewards/margins": 4.71992301940918,
"rewards/rejected": -13.679742813110352,
"semantic_entropy": 0.9322195053100586,
"step": 255
},
{
"epoch": 0.5683060109289617,
"grad_norm": 43.449537508765815,
"learning_rate": 9.004267278667031e-07,
"logits/chosen": -1.1810890436172485,
"logits/rejected": -1.1702289581298828,
"logps/chosen": -0.8510452508926392,
"logps/rejected": -1.3418259620666504,
"loss": 2.011,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -8.510452270507812,
"rewards/margins": 4.907806873321533,
"rewards/rejected": -13.41826057434082,
"semantic_entropy": 0.9143549203872681,
"step": 260
},
{
"epoch": 0.5792349726775956,
"grad_norm": 46.818755419193465,
"learning_rate": 8.946320217871025e-07,
"logits/chosen": -1.1749790906906128,
"logits/rejected": -1.1358766555786133,
"logps/chosen": -0.855148434638977,
"logps/rejected": -1.3291784524917603,
"loss": 1.9976,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -8.551485061645508,
"rewards/margins": 4.740299224853516,
"rewards/rejected": -13.291783332824707,
"semantic_entropy": 0.9298276901245117,
"step": 265
},
{
"epoch": 0.5901639344262295,
"grad_norm": 95.7954675499385,
"learning_rate": 8.886932119764565e-07,
"logits/chosen": -1.1698591709136963,
"logits/rejected": -1.1438281536102295,
"logps/chosen": -0.8544471859931946,
"logps/rejected": -1.377416968345642,
"loss": 1.9774,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -8.544472694396973,
"rewards/margins": 5.229698657989502,
"rewards/rejected": -13.774169921875,
"semantic_entropy": 0.9152740240097046,
"step": 270
},
{
"epoch": 0.6010928961748634,
"grad_norm": 52.4740552697882,
"learning_rate": 8.826124670490802e-07,
"logits/chosen": -1.140944242477417,
"logits/rejected": -1.0730197429656982,
"logps/chosen": -0.8467117547988892,
"logps/rejected": -1.2126039266586304,
"loss": 1.9796,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -8.467116355895996,
"rewards/margins": 3.658921718597412,
"rewards/rejected": -12.126038551330566,
"semantic_entropy": 0.933331310749054,
"step": 275
},
{
"epoch": 0.6120218579234973,
"grad_norm": 54.704164373442616,
"learning_rate": 8.763920074482809e-07,
"logits/chosen": -1.102807879447937,
"logits/rejected": -1.105039358139038,
"logps/chosen": -0.8896454572677612,
"logps/rejected": -1.4699008464813232,
"loss": 1.9808,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -8.896454811096191,
"rewards/margins": 5.802553176879883,
"rewards/rejected": -14.699007987976074,
"semantic_entropy": 0.8732292056083679,
"step": 280
},
{
"epoch": 0.6229508196721312,
"grad_norm": 48.33002211252601,
"learning_rate": 8.700341046355411e-07,
"logits/chosen": -1.2859059572219849,
"logits/rejected": -1.2477091550827026,
"logps/chosen": -0.8521019220352173,
"logps/rejected": -1.4364469051361084,
"loss": 1.8954,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -8.52101993560791,
"rewards/margins": 5.843448162078857,
"rewards/rejected": -14.364468574523926,
"semantic_entropy": 0.9044594764709473,
"step": 285
},
{
"epoch": 0.6338797814207651,
"grad_norm": 62.5830858895554,
"learning_rate": 8.635410802610723e-07,
"logits/chosen": -1.2080810070037842,
"logits/rejected": -1.1687798500061035,
"logps/chosen": -0.8889066576957703,
"logps/rejected": -1.4597949981689453,
"loss": 1.9215,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -8.889066696166992,
"rewards/margins": 5.708883762359619,
"rewards/rejected": -14.59795093536377,
"semantic_entropy": 0.903703510761261,
"step": 290
},
{
"epoch": 0.644808743169399,
"grad_norm": 52.105468651247094,
"learning_rate": 8.569153053160428e-07,
"logits/chosen": -1.1924866437911987,
"logits/rejected": -1.182565689086914,
"logps/chosen": -0.9297744035720825,
"logps/rejected": -1.5572900772094727,
"loss": 1.8847,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -9.29774284362793,
"rewards/margins": 6.2751569747924805,
"rewards/rejected": -15.572900772094727,
"semantic_entropy": 0.8886201977729797,
"step": 295
},
{
"epoch": 0.6557377049180327,
"grad_norm": 43.97404227922028,
"learning_rate": 8.501591992667849e-07,
"logits/chosen": -1.2417964935302734,
"logits/rejected": -1.2167500257492065,
"logps/chosen": -0.9788614511489868,
"logps/rejected": -1.5977232456207275,
"loss": 1.9048,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -9.788614273071289,
"rewards/margins": 6.188617706298828,
"rewards/rejected": -15.977231979370117,
"semantic_entropy": 0.8578527569770813,
"step": 300
},
{
"epoch": 0.6666666666666666,
"grad_norm": 62.519761231188205,
"learning_rate": 8.432752291713058e-07,
"logits/chosen": -1.227373719215393,
"logits/rejected": -1.1630009412765503,
"logps/chosen": -0.9313735961914062,
"logps/rejected": -1.6220667362213135,
"loss": 1.876,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -9.313735008239746,
"rewards/margins": 6.906930446624756,
"rewards/rejected": -16.220666885375977,
"semantic_entropy": 0.8703945875167847,
"step": 305
},
{
"epoch": 0.6775956284153005,
"grad_norm": 47.74676931324823,
"learning_rate": 8.362659087784152e-07,
"logits/chosen": -1.1420575380325317,
"logits/rejected": -1.1442222595214844,
"logps/chosen": -0.921275794506073,
"logps/rejected": -1.5964065790176392,
"loss": 1.9255,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -9.212759017944336,
"rewards/margins": 6.751306056976318,
"rewards/rejected": -15.964065551757812,
"semantic_entropy": 0.8867815732955933,
"step": 310
},
{
"epoch": 0.6885245901639344,
"grad_norm": 48.12633140725401,
"learning_rate": 8.291337976098067e-07,
"logits/chosen": -1.1699371337890625,
"logits/rejected": -1.1596167087554932,
"logps/chosen": -0.9925182461738586,
"logps/rejected": -1.4757254123687744,
"loss": 1.8872,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -9.925182342529297,
"rewards/margins": 4.832071781158447,
"rewards/rejected": -14.757253646850586,
"semantic_entropy": 0.8734658360481262,
"step": 315
},
{
"epoch": 0.6994535519125683,
"grad_norm": 47.1038569824555,
"learning_rate": 8.218815000254231e-07,
"logits/chosen": -1.2591969966888428,
"logits/rejected": -1.1927886009216309,
"logps/chosen": -0.8629493713378906,
"logps/rejected": -1.4769127368927002,
"loss": 1.8067,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -8.629494667053223,
"rewards/margins": 6.139632225036621,
"rewards/rejected": -14.769126892089844,
"semantic_entropy": 0.9108262062072754,
"step": 320
},
{
"epoch": 0.7103825136612022,
"grad_norm": 56.67465709928985,
"learning_rate": 8.145116642724485e-07,
"logits/chosen": -1.2181096076965332,
"logits/rejected": -1.189969778060913,
"logps/chosen": -0.8706620335578918,
"logps/rejected": -1.4245946407318115,
"loss": 1.8061,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -8.706620216369629,
"rewards/margins": 5.539328098297119,
"rewards/rejected": -14.245946884155273,
"semantic_entropy": 0.893680214881897,
"step": 325
},
{
"epoch": 0.7213114754098361,
"grad_norm": 43.692074758430785,
"learning_rate": 8.07026981518276e-07,
"logits/chosen": -1.1343576908111572,
"logits/rejected": -1.0772193670272827,
"logps/chosen": -0.8813779950141907,
"logps/rejected": -1.7738568782806396,
"loss": 1.7373,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -8.813779830932617,
"rewards/margins": 8.924787521362305,
"rewards/rejected": -17.73856544494629,
"semantic_entropy": 0.8537489771842957,
"step": 330
},
{
"epoch": 0.73224043715847,
"grad_norm": 54.41817403205364,
"learning_rate": 7.994301848678004e-07,
"logits/chosen": -1.134152889251709,
"logits/rejected": -1.063077449798584,
"logps/chosen": -0.9365140199661255,
"logps/rejected": -1.6991326808929443,
"loss": 1.766,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -9.365139961242676,
"rewards/margins": 7.626187324523926,
"rewards/rejected": -16.9913272857666,
"semantic_entropy": 0.8437296152114868,
"step": 335
},
{
"epoch": 0.7431693989071039,
"grad_norm": 56.714537939738605,
"learning_rate": 7.917240483654e-07,
"logits/chosen": -1.1386888027191162,
"logits/rejected": -1.0677882432937622,
"logps/chosen": -0.9699214100837708,
"logps/rejected": -1.7819700241088867,
"loss": 1.8199,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -9.699213981628418,
"rewards/margins": 8.12048625946045,
"rewards/rejected": -17.819698333740234,
"semantic_entropy": 0.8428508639335632,
"step": 340
},
{
"epoch": 0.7540983606557377,
"grad_norm": 54.15768742157569,
"learning_rate": 7.839113859819656e-07,
"logits/chosen": -1.2082730531692505,
"logits/rejected": -1.1757750511169434,
"logps/chosen": -1.0214024782180786,
"logps/rejected": -1.8994626998901367,
"loss": 1.8236,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -10.214024543762207,
"rewards/margins": 8.78060245513916,
"rewards/rejected": -18.994626998901367,
"semantic_entropy": 0.818555474281311,
"step": 345
},
{
"epoch": 0.7650273224043715,
"grad_norm": 52.04532684140525,
"learning_rate": 7.759950505873521e-07,
"logits/chosen": -1.2180219888687134,
"logits/rejected": -1.1834783554077148,
"logps/chosen": -0.7670449018478394,
"logps/rejected": -1.324202060699463,
"loss": 1.7353,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -7.670449256896973,
"rewards/margins": 5.571571350097656,
"rewards/rejected": -13.242021560668945,
"semantic_entropy": 0.9124476313591003,
"step": 350
},
{
"epoch": 0.7759562841530054,
"grad_norm": 52.511907795888796,
"learning_rate": 7.67977932908626e-07,
"logits/chosen": -1.175022840499878,
"logits/rejected": -1.1130549907684326,
"logps/chosen": -0.8713346719741821,
"logps/rejected": -1.66217839717865,
"loss": 1.726,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -8.713346481323242,
"rewards/margins": 7.908437252044678,
"rewards/rejected": -16.621784210205078,
"semantic_entropy": 0.8560686111450195,
"step": 355
},
{
"epoch": 0.7868852459016393,
"grad_norm": 47.66801579095495,
"learning_rate": 7.598629604744872e-07,
"logits/chosen": -1.1504714488983154,
"logits/rejected": -1.121519923210144,
"logps/chosen": -1.078308343887329,
"logps/rejected": -2.017784833908081,
"loss": 1.687,
"rewards/accuracies": 0.8125,
"rewards/chosen": -10.783082008361816,
"rewards/margins": 9.394767761230469,
"rewards/rejected": -20.17784881591797,
"semantic_entropy": 0.8011868596076965,
"step": 360
},
{
"epoch": 0.7978142076502732,
"grad_norm": 78.73352396462461,
"learning_rate": 7.516530965462539e-07,
"logits/chosen": -1.2399051189422607,
"logits/rejected": -1.2221591472625732,
"logps/chosen": -0.869607150554657,
"logps/rejected": -1.7532609701156616,
"loss": 1.6969,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -8.696072578430176,
"rewards/margins": 8.836538314819336,
"rewards/rejected": -17.532609939575195,
"semantic_entropy": 0.8715127110481262,
"step": 365
},
{
"epoch": 0.8087431693989071,
"grad_norm": 52.51768985735217,
"learning_rate": 7.433513390357989e-07,
"logits/chosen": -1.2507340908050537,
"logits/rejected": -1.187475562095642,
"logps/chosen": -0.9717696905136108,
"logps/rejected": -2.0153520107269287,
"loss": 1.6488,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -9.717697143554688,
"rewards/margins": 10.435824394226074,
"rewards/rejected": -20.153522491455078,
"semantic_entropy": 0.8269231915473938,
"step": 370
},
{
"epoch": 0.819672131147541,
"grad_norm": 50.10941942498599,
"learning_rate": 7.349607194108322e-07,
"logits/chosen": -1.2848598957061768,
"logits/rejected": -1.1889159679412842,
"logps/chosen": -0.8790639638900757,
"logps/rejected": -1.7771461009979248,
"loss": 1.6703,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -8.790639877319336,
"rewards/margins": 8.980820655822754,
"rewards/rejected": -17.771459579467773,
"semantic_entropy": 0.853074848651886,
"step": 375
},
{
"epoch": 0.8306010928961749,
"grad_norm": 45.566081133100745,
"learning_rate": 7.264843015879321e-07,
"logits/chosen": -1.1421478986740112,
"logits/rejected": -1.140625238418579,
"logps/chosen": -0.9042370915412903,
"logps/rejected": -1.7280666828155518,
"loss": 1.541,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -9.04237174987793,
"rewards/margins": 8.23829460144043,
"rewards/rejected": -17.280664443969727,
"semantic_entropy": 0.8745312690734863,
"step": 380
},
{
"epoch": 0.8415300546448088,
"grad_norm": 59.00085660352214,
"learning_rate": 7.17925180813725e-07,
"logits/chosen": -1.2217355966567993,
"logits/rejected": -1.159557580947876,
"logps/chosen": -1.042198657989502,
"logps/rejected": -2.1717679500579834,
"loss": 1.7473,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -10.42198657989502,
"rewards/margins": 11.295695304870605,
"rewards/rejected": -21.717683792114258,
"semantic_entropy": 0.8145696520805359,
"step": 385
},
{
"epoch": 0.8524590163934426,
"grad_norm": 68.24919118342267,
"learning_rate": 7.092864825346266e-07,
"logits/chosen": -1.2256710529327393,
"logits/rejected": -1.154592752456665,
"logps/chosen": -0.8894011378288269,
"logps/rejected": -2.0597283840179443,
"loss": 1.5906,
"rewards/accuracies": 0.875,
"rewards/chosen": -8.894010543823242,
"rewards/margins": 11.703274726867676,
"rewards/rejected": -20.597286224365234,
"semantic_entropy": 0.8356989026069641,
"step": 390
},
{
"epoch": 0.8633879781420765,
"grad_norm": 52.86840793380424,
"learning_rate": 7.005713612555545e-07,
"logits/chosen": -1.1973850727081299,
"logits/rejected": -1.15791654586792,
"logps/chosen": -0.9084303975105286,
"logps/rejected": -1.824072241783142,
"loss": 1.5811,
"rewards/accuracies": 0.84375,
"rewards/chosen": -9.084303855895996,
"rewards/margins": 9.156417846679688,
"rewards/rejected": -18.240720748901367,
"semantic_entropy": 0.863986611366272,
"step": 395
},
{
"epoch": 0.8743169398907104,
"grad_norm": 54.969346083508704,
"learning_rate": 6.917829993880302e-07,
"logits/chosen": -1.1350136995315552,
"logits/rejected": -1.078984022140503,
"logps/chosen": -0.9205960035324097,
"logps/rejected": -1.9763364791870117,
"loss": 1.5778,
"rewards/accuracies": 0.875,
"rewards/chosen": -9.205960273742676,
"rewards/margins": 10.557405471801758,
"rewards/rejected": -19.763364791870117,
"semantic_entropy": 0.8187274932861328,
"step": 400
},
{
"epoch": 0.8743169398907104,
"eval_logits/chosen": -1.5077557563781738,
"eval_logits/rejected": -1.432308554649353,
"eval_logps/chosen": -0.868651807308197,
"eval_logps/rejected": -1.8860282897949219,
"eval_loss": 1.6372781991958618,
"eval_rewards/accuracies": 0.8734939694404602,
"eval_rewards/chosen": -8.686517715454102,
"eval_rewards/margins": 10.173765182495117,
"eval_rewards/rejected": -18.86028289794922,
"eval_runtime": 37.7445,
"eval_samples_per_second": 34.919,
"eval_semantic_entropy": 0.8519198894500732,
"eval_steps_per_second": 2.199,
"step": 400
},
{
"epoch": 0.8852459016393442,
"grad_norm": 54.747379817385166,
"learning_rate": 6.8292460608809e-07,
"logits/chosen": -1.1865565776824951,
"logits/rejected": -1.0789119005203247,
"logps/chosen": -0.8656112551689148,
"logps/rejected": -1.9079488515853882,
"loss": 1.557,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -8.656112670898438,
"rewards/margins": 10.423376083374023,
"rewards/rejected": -19.07948875427246,
"semantic_entropy": 0.8483451008796692,
"step": 405
},
{
"epoch": 0.8961748633879781,
"grad_norm": 54.38709320329884,
"learning_rate": 6.739994160844309e-07,
"logits/chosen": -1.2001937627792358,
"logits/rejected": -1.2109323740005493,
"logps/chosen": -1.0198501348495483,
"logps/rejected": -2.304253101348877,
"loss": 1.5398,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -10.198502540588379,
"rewards/margins": 12.844027519226074,
"rewards/rejected": -23.042530059814453,
"semantic_entropy": 0.7884197235107422,
"step": 410
},
{
"epoch": 0.907103825136612,
"grad_norm": 58.8994587847891,
"learning_rate": 6.650106884972176e-07,
"logits/chosen": -1.2297394275665283,
"logits/rejected": -1.2055060863494873,
"logps/chosen": -0.8097732663154602,
"logps/rejected": -2.0647740364074707,
"loss": 1.6318,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -8.097734451293945,
"rewards/margins": 12.550005912780762,
"rewards/rejected": -20.647741317749023,
"semantic_entropy": 0.8577386736869812,
"step": 415
},
{
"epoch": 0.9180327868852459,
"grad_norm": 66.32923235150443,
"learning_rate": 6.559617056479827e-07,
"logits/chosen": -1.2397379875183105,
"logits/rejected": -1.1944515705108643,
"logps/chosen": -0.9744995832443237,
"logps/rejected": -2.2359464168548584,
"loss": 1.5364,
"rewards/accuracies": 0.84375,
"rewards/chosen": -9.7449951171875,
"rewards/margins": 12.614469528198242,
"rewards/rejected": -22.359464645385742,
"semantic_entropy": 0.8098868131637573,
"step": 420
},
{
"epoch": 0.9289617486338798,
"grad_norm": 64.90064469639756,
"learning_rate": 6.468557718610559e-07,
"logits/chosen": -1.2209162712097168,
"logits/rejected": -1.169478178024292,
"logps/chosen": -1.0786913633346558,
"logps/rejected": -2.5019688606262207,
"loss": 1.6058,
"rewards/accuracies": 0.84375,
"rewards/chosen": -10.786913871765137,
"rewards/margins": 14.232770919799805,
"rewards/rejected": -25.019685745239258,
"semantic_entropy": 0.7745442390441895,
"step": 425
},
{
"epoch": 0.9398907103825137,
"grad_norm": 65.90460986634548,
"learning_rate": 6.376962122569567e-07,
"logits/chosen": -1.1558514833450317,
"logits/rejected": -1.1550347805023193,
"logps/chosen": -0.6848023533821106,
"logps/rejected": -1.8477531671524048,
"loss": 1.3787,
"rewards/accuracies": 0.9375,
"rewards/chosen": -6.848023414611816,
"rewards/margins": 11.629508972167969,
"rewards/rejected": -18.4775333404541,
"semantic_entropy": 0.8978629112243652,
"step": 430
},
{
"epoch": 0.9508196721311475,
"grad_norm": 80.36478809238143,
"learning_rate": 6.284863715381948e-07,
"logits/chosen": -1.2516933679580688,
"logits/rejected": -1.2447582483291626,
"logps/chosen": -0.8717735409736633,
"logps/rejected": -2.2636890411376953,
"loss": 1.5367,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -8.717737197875977,
"rewards/margins": 13.919151306152344,
"rewards/rejected": -22.63688850402832,
"semantic_entropy": 0.8273345828056335,
"step": 435
},
{
"epoch": 0.9617486338797814,
"grad_norm": 79.39000046120883,
"learning_rate": 6.192296127679192e-07,
"logits/chosen": -1.1874706745147705,
"logits/rejected": -1.1192582845687866,
"logps/chosen": -0.9044081568717957,
"logps/rejected": -2.0115015506744385,
"loss": 1.5428,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -9.04408073425293,
"rewards/margins": 11.070935249328613,
"rewards/rejected": -20.11501693725586,
"semantic_entropy": 0.8257206082344055,
"step": 440
},
{
"epoch": 0.9726775956284153,
"grad_norm": 59.45278594899511,
"learning_rate": 6.099293161418629e-07,
"logits/chosen": -1.2240984439849854,
"logits/rejected": -1.18662428855896,
"logps/chosen": -0.6975774168968201,
"logps/rejected": -1.919647216796875,
"loss": 1.5818,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -6.97577428817749,
"rewards/margins": 12.220699310302734,
"rewards/rejected": -19.196474075317383,
"semantic_entropy": 0.887184739112854,
"step": 445
},
{
"epoch": 0.9836065573770492,
"grad_norm": 53.56869631451961,
"learning_rate": 6.005888777540319e-07,
"logits/chosen": -1.1677896976470947,
"logits/rejected": -1.1477397680282593,
"logps/chosen": -0.8627035021781921,
"logps/rejected": -1.9724452495574951,
"loss": 1.5352,
"rewards/accuracies": 0.875,
"rewards/chosen": -8.627036094665527,
"rewards/margins": 11.097416877746582,
"rewards/rejected": -19.72445297241211,
"semantic_entropy": 0.8503534197807312,
"step": 450
},
{
"epoch": 0.994535519125683,
"grad_norm": 75.11227313091236,
"learning_rate": 5.912117083565873e-07,
"logits/chosen": -1.1938502788543701,
"logits/rejected": -1.1654444932937622,
"logps/chosen": -1.1713725328445435,
"logps/rejected": -2.3690249919891357,
"loss": 1.5941,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -11.713726043701172,
"rewards/margins": 11.976524353027344,
"rewards/rejected": -23.690250396728516,
"semantic_entropy": 0.7848092913627625,
"step": 455
},
{
"epoch": 1.005464480874317,
"grad_norm": 48.44167575969943,
"learning_rate": 5.818012321143773e-07,
"logits/chosen": -1.2322055101394653,
"logits/rejected": -1.1756855249404907,
"logps/chosen": -0.8835703730583191,
"logps/rejected": -2.2671618461608887,
"loss": 1.3987,
"rewards/accuracies": 0.875,
"rewards/chosen": -8.835702896118164,
"rewards/margins": 13.835916519165039,
"rewards/rejected": -22.671619415283203,
"semantic_entropy": 0.8247418403625488,
"step": 460
},
{
"epoch": 1.0163934426229508,
"grad_norm": 47.683623908009125,
"learning_rate": 5.723608853545684e-07,
"logits/chosen": -1.2683448791503906,
"logits/rejected": -1.2093217372894287,
"logps/chosen": -0.8307113647460938,
"logps/rejected": -2.3884284496307373,
"loss": 1.1472,
"rewards/accuracies": 0.9375,
"rewards/chosen": -8.307112693786621,
"rewards/margins": 15.577173233032227,
"rewards/rejected": -23.88428497314453,
"semantic_entropy": 0.8331409692764282,
"step": 465
},
{
"epoch": 1.0273224043715847,
"grad_norm": 57.239399331005785,
"learning_rate": 5.628941153118388e-07,
"logits/chosen": -1.2552951574325562,
"logits/rejected": -1.2222687005996704,
"logps/chosen": -0.8629674911499023,
"logps/rejected": -2.325558662414551,
"loss": 1.1426,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -8.629674911499023,
"rewards/margins": 14.625910758972168,
"rewards/rejected": -23.25558853149414,
"semantic_entropy": 0.8217577934265137,
"step": 470
},
{
"epoch": 1.0382513661202186,
"grad_norm": 39.48804343487935,
"learning_rate": 5.534043788695852e-07,
"logits/chosen": -1.22693932056427,
"logits/rejected": -1.1497706174850464,
"logps/chosen": -0.7519802451133728,
"logps/rejected": -2.1450114250183105,
"loss": 1.0975,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.519803047180176,
"rewards/margins": 13.930310249328613,
"rewards/rejected": -21.450115203857422,
"semantic_entropy": 0.8537012338638306,
"step": 475
},
{
"epoch": 1.0491803278688525,
"grad_norm": 37.024988536485964,
"learning_rate": 5.438951412976098e-07,
"logits/chosen": -1.3238413333892822,
"logits/rejected": -1.2577579021453857,
"logps/chosen": -0.7658538818359375,
"logps/rejected": -2.0598320960998535,
"loss": 1.1533,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -7.658538818359375,
"rewards/margins": 12.939779281616211,
"rewards/rejected": -20.598318099975586,
"semantic_entropy": 0.8649771809577942,
"step": 480
},
{
"epoch": 1.0601092896174864,
"grad_norm": 42.1526889978167,
"learning_rate": 5.34369874986742e-07,
"logits/chosen": -1.2668297290802002,
"logits/rejected": -1.1939513683319092,
"logps/chosen": -0.8974517583847046,
"logps/rejected": -2.424004077911377,
"loss": 1.0247,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -8.974517822265625,
"rewards/margins": 15.265522956848145,
"rewards/rejected": -24.24004364013672,
"semantic_entropy": 0.7897659540176392,
"step": 485
},
{
"epoch": 1.0710382513661203,
"grad_norm": 52.525378226092165,
"learning_rate": 5.248320581808619e-07,
"logits/chosen": -1.2010338306427002,
"logits/rejected": -1.1409817934036255,
"logps/chosen": -0.7397095561027527,
"logps/rejected": -2.3880066871643066,
"loss": 1.1343,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.397095680236816,
"rewards/margins": 16.48297119140625,
"rewards/rejected": -23.88006591796875,
"semantic_entropy": 0.8509289026260376,
"step": 490
},
{
"epoch": 1.0819672131147542,
"grad_norm": 57.24209028140043,
"learning_rate": 5.15285173706785e-07,
"logits/chosen": -1.2966060638427734,
"logits/rejected": -1.2440364360809326,
"logps/chosen": -0.7074769139289856,
"logps/rejected": -2.2080492973327637,
"loss": 1.104,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -7.074769496917725,
"rewards/margins": 15.00572681427002,
"rewards/rejected": -22.080495834350586,
"semantic_entropy": 0.862097442150116,
"step": 495
},
{
"epoch": 1.092896174863388,
"grad_norm": 60.20969441966712,
"learning_rate": 5.057327077024744e-07,
"logits/chosen": -1.31562340259552,
"logits/rejected": -1.2055505514144897,
"logps/chosen": -0.7696375846862793,
"logps/rejected": -2.1600234508514404,
"loss": 1.0776,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.696375370025635,
"rewards/margins": 13.903857231140137,
"rewards/rejected": -21.600234985351562,
"semantic_entropy": 0.8503168821334839,
"step": 500
},
{
"epoch": 1.1038251366120218,
"grad_norm": 39.37970422474807,
"learning_rate": 4.961781483440433e-07,
"logits/chosen": -1.2652629613876343,
"logits/rejected": -1.155110239982605,
"logps/chosen": -0.7121917009353638,
"logps/rejected": -2.2156224250793457,
"loss": 1.0684,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.1219162940979,
"rewards/margins": 15.034309387207031,
"rewards/rejected": -22.156227111816406,
"semantic_entropy": 0.856345534324646,
"step": 505
},
{
"epoch": 1.1147540983606556,
"grad_norm": 53.63055077579748,
"learning_rate": 4.866249845720132e-07,
"logits/chosen": -1.2122000455856323,
"logits/rejected": -1.1381186246871948,
"logps/chosen": -0.7895854115486145,
"logps/rejected": -2.1967644691467285,
"loss": 1.1991,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -7.8958539962768555,
"rewards/margins": 14.071792602539062,
"rewards/rejected": -21.96764373779297,
"semantic_entropy": 0.8369362950325012,
"step": 510
},
{
"epoch": 1.1256830601092895,
"grad_norm": 45.3883880528581,
"learning_rate": 4.770767048172948e-07,
"logits/chosen": -1.2122347354888916,
"logits/rejected": -1.149927020072937,
"logps/chosen": -0.7574501633644104,
"logps/rejected": -2.262672185897827,
"loss": 1.0855,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -7.574501037597656,
"rewards/margins": 15.052220344543457,
"rewards/rejected": -22.62672233581543,
"semantic_entropy": 0.8394317626953125,
"step": 515
},
{
"epoch": 1.1366120218579234,
"grad_norm": 40.766203312385706,
"learning_rate": 4.675367957273505e-07,
"logits/chosen": -1.2204854488372803,
"logits/rejected": -1.144971251487732,
"logps/chosen": -0.7849557995796204,
"logps/rejected": -2.2667272090911865,
"loss": 1.0264,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.849558353424072,
"rewards/margins": 14.817715644836426,
"rewards/rejected": -22.66727066040039,
"semantic_entropy": 0.8283472061157227,
"step": 520
},
{
"epoch": 1.1475409836065573,
"grad_norm": 42.8963401742162,
"learning_rate": 4.5800874089301455e-07,
"logits/chosen": -1.261281132698059,
"logits/rejected": -1.1677086353302002,
"logps/chosen": -0.7403801679611206,
"logps/rejected": -2.290158987045288,
"loss": 0.9619,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.403802394866943,
"rewards/margins": 15.497787475585938,
"rewards/rejected": -22.901592254638672,
"semantic_entropy": 0.8431955575942993,
"step": 525
},
{
"epoch": 1.1584699453551912,
"grad_norm": 57.97538998117191,
"learning_rate": 4.4849601957642285e-07,
"logits/chosen": -1.174661636352539,
"logits/rejected": -1.115818738937378,
"logps/chosen": -0.7541646361351013,
"logps/rejected": -2.2110159397125244,
"loss": 1.0935,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -7.5416460037231445,
"rewards/margins": 14.568511962890625,
"rewards/rejected": -22.110157012939453,
"semantic_entropy": 0.853602409362793,
"step": 530
},
{
"epoch": 1.169398907103825,
"grad_norm": 56.26607000357583,
"learning_rate": 4.390021054405286e-07,
"logits/chosen": -1.240636944770813,
"logits/rejected": -1.1869792938232422,
"logps/chosen": -0.7534674406051636,
"logps/rejected": -2.2876932621002197,
"loss": 0.9657,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.534674167633057,
"rewards/margins": 15.342257499694824,
"rewards/rejected": -22.87693214416504,
"semantic_entropy": 0.8402601480484009,
"step": 535
},
{
"epoch": 1.180327868852459,
"grad_norm": 54.25638397035917,
"learning_rate": 4.295304652806592e-07,
"logits/chosen": -1.2079153060913086,
"logits/rejected": -1.142287015914917,
"logps/chosen": -0.611890971660614,
"logps/rejected": -2.0176615715026855,
"loss": 1.0051,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.11890983581543,
"rewards/margins": 14.057706832885742,
"rewards/rejected": -20.17661476135254,
"semantic_entropy": 0.8606586456298828,
"step": 540
},
{
"epoch": 1.1912568306010929,
"grad_norm": 44.34686440564056,
"learning_rate": 4.200845577585826e-07,
"logits/chosen": -1.2312743663787842,
"logits/rejected": -1.1274607181549072,
"logps/chosen": -0.6904948353767395,
"logps/rejected": -2.0026180744171143,
"loss": 1.0628,
"rewards/accuracies": 0.9375,
"rewards/chosen": -6.9049482345581055,
"rewards/margins": 13.121232986450195,
"rewards/rejected": -20.026180267333984,
"semantic_entropy": 0.839868426322937,
"step": 545
},
{
"epoch": 1.2021857923497268,
"grad_norm": 51.975486510086114,
"learning_rate": 4.106678321395433e-07,
"logits/chosen": -1.1899176836013794,
"logits/rejected": -1.1200889348983765,
"logps/chosen": -0.7009586095809937,
"logps/rejected": -2.399099826812744,
"loss": 0.9114,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -7.009586334228516,
"rewards/margins": 16.98141098022461,
"rewards/rejected": -23.990997314453125,
"semantic_entropy": 0.8362213373184204,
"step": 550
},
{
"epoch": 1.2131147540983607,
"grad_norm": 39.51029900614786,
"learning_rate": 4.012837270327288e-07,
"logits/chosen": -1.1518226861953735,
"logits/rejected": -1.1040208339691162,
"logps/chosen": -0.6657946705818176,
"logps/rejected": -2.024448871612549,
"loss": 1.0111,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -6.6579461097717285,
"rewards/margins": 13.586542129516602,
"rewards/rejected": -20.244487762451172,
"semantic_entropy": 0.8607606887817383,
"step": 555
},
{
"epoch": 1.2240437158469946,
"grad_norm": 47.0785790742371,
"learning_rate": 3.9193566913562915e-07,
"logits/chosen": -1.2187812328338623,
"logits/rejected": -1.1253793239593506,
"logps/chosen": -0.8078786730766296,
"logps/rejected": -2.1750519275665283,
"loss": 1.0263,
"rewards/accuracies": 0.90625,
"rewards/chosen": -8.078786849975586,
"rewards/margins": 13.671732902526855,
"rewards/rejected": -21.750518798828125,
"semantic_entropy": 0.8194610476493835,
"step": 560
},
{
"epoch": 1.2349726775956285,
"grad_norm": 44.31080037447064,
"learning_rate": 3.826270719827435e-07,
"logits/chosen": -1.2184025049209595,
"logits/rejected": -1.1244232654571533,
"logps/chosen": -0.7781059741973877,
"logps/rejected": -2.595242977142334,
"loss": 1.0496,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.781059265136719,
"rewards/margins": 18.171369552612305,
"rewards/rejected": -25.952428817749023,
"semantic_entropy": 0.8032097816467285,
"step": 565
},
{
"epoch": 1.2459016393442623,
"grad_norm": 57.680087176882985,
"learning_rate": 3.7336133469909623e-07,
"logits/chosen": -1.262069821357727,
"logits/rejected": -1.203547477722168,
"logps/chosen": -0.7461926341056824,
"logps/rejected": -2.1672732830047607,
"loss": 1.1028,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -7.461926460266113,
"rewards/margins": 14.210809707641602,
"rewards/rejected": -21.6727352142334,
"semantic_entropy": 0.8577653169631958,
"step": 570
},
{
"epoch": 1.2568306010928962,
"grad_norm": 46.59857147414731,
"learning_rate": 3.64141840759012e-07,
"logits/chosen": -1.1375811100006104,
"logits/rejected": -1.0560975074768066,
"logps/chosen": -0.6888304948806763,
"logps/rejected": -2.229635238647461,
"loss": 0.9418,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -6.8883056640625,
"rewards/margins": 15.408047676086426,
"rewards/rejected": -22.29635238647461,
"semantic_entropy": 0.8547189831733704,
"step": 575
},
{
"epoch": 1.2677595628415301,
"grad_norm": 70.16919238335676,
"learning_rate": 3.549719567506076e-07,
"logits/chosen": -1.1417677402496338,
"logits/rejected": -1.1007084846496582,
"logps/chosen": -0.746972918510437,
"logps/rejected": -2.0715861320495605,
"loss": 0.9986,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -7.469728946685791,
"rewards/margins": 13.246131896972656,
"rewards/rejected": -20.715862274169922,
"semantic_entropy": 0.8440540432929993,
"step": 580
},
{
"epoch": 1.278688524590164,
"grad_norm": 39.105942102294286,
"learning_rate": 3.4585503114644996e-07,
"logits/chosen": -1.2692724466323853,
"logits/rejected": -1.1571121215820312,
"logps/chosen": -0.7609504461288452,
"logps/rejected": -2.3702054023742676,
"loss": 1.0065,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -7.609503746032715,
"rewards/margins": 16.092552185058594,
"rewards/rejected": -23.70205307006836,
"semantic_entropy": 0.8199702501296997,
"step": 585
},
{
"epoch": 1.289617486338798,
"grad_norm": 35.90062312874268,
"learning_rate": 3.3679439308082774e-07,
"logits/chosen": -1.226792335510254,
"logits/rejected": -1.176424264907837,
"logps/chosen": -0.6281425356864929,
"logps/rejected": -2.045499324798584,
"loss": 0.9731,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -6.281425952911377,
"rewards/margins": 14.173568725585938,
"rewards/rejected": -20.454992294311523,
"semantic_entropy": 0.8588122129440308,
"step": 590
},
{
"epoch": 1.3005464480874318,
"grad_norm": 52.95598574128885,
"learning_rate": 3.2779335113408646e-07,
"logits/chosen": -1.233185052871704,
"logits/rejected": -1.1640207767486572,
"logps/chosen": -0.7508488297462463,
"logps/rejected": -2.4652957916259766,
"loss": 1.0038,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -7.508486747741699,
"rewards/margins": 17.14447021484375,
"rewards/rejected": -24.6529598236084,
"semantic_entropy": 0.8177651166915894,
"step": 595
},
{
"epoch": 1.3114754098360657,
"grad_norm": 40.0562568923892,
"learning_rate": 3.1885519212446716e-07,
"logits/chosen": -1.2854266166687012,
"logits/rejected": -1.177534580230713,
"logps/chosen": -0.6793255805969238,
"logps/rejected": -2.2706198692321777,
"loss": 0.9506,
"rewards/accuracies": 0.96875,
"rewards/chosen": -6.7932562828063965,
"rewards/margins": 15.912942886352539,
"rewards/rejected": -22.70619773864746,
"semantic_entropy": 0.8524688482284546,
"step": 600
},
{
"epoch": 1.3224043715846996,
"grad_norm": 56.41638001057574,
"learning_rate": 3.0998317990789376e-07,
"logits/chosen": -1.2670646905899048,
"logits/rejected": -1.171144962310791,
"logps/chosen": -0.6692796349525452,
"logps/rejected": -1.934456467628479,
"loss": 1.0026,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.692796230316162,
"rewards/margins": 12.65176773071289,
"rewards/rejected": -19.34456443786621,
"semantic_entropy": 0.869337260723114,
"step": 605
},
{
"epoch": 1.3333333333333333,
"grad_norm": 47.30494559887427,
"learning_rate": 3.0118055418614295e-07,
"logits/chosen": -1.3104336261749268,
"logits/rejected": -1.213578224182129,
"logps/chosen": -0.8171396255493164,
"logps/rejected": -2.5085349082946777,
"loss": 0.9846,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -8.17139720916748,
"rewards/margins": 16.91395378112793,
"rewards/rejected": -25.085350036621094,
"semantic_entropy": 0.7933089733123779,
"step": 610
},
{
"epoch": 1.3442622950819672,
"grad_norm": 55.65069108222119,
"learning_rate": 2.9245052932383707e-07,
"logits/chosen": -1.2602143287658691,
"logits/rejected": -1.1212416887283325,
"logps/chosen": -0.7733426094055176,
"logps/rejected": -2.3373031616210938,
"loss": 1.0585,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -7.733426570892334,
"rewards/margins": 15.639605522155762,
"rewards/rejected": -23.37303352355957,
"semantic_entropy": 0.8259070515632629,
"step": 615
},
{
"epoch": 1.355191256830601,
"grad_norm": 41.83594828022189,
"learning_rate": 2.83796293174686e-07,
"logits/chosen": -1.1642497777938843,
"logits/rejected": -1.0947132110595703,
"logps/chosen": -0.7484847903251648,
"logps/rejected": -2.3808321952819824,
"loss": 1.0132,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.4848480224609375,
"rewards/margins": 16.323474884033203,
"rewards/rejected": -23.808320999145508,
"semantic_entropy": 0.8322114944458008,
"step": 620
},
{
"epoch": 1.366120218579235,
"grad_norm": 45.85253729227267,
"learning_rate": 2.7522100591741217e-07,
"logits/chosen": -1.234703779220581,
"logits/rejected": -1.1591752767562866,
"logps/chosen": -0.6658716201782227,
"logps/rejected": -2.3456645011901855,
"loss": 0.9989,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -6.658716678619385,
"rewards/margins": 16.797927856445312,
"rewards/rejected": -23.45664405822754,
"semantic_entropy": 0.8470379710197449,
"step": 625
},
{
"epoch": 1.3770491803278688,
"grad_norm": 47.20204057866064,
"learning_rate": 2.6672779890178046e-07,
"logits/chosen": -1.163450002670288,
"logits/rejected": -1.0469523668289185,
"logps/chosen": -0.7807295918464661,
"logps/rejected": -2.2187490463256836,
"loss": 1.0123,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.807295322418213,
"rewards/margins": 14.380197525024414,
"rewards/rejected": -22.1874942779541,
"semantic_entropy": 0.829529881477356,
"step": 630
},
{
"epoch": 1.3879781420765027,
"grad_norm": 48.43553604807009,
"learning_rate": 2.5831977350515454e-07,
"logits/chosen": -1.1149486303329468,
"logits/rejected": -1.0645884275436401,
"logps/chosen": -0.7764806747436523,
"logps/rejected": -2.346562385559082,
"loss": 1.0361,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.764806270599365,
"rewards/margins": 15.700818061828613,
"rewards/rejected": -23.465625762939453,
"semantic_entropy": 0.8258574604988098,
"step": 635
},
{
"epoch": 1.3989071038251366,
"grad_norm": 50.20523377491874,
"learning_rate": 2.500000000000001e-07,
"logits/chosen": -1.2106841802597046,
"logits/rejected": -1.164466142654419,
"logps/chosen": -0.7233768105506897,
"logps/rejected": -2.620008945465088,
"loss": 0.932,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.233767032623291,
"rewards/margins": 18.966323852539062,
"rewards/rejected": -26.200092315673828,
"semantic_entropy": 0.8185870051383972,
"step": 640
},
{
"epoch": 1.4098360655737705,
"grad_norm": 50.15293641915176,
"learning_rate": 2.4177151643274307e-07,
"logits/chosen": -1.1696977615356445,
"logits/rejected": -1.112188458442688,
"logps/chosen": -0.7105950117111206,
"logps/rejected": -2.4047422409057617,
"loss": 0.9626,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -7.105950832366943,
"rewards/margins": 16.941471099853516,
"rewards/rejected": -24.047422409057617,
"semantic_entropy": 0.8116961717605591,
"step": 645
},
{
"epoch": 1.4207650273224044,
"grad_norm": 52.051060632639825,
"learning_rate": 2.3363732751439923e-07,
"logits/chosen": -1.2659627199172974,
"logits/rejected": -1.178022027015686,
"logps/chosen": -0.7824967503547668,
"logps/rejected": -2.2903237342834473,
"loss": 1.0342,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -7.824967861175537,
"rewards/margins": 15.078269958496094,
"rewards/rejected": -22.903236389160156,
"semantic_entropy": 0.8222282528877258,
"step": 650
},
{
"epoch": 1.4316939890710383,
"grad_norm": 104.74662245786296,
"learning_rate": 2.2560040352337307e-07,
"logits/chosen": -1.1930986642837524,
"logits/rejected": -1.0961310863494873,
"logps/chosen": -0.8049964904785156,
"logps/rejected": -2.6303577423095703,
"loss": 1.0368,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -8.049962997436523,
"rewards/margins": 18.253612518310547,
"rewards/rejected": -26.303577423095703,
"semantic_entropy": 0.8019247055053711,
"step": 655
},
{
"epoch": 1.4426229508196722,
"grad_norm": 74.14915143886914,
"learning_rate": 2.1766367922083283e-07,
"logits/chosen": -1.2195419073104858,
"logits/rejected": -1.1510334014892578,
"logps/chosen": -0.7229866981506348,
"logps/rejected": -2.4508605003356934,
"loss": 0.9204,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.229866981506348,
"rewards/margins": 17.278736114501953,
"rewards/rejected": -24.508602142333984,
"semantic_entropy": 0.8276729583740234,
"step": 660
},
{
"epoch": 1.453551912568306,
"grad_norm": 40.08916656671079,
"learning_rate": 2.0983005277905347e-07,
"logits/chosen": -1.25788152217865,
"logits/rejected": -1.1829631328582764,
"logps/chosen": -0.7363836765289307,
"logps/rejected": -2.4085285663604736,
"loss": 0.9793,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.363836765289307,
"rewards/margins": 16.721446990966797,
"rewards/rejected": -24.085285186767578,
"semantic_entropy": 0.8287376165390015,
"step": 665
},
{
"epoch": 1.46448087431694,
"grad_norm": 47.3989204733329,
"learning_rate": 2.021023847231202e-07,
"logits/chosen": -1.2234550714492798,
"logits/rejected": -1.1443179845809937,
"logps/chosen": -0.7974756956100464,
"logps/rejected": -2.3043999671936035,
"loss": 0.9905,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.974755764007568,
"rewards/margins": 15.069241523742676,
"rewards/rejected": -23.043996810913086,
"semantic_entropy": 0.8342604637145996,
"step": 670
},
{
"epoch": 1.4754098360655736,
"grad_norm": 108.78018960923754,
"learning_rate": 1.94483496886381e-07,
"logits/chosen": -1.1683439016342163,
"logits/rejected": -1.1087901592254639,
"logps/chosen": -0.6944879293441772,
"logps/rejected": -2.433687925338745,
"loss": 0.8989,
"rewards/accuracies": 0.9375,
"rewards/chosen": -6.944879055023193,
"rewards/margins": 17.391998291015625,
"rewards/rejected": -24.33687973022461,
"semantic_entropy": 0.8319599032402039,
"step": 675
},
{
"epoch": 1.4863387978142075,
"grad_norm": 60.19474027091482,
"learning_rate": 1.869761713800254e-07,
"logits/chosen": -1.2412843704223633,
"logits/rejected": -1.1452839374542236,
"logps/chosen": -0.831190288066864,
"logps/rejected": -2.4966881275177,
"loss": 1.0112,
"rewards/accuracies": 0.9375,
"rewards/chosen": -8.31190299987793,
"rewards/margins": 16.654979705810547,
"rewards/rejected": -24.966880798339844,
"semantic_entropy": 0.800665020942688,
"step": 680
},
{
"epoch": 1.4972677595628414,
"grad_norm": 45.288193362387666,
"learning_rate": 1.7958314957717064e-07,
"logits/chosen": -1.2326924800872803,
"logits/rejected": -1.1884281635284424,
"logps/chosen": -0.6524280309677124,
"logps/rejected": -2.181318998336792,
"loss": 0.9979,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -6.524280548095703,
"rewards/margins": 15.288909912109375,
"rewards/rejected": -21.813190460205078,
"semantic_entropy": 0.8463915586471558,
"step": 685
},
{
"epoch": 1.5081967213114753,
"grad_norm": 77.54652900736652,
"learning_rate": 1.7230713111182164e-07,
"logits/chosen": -1.2749425172805786,
"logits/rejected": -1.1991561651229858,
"logps/chosen": -0.6433757543563843,
"logps/rejected": -2.4266154766082764,
"loss": 0.9611,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.433757781982422,
"rewards/margins": 17.8323974609375,
"rewards/rejected": -24.266155242919922,
"semantic_entropy": 0.8604837656021118,
"step": 690
},
{
"epoch": 1.5191256830601092,
"grad_norm": 45.5200345735269,
"learning_rate": 1.651507728930739e-07,
"logits/chosen": -1.1950256824493408,
"logits/rejected": -1.131256103515625,
"logps/chosen": -0.6931561231613159,
"logps/rejected": -2.161853551864624,
"loss": 0.9934,
"rewards/accuracies": 0.90625,
"rewards/chosen": -6.931561470031738,
"rewards/margins": 14.686975479125977,
"rewards/rejected": -21.61853790283203,
"semantic_entropy": 0.8436753153800964,
"step": 695
},
{
"epoch": 1.530054644808743,
"grad_norm": 49.242008834049685,
"learning_rate": 1.5811668813491696e-07,
"logits/chosen": -1.3293455839157104,
"logits/rejected": -1.2231751680374146,
"logps/chosen": -0.7694125771522522,
"logps/rejected": -2.4189977645874023,
"loss": 0.978,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -7.694125175476074,
"rewards/margins": 16.495851516723633,
"rewards/rejected": -24.189977645874023,
"semantic_entropy": 0.8082691431045532,
"step": 700
},
{
"epoch": 1.540983606557377,
"grad_norm": 44.65399870377938,
"learning_rate": 1.5120744540199343e-07,
"logits/chosen": -1.2114274501800537,
"logits/rejected": -1.1308143138885498,
"logps/chosen": -0.7381525635719299,
"logps/rejected": -2.3527631759643555,
"loss": 0.9314,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.381524562835693,
"rewards/margins": 16.146106719970703,
"rewards/rejected": -23.527631759643555,
"semantic_entropy": 0.8333342671394348,
"step": 705
},
{
"epoch": 1.5519125683060109,
"grad_norm": 52.47246084045148,
"learning_rate": 1.4442556767166369e-07,
"logits/chosen": -1.2004725933074951,
"logits/rejected": -1.1394346952438354,
"logps/chosen": -0.7631191611289978,
"logps/rejected": -2.4908859729766846,
"loss": 1.0138,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -7.631192207336426,
"rewards/margins": 17.277666091918945,
"rewards/rejected": -24.908855438232422,
"semantic_entropy": 0.8088520169258118,
"step": 710
},
{
"epoch": 1.5628415300546448,
"grad_norm": 39.161062372274245,
"learning_rate": 1.377735314127148e-07,
"logits/chosen": -1.1989295482635498,
"logits/rejected": -1.0892112255096436,
"logps/chosen": -0.754266083240509,
"logps/rejected": -2.3557486534118652,
"loss": 0.9097,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -7.542661190032959,
"rewards/margins": 16.014827728271484,
"rewards/rejected": -23.5574893951416,
"semantic_entropy": 0.8200591206550598,
"step": 715
},
{
"epoch": 1.5737704918032787,
"grad_norm": 57.53753951235613,
"learning_rate": 1.312537656810549e-07,
"logits/chosen": -1.1801402568817139,
"logits/rejected": -1.1305280923843384,
"logps/chosen": -0.8796719312667847,
"logps/rejected": -2.6609649658203125,
"loss": 1.0603,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -8.79671859741211,
"rewards/margins": 17.812931060791016,
"rewards/rejected": -26.609649658203125,
"semantic_entropy": 0.7918781042098999,
"step": 720
},
{
"epoch": 1.5846994535519126,
"grad_norm": 51.68795375166876,
"learning_rate": 1.2486865123271866e-07,
"logits/chosen": -1.2510040998458862,
"logits/rejected": -1.1513909101486206,
"logps/chosen": -0.7905360460281372,
"logps/rejected": -2.450331449508667,
"loss": 0.988,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.905358791351318,
"rewards/margins": 16.59795570373535,
"rewards/rejected": -24.503314971923828,
"semantic_entropy": 0.811559796333313,
"step": 725
},
{
"epoch": 1.5956284153005464,
"grad_norm": 53.36439634728435,
"learning_rate": 1.1862051965451214e-07,
"logits/chosen": -1.2445173263549805,
"logits/rejected": -1.1288838386535645,
"logps/chosen": -0.7035760283470154,
"logps/rejected": -2.4538397789001465,
"loss": 0.9645,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -7.035760402679443,
"rewards/margins": 17.502635955810547,
"rewards/rejected": -24.53839683532715,
"semantic_entropy": 0.8314288258552551,
"step": 730
},
{
"epoch": 1.6065573770491803,
"grad_norm": 52.77186710891425,
"learning_rate": 1.1251165251261047e-07,
"logits/chosen": -1.1849864721298218,
"logits/rejected": -1.111053466796875,
"logps/chosen": -0.6819809675216675,
"logps/rejected": -2.3596489429473877,
"loss": 0.9183,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -6.819809913635254,
"rewards/margins": 16.77667999267578,
"rewards/rejected": -23.596487045288086,
"semantic_entropy": 0.8518983721733093,
"step": 735
},
{
"epoch": 1.6174863387978142,
"grad_norm": 51.04954161674348,
"learning_rate": 1.0654428051942138e-07,
"logits/chosen": -1.185575246810913,
"logits/rejected": -1.1258459091186523,
"logps/chosen": -0.8496238589286804,
"logps/rejected": -2.4404985904693604,
"loss": 1.0108,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -8.49623966217041,
"rewards/margins": 15.908746719360352,
"rewards/rejected": -24.404987335205078,
"semantic_entropy": 0.8217931985855103,
"step": 740
},
{
"epoch": 1.6284153005464481,
"grad_norm": 44.78590940359996,
"learning_rate": 1.0072058271901978e-07,
"logits/chosen": -1.1844556331634521,
"logits/rejected": -1.096343994140625,
"logps/chosen": -0.7650187611579895,
"logps/rejected": -2.4417996406555176,
"loss": 0.9889,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.6501874923706055,
"rewards/margins": 16.767807006835938,
"rewards/rejected": -24.41799545288086,
"semantic_entropy": 0.8134136199951172,
"step": 745
},
{
"epoch": 1.639344262295082,
"grad_norm": 41.35825995367568,
"learning_rate": 9.504268569144763e-08,
"logits/chosen": -1.2524887323379517,
"logits/rejected": -1.1518092155456543,
"logps/chosen": -0.6517141461372375,
"logps/rejected": -2.495558977127075,
"loss": 0.9019,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.517141819000244,
"rewards/margins": 18.438446044921875,
"rewards/rejected": -24.95558738708496,
"semantic_entropy": 0.8249934911727905,
"step": 750
},
{
"epoch": 1.650273224043716,
"grad_norm": 49.17981910139592,
"learning_rate": 8.951266277617325e-08,
"logits/chosen": -1.174800992012024,
"logits/rejected": -1.0904661417007446,
"logps/chosen": -0.6784438490867615,
"logps/rejected": -2.281085968017578,
"loss": 0.9285,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -6.784438133239746,
"rewards/margins": 16.026418685913086,
"rewards/rejected": -22.81085968017578,
"semantic_entropy": 0.8071689605712891,
"step": 755
},
{
"epoch": 1.6612021857923498,
"grad_norm": 55.44235074285089,
"learning_rate": 8.413253331499049e-08,
"logits/chosen": -1.2523894309997559,
"logits/rejected": -1.1709582805633545,
"logps/chosen": -0.7902460694313049,
"logps/rejected": -2.353731155395508,
"loss": 0.9701,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.90246057510376,
"rewards/margins": 15.634851455688477,
"rewards/rejected": -23.537311553955078,
"semantic_entropy": 0.8497117757797241,
"step": 760
},
{
"epoch": 1.6721311475409837,
"grad_norm": 46.939667617709574,
"learning_rate": 7.8904261914637e-08,
"logits/chosen": -1.2579504251480103,
"logits/rejected": -1.2005599737167358,
"logps/chosen": -0.7765697240829468,
"logps/rejected": -2.3420188426971436,
"loss": 1.0131,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.765698432922363,
"rewards/margins": 15.654492378234863,
"rewards/rejected": -23.420190811157227,
"semantic_entropy": 0.8250833749771118,
"step": 765
},
{
"epoch": 1.6830601092896176,
"grad_norm": 67.60864064581558,
"learning_rate": 7.382975772939865e-08,
"logits/chosen": -1.2617108821868896,
"logits/rejected": -1.2064878940582275,
"logps/chosen": -0.7011424899101257,
"logps/rejected": -2.4052655696868896,
"loss": 0.9795,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.011425971984863,
"rewards/margins": 17.04123306274414,
"rewards/rejected": -24.052656173706055,
"semantic_entropy": 0.8459088206291199,
"step": 770
},
{
"epoch": 1.6939890710382515,
"grad_norm": 68.25252330535682,
"learning_rate": 6.891087376396315e-08,
"logits/chosen": -1.1619203090667725,
"logits/rejected": -1.1151115894317627,
"logps/chosen": -0.6944946050643921,
"logps/rejected": -2.123880624771118,
"loss": 1.0529,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.9449462890625,
"rewards/margins": 14.293858528137207,
"rewards/rejected": -21.23880386352539,
"semantic_entropy": 0.8555929064750671,
"step": 775
},
{
"epoch": 1.7049180327868854,
"grad_norm": 58.94990698167926,
"learning_rate": 6.414940619677734e-08,
"logits/chosen": -1.21394944190979,
"logits/rejected": -1.148568034172058,
"logps/chosen": -0.7798916697502136,
"logps/rejected": -2.334639072418213,
"loss": 1.0831,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.798917293548584,
"rewards/margins": 15.547472953796387,
"rewards/rejected": -23.346389770507812,
"semantic_entropy": 0.8230711221694946,
"step": 780
},
{
"epoch": 1.7158469945355193,
"grad_norm": 54.978556677024066,
"learning_rate": 5.954709372415523e-08,
"logits/chosen": -1.2210636138916016,
"logits/rejected": -1.134007453918457,
"logps/chosen": -0.8276329040527344,
"logps/rejected": -2.5226263999938965,
"loss": 1.0036,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -8.27632999420166,
"rewards/margins": 16.949934005737305,
"rewards/rejected": -25.22626304626465,
"semantic_entropy": 0.8026347160339355,
"step": 785
},
{
"epoch": 1.7267759562841531,
"grad_norm": 58.44169076386046,
"learning_rate": 5.5105616925376296e-08,
"logits/chosen": -1.3411870002746582,
"logits/rejected": -1.1771245002746582,
"logps/chosen": -0.7094103097915649,
"logps/rejected": -2.3087127208709717,
"loss": 0.9863,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.094101905822754,
"rewards/margins": 15.993026733398438,
"rewards/rejected": -23.08713150024414,
"semantic_entropy": 0.8216876983642578,
"step": 790
},
{
"epoch": 1.737704918032787,
"grad_norm": 53.528578956238725,
"learning_rate": 5.082659764900482e-08,
"logits/chosen": -1.2835462093353271,
"logits/rejected": -1.2009773254394531,
"logps/chosen": -0.6398060917854309,
"logps/rejected": -2.0710248947143555,
"loss": 1.0059,
"rewards/accuracies": 0.9375,
"rewards/chosen": -6.3980607986450195,
"rewards/margins": 14.312187194824219,
"rewards/rejected": -20.710247039794922,
"semantic_entropy": 0.8597515225410461,
"step": 795
},
{
"epoch": 1.748633879781421,
"grad_norm": 59.57829603969701,
"learning_rate": 4.6711598420656976e-08,
"logits/chosen": -1.2482662200927734,
"logits/rejected": -1.1601988077163696,
"logps/chosen": -0.7208329439163208,
"logps/rejected": -2.314363956451416,
"loss": 0.9552,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.208329200744629,
"rewards/margins": 15.935308456420898,
"rewards/rejected": -23.143640518188477,
"semantic_entropy": 0.8409850001335144,
"step": 800
},
{
"epoch": 1.748633879781421,
"eval_logits/chosen": -1.5359925031661987,
"eval_logits/rejected": -1.4433752298355103,
"eval_logps/chosen": -0.8280417323112488,
"eval_logps/rejected": -2.125033140182495,
"eval_loss": 1.4401862621307373,
"eval_rewards/accuracies": 0.8795180916786194,
"eval_rewards/chosen": -8.280416488647461,
"eval_rewards/margins": 12.969916343688965,
"eval_rewards/rejected": -21.25033187866211,
"eval_runtime": 33.6039,
"eval_samples_per_second": 39.222,
"eval_semantic_entropy": 0.8376908898353577,
"eval_steps_per_second": 2.47,
"step": 800
},
{
"epoch": 1.7595628415300546,
"grad_norm": 44.1368033825106,
"learning_rate": 4.2762121872428615e-08,
"logits/chosen": -1.2641065120697021,
"logits/rejected": -1.2107889652252197,
"logps/chosen": -0.6843208074569702,
"logps/rejected": -2.0283682346343994,
"loss": 1.0256,
"rewards/accuracies": 0.96875,
"rewards/chosen": -6.843208312988281,
"rewards/margins": 13.440475463867188,
"rewards/rejected": -20.28368377685547,
"semantic_entropy": 0.8609482645988464,
"step": 805
},
{
"epoch": 1.7704918032786885,
"grad_norm": 60.8116220315975,
"learning_rate": 3.897961019419516e-08,
"logits/chosen": -1.242765188217163,
"logits/rejected": -1.111221194267273,
"logps/chosen": -0.6914607882499695,
"logps/rejected": -2.5515542030334473,
"loss": 1.026,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -6.914607048034668,
"rewards/margins": 18.600933074951172,
"rewards/rejected": -25.515541076660156,
"semantic_entropy": 0.8368776440620422,
"step": 810
},
{
"epoch": 1.7814207650273224,
"grad_norm": 48.041636594141835,
"learning_rate": 3.536544460698143e-08,
"logits/chosen": -1.2581889629364014,
"logits/rejected": -1.2215464115142822,
"logps/chosen": -0.7543720006942749,
"logps/rejected": -2.438751220703125,
"loss": 1.0363,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.5437211990356445,
"rewards/margins": 16.843791961669922,
"rewards/rejected": -24.387516021728516,
"semantic_entropy": 0.8024908304214478,
"step": 815
},
{
"epoch": 1.7923497267759563,
"grad_norm": 46.211811466738105,
"learning_rate": 3.192094485859526e-08,
"logits/chosen": -1.2139607667922974,
"logits/rejected": -1.1563109159469604,
"logps/chosen": -0.7942629456520081,
"logps/rejected": -2.2374846935272217,
"loss": 0.9534,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -7.942629337310791,
"rewards/margins": 14.432218551635742,
"rewards/rejected": -22.374849319458008,
"semantic_entropy": 0.8313804864883423,
"step": 820
},
{
"epoch": 1.8032786885245902,
"grad_norm": 51.498446681364456,
"learning_rate": 2.8647368741709367e-08,
"logits/chosen": -1.307348608970642,
"logits/rejected": -1.172135353088379,
"logps/chosen": -0.8334323167800903,
"logps/rejected": -2.4974188804626465,
"loss": 0.9931,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -8.334322929382324,
"rewards/margins": 16.63986587524414,
"rewards/rejected": -24.974185943603516,
"semantic_entropy": 0.7853243350982666,
"step": 825
},
{
"epoch": 1.814207650273224,
"grad_norm": 65.74184699216715,
"learning_rate": 2.5545911634565265e-08,
"logits/chosen": -1.2999436855316162,
"logits/rejected": -1.1716783046722412,
"logps/chosen": -0.7435690760612488,
"logps/rejected": -2.767209529876709,
"loss": 0.9785,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.435690402984619,
"rewards/margins": 20.236404418945312,
"rewards/rejected": -27.672094345092773,
"semantic_entropy": 0.8089765310287476,
"step": 830
},
{
"epoch": 1.825136612021858,
"grad_norm": 53.69755289327063,
"learning_rate": 2.261770606446983e-08,
"logits/chosen": -1.3077576160430908,
"logits/rejected": -1.2317638397216797,
"logps/chosen": -0.7318333387374878,
"logps/rejected": -1.9953196048736572,
"loss": 0.9652,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.318333625793457,
"rewards/margins": 12.634860038757324,
"rewards/rejected": -19.95319366455078,
"semantic_entropy": 0.8394795656204224,
"step": 835
},
{
"epoch": 1.8360655737704918,
"grad_norm": 48.57345276974053,
"learning_rate": 1.9863821294241522e-08,
"logits/chosen": -1.2126185894012451,
"logits/rejected": -1.10856032371521,
"logps/chosen": -0.7022706866264343,
"logps/rejected": -2.3867998123168945,
"loss": 0.9824,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.022706508636475,
"rewards/margins": 16.84528923034668,
"rewards/rejected": -23.86799430847168,
"semantic_entropy": 0.8377873301506042,
"step": 840
},
{
"epoch": 1.8469945355191257,
"grad_norm": 47.58974510921998,
"learning_rate": 1.7285262931759082e-08,
"logits/chosen": -1.170081615447998,
"logits/rejected": -1.1226613521575928,
"logps/chosen": -0.709827721118927,
"logps/rejected": -2.499692440032959,
"loss": 1.0049,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -7.0982770919799805,
"rewards/margins": 17.89864730834961,
"rewards/rejected": -24.99692726135254,
"semantic_entropy": 0.8213443756103516,
"step": 845
},
{
"epoch": 1.8579234972677594,
"grad_norm": 43.861213908082526,
"learning_rate": 1.4882972562753615e-08,
"logits/chosen": -1.2278581857681274,
"logits/rejected": -1.1186041831970215,
"logps/chosen": -0.6293253898620605,
"logps/rejected": -2.4325814247131348,
"loss": 0.9317,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.293253421783447,
"rewards/margins": 18.032560348510742,
"rewards/rejected": -24.325815200805664,
"semantic_entropy": 0.8304460644721985,
"step": 850
},
{
"epoch": 1.8688524590163933,
"grad_norm": 46.25639636622948,
"learning_rate": 1.2657827406979404e-08,
"logits/chosen": -1.2755509614944458,
"logits/rejected": -1.1995421648025513,
"logps/chosen": -0.7046025991439819,
"logps/rejected": -2.2888636589050293,
"loss": 0.9631,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.046026706695557,
"rewards/margins": 15.842610359191895,
"rewards/rejected": -22.88863754272461,
"semantic_entropy": 0.8367988467216492,
"step": 855
},
{
"epoch": 1.8797814207650272,
"grad_norm": 43.742641732125044,
"learning_rate": 1.0610639997888915e-08,
"logits/chosen": -1.144809603691101,
"logits/rejected": -1.0996748208999634,
"logps/chosen": -0.6617113947868347,
"logps/rejected": -2.071277141571045,
"loss": 0.9799,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -6.6171135902404785,
"rewards/margins": 14.095659255981445,
"rewards/rejected": -20.712770462036133,
"semantic_entropy": 0.8550912141799927,
"step": 860
},
{
"epoch": 1.890710382513661,
"grad_norm": 43.8556750169825,
"learning_rate": 8.742157885927804e-09,
"logits/chosen": -1.264917016029358,
"logits/rejected": -1.1865818500518799,
"logps/chosen": -0.7975755333900452,
"logps/rejected": -2.4832332134246826,
"loss": 0.9288,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.975754737854004,
"rewards/margins": 16.856576919555664,
"rewards/rejected": -24.832332611083984,
"semantic_entropy": 0.8138486742973328,
"step": 865
},
{
"epoch": 1.901639344262295,
"grad_norm": 49.4352171489155,
"learning_rate": 7.053063365559997e-09,
"logits/chosen": -1.2424798011779785,
"logits/rejected": -1.1954628229141235,
"logps/chosen": -0.6465862393379211,
"logps/rejected": -2.410433769226074,
"loss": 0.8832,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -6.465862274169922,
"rewards/margins": 17.638477325439453,
"rewards/rejected": -24.104337692260742,
"semantic_entropy": 0.8361645936965942,
"step": 870
},
{
"epoch": 1.9125683060109289,
"grad_norm": 79.71459811079978,
"learning_rate": 5.543973226120935e-09,
"logits/chosen": -1.2222373485565186,
"logits/rejected": -1.1502609252929688,
"logps/chosen": -0.7222265005111694,
"logps/rejected": -2.1863186359405518,
"loss": 0.9862,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -7.222264289855957,
"rewards/margins": 14.640920639038086,
"rewards/rejected": -21.86318588256836,
"semantic_entropy": 0.8562089800834656,
"step": 875
},
{
"epoch": 1.9234972677595628,
"grad_norm": 53.36096318687935,
"learning_rate": 4.215438526591064e-09,
"logits/chosen": -1.2770297527313232,
"logits/rejected": -1.2093579769134521,
"logps/chosen": -0.6959497332572937,
"logps/rejected": -2.2840352058410645,
"loss": 0.9871,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -6.959497928619385,
"rewards/margins": 15.880853652954102,
"rewards/rejected": -22.840351104736328,
"semantic_entropy": 0.849157452583313,
"step": 880
},
{
"epoch": 1.9344262295081966,
"grad_norm": 38.81388371132877,
"learning_rate": 3.0679443943712467e-09,
"logits/chosen": -1.3255574703216553,
"logits/rejected": -1.2370083332061768,
"logps/chosen": -0.7685250639915466,
"logps/rejected": -2.3793647289276123,
"loss": 0.9499,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.685250759124756,
"rewards/margins": 16.108396530151367,
"rewards/rejected": -23.79364585876465,
"semantic_entropy": 0.8148989677429199,
"step": 885
},
{
"epoch": 1.9453551912568305,
"grad_norm": 43.52784854249128,
"learning_rate": 2.1019098481337426e-09,
"logits/chosen": -1.271645188331604,
"logits/rejected": -1.1847755908966064,
"logps/chosen": -0.7262202501296997,
"logps/rejected": -2.480203151702881,
"loss": 0.9648,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.262202262878418,
"rewards/margins": 17.53982925415039,
"rewards/rejected": -24.802032470703125,
"semantic_entropy": 0.8072282671928406,
"step": 890
},
{
"epoch": 1.9562841530054644,
"grad_norm": 54.276319170574524,
"learning_rate": 1.3176876448135477e-09,
"logits/chosen": -1.311767816543579,
"logits/rejected": -1.1933305263519287,
"logps/chosen": -0.8360783457756042,
"logps/rejected": -2.5562148094177246,
"loss": 1.0277,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -8.360783576965332,
"rewards/margins": 17.201366424560547,
"rewards/rejected": -25.562149047851562,
"semantic_entropy": 0.8203535079956055,
"step": 895
},
{
"epoch": 1.9672131147540983,
"grad_norm": 50.371762692382795,
"learning_rate": 7.155641507955445e-10,
"logits/chosen": -1.2078804969787598,
"logits/rejected": -1.1214892864227295,
"logps/chosen": -0.6584422588348389,
"logps/rejected": -2.1391983032226562,
"loss": 1.026,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -6.5844221115112305,
"rewards/margins": 14.807560920715332,
"rewards/rejected": -21.391983032226562,
"semantic_entropy": 0.8509858250617981,
"step": 900
},
{
"epoch": 1.9781420765027322,
"grad_norm": 55.00348286428405,
"learning_rate": 2.957592373452056e-10,
"logits/chosen": -1.2071561813354492,
"logits/rejected": -1.1362513303756714,
"logps/chosen": -0.719018280506134,
"logps/rejected": -2.406873941421509,
"loss": 0.9953,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.190183162689209,
"rewards/margins": 16.87855339050293,
"rewards/rejected": -24.068737030029297,
"semantic_entropy": 0.8274633288383484,
"step": 905
},
{
"epoch": 1.989071038251366,
"grad_norm": 43.167771529061575,
"learning_rate": 5.842620032053824e-11,
"logits/chosen": -1.2589218616485596,
"logits/rejected": -1.189516544342041,
"logps/chosen": -0.7029792666435242,
"logps/rejected": -2.2207939624786377,
"loss": 0.9075,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -7.029792785644531,
"rewards/margins": 15.178146362304688,
"rewards/rejected": -22.20793914794922,
"semantic_entropy": 0.8508146405220032,
"step": 910
},
{
"epoch": 1.9978142076502732,
"step": 914,
"total_flos": 0.0,
"train_loss": 1.6402891297830795,
"train_runtime": 11806.3913,
"train_samples_per_second": 9.92,
"train_steps_per_second": 0.077
}
],
"logging_steps": 5,
"max_steps": 914,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}