{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 5000, "global_step": 4168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002399232245681382, "grad_norm": 3.9086405364003305, "learning_rate": 1.199040767386091e-09, "logits/chosen": -0.9392852187156677, "logits/rejected": -0.9925774335861206, "logps/chosen": -164.85171508789062, "logps/rejected": -169.34266662597656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0023992322456813818, "grad_norm": 4.318184225673836, "learning_rate": 1.199040767386091e-08, "logits/chosen": -0.8653285503387451, "logits/rejected": -1.0646977424621582, "logps/chosen": -367.5494384765625, "logps/rejected": -308.0057067871094, "loss": 0.6931, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": 0.00055171106941998, "rewards/margins": 0.00021127487707417458, "rewards/rejected": 0.0003404362651053816, "step": 10 }, { "epoch": 0.0047984644913627635, "grad_norm": 4.384399942785772, "learning_rate": 2.398081534772182e-08, "logits/chosen": -0.9145099520683289, "logits/rejected": -0.9615824818611145, "logps/chosen": -254.70645141601562, "logps/rejected": -225.65023803710938, "loss": 0.6933, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0004928931593894958, "rewards/margins": 0.0004294395330362022, "rewards/rejected": 6.345368456095457e-05, "step": 20 }, { "epoch": 0.007197696737044146, "grad_norm": 4.1919489271249395, "learning_rate": 3.597122302158273e-08, "logits/chosen": -1.0393908023834229, "logits/rejected": -1.1211938858032227, "logps/chosen": -247.6179962158203, "logps/rejected": -250.74832153320312, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0005728682735934854, "rewards/margins": -0.0005012283218093216, "rewards/rejected": -7.164000999182463e-05, "step": 30 }, { "epoch": 0.009596928982725527, "grad_norm": 4.043349234918003, "learning_rate": 4.796163069544364e-08, "logits/chosen": -1.0382745265960693, "logits/rejected": -1.1404989957809448, "logps/chosen": -246.5960693359375, "logps/rejected": -238.99038696289062, "loss": 0.6933, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0003935880376957357, "rewards/margins": 0.0007454471779055893, "rewards/rejected": -0.001139035215601325, "step": 40 }, { "epoch": 0.01199616122840691, "grad_norm": 4.337621377747828, "learning_rate": 5.995203836930455e-08, "logits/chosen": -0.9566876292228699, "logits/rejected": -1.0265729427337646, "logps/chosen": -273.5587463378906, "logps/rejected": -238.2271728515625, "loss": 0.6931, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0002776079345494509, "rewards/margins": -0.0013704797020182014, "rewards/rejected": 0.0010928716510534286, "step": 50 }, { "epoch": 0.014395393474088292, "grad_norm": 4.332693802131573, "learning_rate": 7.194244604316546e-08, "logits/chosen": -1.14139723777771, "logits/rejected": -1.063253402709961, "logps/chosen": -291.4471130371094, "logps/rejected": -265.26800537109375, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0003930005768779665, "rewards/margins": 0.00029442697996273637, "rewards/rejected": 9.857374243438244e-05, "step": 60 }, { "epoch": 0.016794625719769675, "grad_norm": 3.9392376744722797, "learning_rate": 8.393285371702638e-08, "logits/chosen": -0.7830671072006226, "logits/rejected": -0.8284071087837219, "logps/chosen": -280.4967346191406, "logps/rejected": -269.8634033203125, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": -0.00010640527762006968, "rewards/margins": -0.00039604370249435306, "rewards/rejected": 0.00028963852673768997, "step": 70 }, { "epoch": 0.019193857965451054, "grad_norm": 4.275834970816185, "learning_rate": 9.592326139088728e-08, "logits/chosen": -1.1247626543045044, "logits/rejected": -0.8464676141738892, "logps/chosen": -203.01101684570312, "logps/rejected": -241.64547729492188, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.00033823197009041905, "rewards/margins": 0.0008404625696130097, "rewards/rejected": -0.0005022305413149297, "step": 80 }, { "epoch": 0.021593090211132437, "grad_norm": 4.009980090025205, "learning_rate": 1.0791366906474819e-07, "logits/chosen": -1.128251552581787, "logits/rejected": -1.1966060400009155, "logps/chosen": -348.4684143066406, "logps/rejected": -300.92156982421875, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00025963489315472543, "rewards/margins": 0.0006502953474409878, "rewards/rejected": -0.0003906603087671101, "step": 90 }, { "epoch": 0.02399232245681382, "grad_norm": 4.278362574458803, "learning_rate": 1.199040767386091e-07, "logits/chosen": -0.8752719759941101, "logits/rejected": -0.7615184783935547, "logps/chosen": -262.26171875, "logps/rejected": -279.4682312011719, "loss": 0.6929, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0001237258838955313, "rewards/margins": -0.00013651838526129723, "rewards/rejected": 1.2792646884918213e-05, "step": 100 }, { "epoch": 0.026391554702495202, "grad_norm": 3.7292949735641874, "learning_rate": 1.3189448441247004e-07, "logits/chosen": -1.054966688156128, "logits/rejected": -1.089815616607666, "logps/chosen": -232.7165069580078, "logps/rejected": -230.30648803710938, "loss": 0.6926, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0005149660282768309, "rewards/margins": 0.0010742491576820612, "rewards/rejected": -0.0015892151277512312, "step": 110 }, { "epoch": 0.028790786948176585, "grad_norm": 4.185972544798998, "learning_rate": 1.4388489208633092e-07, "logits/chosen": -0.9251031875610352, "logits/rejected": -1.0560011863708496, "logps/chosen": -302.79620361328125, "logps/rejected": -279.6351013183594, "loss": 0.6928, "rewards/accuracies": 0.375, "rewards/chosen": -0.0033786073327064514, "rewards/margins": -0.0015979796880856156, "rewards/rejected": -0.0017806284595280886, "step": 120 }, { "epoch": 0.031190019193857964, "grad_norm": 3.7577616139381282, "learning_rate": 1.5587529976019183e-07, "logits/chosen": -1.1069813966751099, "logits/rejected": -1.0163028240203857, "logps/chosen": -225.87887573242188, "logps/rejected": -308.16943359375, "loss": 0.6922, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0021240042988210917, "rewards/margins": 0.0019912621937692165, "rewards/rejected": -0.004115266725420952, "step": 130 }, { "epoch": 0.03358925143953935, "grad_norm": 3.9602176902490616, "learning_rate": 1.6786570743405277e-07, "logits/chosen": -0.8096126317977905, "logits/rejected": -0.844383716583252, "logps/chosen": -278.711181640625, "logps/rejected": -270.23455810546875, "loss": 0.6913, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0016858462477102876, "rewards/margins": 0.004555505700409412, "rewards/rejected": -0.0062413522973656654, "step": 140 }, { "epoch": 0.03598848368522073, "grad_norm": 4.198772547754269, "learning_rate": 1.7985611510791365e-07, "logits/chosen": -1.0384037494659424, "logits/rejected": -1.0555726289749146, "logps/chosen": -231.3898468017578, "logps/rejected": -225.4952392578125, "loss": 0.6919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0014209033688530326, "rewards/margins": 0.00509048905223608, "rewards/rejected": -0.006511392537504435, "step": 150 }, { "epoch": 0.03838771593090211, "grad_norm": 4.167240538791073, "learning_rate": 1.9184652278177456e-07, "logits/chosen": -0.8518667221069336, "logits/rejected": -0.9568248987197876, "logps/chosen": -296.21734619140625, "logps/rejected": -231.2320098876953, "loss": 0.6906, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004124562256038189, "rewards/margins": 0.004930226132273674, "rewards/rejected": -0.009054789319634438, "step": 160 }, { "epoch": 0.040786948176583494, "grad_norm": 3.692310549028015, "learning_rate": 2.038369304556355e-07, "logits/chosen": -0.8354592323303223, "logits/rejected": -0.8758047819137573, "logps/chosen": -342.7477111816406, "logps/rejected": -333.38189697265625, "loss": 0.6903, "rewards/accuracies": 0.625, "rewards/chosen": -0.004680985119193792, "rewards/margins": 0.004498300142586231, "rewards/rejected": -0.009179284796118736, "step": 170 }, { "epoch": 0.04318618042226487, "grad_norm": 4.32405896978214, "learning_rate": 2.1582733812949638e-07, "logits/chosen": -1.1283237934112549, "logits/rejected": -1.1168252229690552, "logps/chosen": -238.8912353515625, "logps/rejected": -229.00265502929688, "loss": 0.6905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.004386520944535732, "rewards/margins": 0.007528006099164486, "rewards/rejected": -0.011914527975022793, "step": 180 }, { "epoch": 0.04558541266794626, "grad_norm": 4.666604305995101, "learning_rate": 2.278177458033573e-07, "logits/chosen": -0.9106446504592896, "logits/rejected": -0.9879466891288757, "logps/chosen": -306.4612121582031, "logps/rejected": -249.0087890625, "loss": 0.6894, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.006806717719882727, "rewards/margins": 0.007770798169076443, "rewards/rejected": -0.014577515423297882, "step": 190 }, { "epoch": 0.04798464491362764, "grad_norm": 3.9918872126977574, "learning_rate": 2.398081534772182e-07, "logits/chosen": -0.9901522397994995, "logits/rejected": -0.928848385810852, "logps/chosen": -313.17681884765625, "logps/rejected": -297.6922302246094, "loss": 0.6885, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.010936335660517216, "rewards/margins": 0.006564898882061243, "rewards/rejected": -0.017501235008239746, "step": 200 }, { "epoch": 0.05038387715930902, "grad_norm": 4.040620494299144, "learning_rate": 2.517985611510791e-07, "logits/chosen": -0.9027220606803894, "logits/rejected": -0.922700047492981, "logps/chosen": -230.9945831298828, "logps/rejected": -255.6648712158203, "loss": 0.6876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.010718774050474167, "rewards/margins": 0.009615534916520119, "rewards/rejected": -0.020334308966994286, "step": 210 }, { "epoch": 0.052783109404990404, "grad_norm": 4.002893355744946, "learning_rate": 2.637889688249401e-07, "logits/chosen": -0.8777509927749634, "logits/rejected": -0.9541767239570618, "logps/chosen": -312.22064208984375, "logps/rejected": -314.44476318359375, "loss": 0.6874, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.018286144360899925, "rewards/margins": 0.010097065940499306, "rewards/rejected": -0.02838321030139923, "step": 220 }, { "epoch": 0.05518234165067178, "grad_norm": 4.397529514704007, "learning_rate": 2.7577937649880093e-07, "logits/chosen": -0.8843205571174622, "logits/rejected": -0.7930720448493958, "logps/chosen": -240.90969848632812, "logps/rejected": -279.2537841796875, "loss": 0.684, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.013729272410273552, "rewards/margins": 0.02465725876390934, "rewards/rejected": -0.03838653117418289, "step": 230 }, { "epoch": 0.05758157389635317, "grad_norm": 4.676021615108152, "learning_rate": 2.8776978417266184e-07, "logits/chosen": -1.0245535373687744, "logits/rejected": -1.0780936479568481, "logps/chosen": -303.7603454589844, "logps/rejected": -259.3138732910156, "loss": 0.6808, "rewards/accuracies": 0.75, "rewards/chosen": -0.010991424322128296, "rewards/margins": 0.02684735879302025, "rewards/rejected": -0.037838783115148544, "step": 240 }, { "epoch": 0.05998080614203455, "grad_norm": 4.463678598202064, "learning_rate": 2.997601918465228e-07, "logits/chosen": -0.9612535238265991, "logits/rejected": -1.0222301483154297, "logps/chosen": -241.61404418945312, "logps/rejected": -236.07644653320312, "loss": 0.68, "rewards/accuracies": 0.75, "rewards/chosen": -0.021920276805758476, "rewards/margins": 0.018481746315956116, "rewards/rejected": -0.04040202870965004, "step": 250 }, { "epoch": 0.06238003838771593, "grad_norm": 4.115268408123004, "learning_rate": 3.1175059952038366e-07, "logits/chosen": -1.0142881870269775, "logits/rejected": -0.8710586428642273, "logps/chosen": -263.0195617675781, "logps/rejected": -259.115478515625, "loss": 0.6769, "rewards/accuracies": 0.75, "rewards/chosen": -0.021990353241562843, "rewards/margins": 0.03786135092377663, "rewards/rejected": -0.05985169857740402, "step": 260 }, { "epoch": 0.0647792706333973, "grad_norm": 4.201440552672297, "learning_rate": 3.2374100719424457e-07, "logits/chosen": -0.9422982931137085, "logits/rejected": -1.1441442966461182, "logps/chosen": -290.52044677734375, "logps/rejected": -235.48049926757812, "loss": 0.6748, "rewards/accuracies": 0.625, "rewards/chosen": -0.028027933090925217, "rewards/margins": 0.016811534762382507, "rewards/rejected": -0.044839464128017426, "step": 270 }, { "epoch": 0.0671785028790787, "grad_norm": 4.458570754919466, "learning_rate": 3.3573141486810554e-07, "logits/chosen": -1.0597388744354248, "logits/rejected": -1.0107687711715698, "logps/chosen": -299.5600891113281, "logps/rejected": -287.019287109375, "loss": 0.6676, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03363611549139023, "rewards/margins": 0.046159304678440094, "rewards/rejected": -0.07979541271924973, "step": 280 }, { "epoch": 0.06957773512476008, "grad_norm": 3.937791865544782, "learning_rate": 3.477218225419664e-07, "logits/chosen": -0.9224345088005066, "logits/rejected": -0.8189966082572937, "logps/chosen": -291.11492919921875, "logps/rejected": -267.08172607421875, "loss": 0.6674, "rewards/accuracies": 0.75, "rewards/chosen": -0.04258224740624428, "rewards/margins": 0.05275702476501465, "rewards/rejected": -0.09533928334712982, "step": 290 }, { "epoch": 0.07197696737044146, "grad_norm": 4.881987417549259, "learning_rate": 3.597122302158273e-07, "logits/chosen": -1.013051152229309, "logits/rejected": -1.0528075695037842, "logps/chosen": -260.43798828125, "logps/rejected": -280.9106750488281, "loss": 0.6638, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.061515528708696365, "rewards/margins": 0.059713393449783325, "rewards/rejected": -0.12122891843318939, "step": 300 }, { "epoch": 0.07437619961612284, "grad_norm": 3.887121301077397, "learning_rate": 3.7170263788968827e-07, "logits/chosen": -0.8997815847396851, "logits/rejected": -0.9898494482040405, "logps/chosen": -277.71112060546875, "logps/rejected": -239.8883514404297, "loss": 0.6659, "rewards/accuracies": 0.75, "rewards/chosen": -0.058370210230350494, "rewards/margins": 0.08987968415021896, "rewards/rejected": -0.14824989438056946, "step": 310 }, { "epoch": 0.07677543186180422, "grad_norm": 3.767825032617774, "learning_rate": 3.836930455635491e-07, "logits/chosen": -1.0142638683319092, "logits/rejected": -1.0835198163986206, "logps/chosen": -283.21722412109375, "logps/rejected": -256.9454040527344, "loss": 0.6599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0824267640709877, "rewards/margins": 0.06453671306371689, "rewards/rejected": -0.1469634771347046, "step": 320 }, { "epoch": 0.07917466410748561, "grad_norm": 4.101417109334993, "learning_rate": 3.9568345323741003e-07, "logits/chosen": -0.8915877342224121, "logits/rejected": -0.7875005006790161, "logps/chosen": -256.7735595703125, "logps/rejected": -309.68743896484375, "loss": 0.648, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13278505206108093, "rewards/margins": 0.12722721695899963, "rewards/rejected": -0.26001226902008057, "step": 330 }, { "epoch": 0.08157389635316699, "grad_norm": 4.184212161663267, "learning_rate": 4.07673860911271e-07, "logits/chosen": -0.8056583404541016, "logits/rejected": -0.8734966516494751, "logps/chosen": -257.9537353515625, "logps/rejected": -290.716552734375, "loss": 0.6385, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.15207326412200928, "rewards/margins": 0.15626882016658783, "rewards/rejected": -0.3083421289920807, "step": 340 }, { "epoch": 0.08397312859884837, "grad_norm": 5.07860606766039, "learning_rate": 4.1966426858513185e-07, "logits/chosen": -1.1134240627288818, "logits/rejected": -1.1105023622512817, "logps/chosen": -308.1408386230469, "logps/rejected": -322.1429748535156, "loss": 0.6317, "rewards/accuracies": 0.75, "rewards/chosen": -0.37560468912124634, "rewards/margins": 0.14869387447834015, "rewards/rejected": -0.5242985486984253, "step": 350 }, { "epoch": 0.08637236084452975, "grad_norm": 6.681443438336797, "learning_rate": 4.3165467625899276e-07, "logits/chosen": -0.9821847677230835, "logits/rejected": -1.1448824405670166, "logps/chosen": -324.4971618652344, "logps/rejected": -285.352294921875, "loss": 0.6325, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5944857001304626, "rewards/margins": 0.16079989075660706, "rewards/rejected": -0.7552856206893921, "step": 360 }, { "epoch": 0.08877159309021113, "grad_norm": 5.547129015189758, "learning_rate": 4.436450839328537e-07, "logits/chosen": -0.9680454134941101, "logits/rejected": -0.8957147598266602, "logps/chosen": -285.62225341796875, "logps/rejected": -329.6009826660156, "loss": 0.6155, "rewards/accuracies": 0.75, "rewards/chosen": -0.507738471031189, "rewards/margins": 0.350941002368927, "rewards/rejected": -0.858679473400116, "step": 370 }, { "epoch": 0.09117082533589252, "grad_norm": 5.005904357466744, "learning_rate": 4.556354916067146e-07, "logits/chosen": -1.0903767347335815, "logits/rejected": -1.0195515155792236, "logps/chosen": -279.7075500488281, "logps/rejected": -327.0304870605469, "loss": 0.5837, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.42896413803100586, "rewards/margins": 0.4645144045352936, "rewards/rejected": -0.8934786915779114, "step": 380 }, { "epoch": 0.0935700575815739, "grad_norm": 6.3368430821651085, "learning_rate": 4.676258992805755e-07, "logits/chosen": -0.9331681132316589, "logits/rejected": -0.974704384803772, "logps/chosen": -358.29052734375, "logps/rejected": -354.2314147949219, "loss": 0.6125, "rewards/accuracies": 0.625, "rewards/chosen": -0.9378200769424438, "rewards/margins": 0.23260822892189026, "rewards/rejected": -1.1704282760620117, "step": 390 }, { "epoch": 0.09596928982725528, "grad_norm": 4.993194981103021, "learning_rate": 4.796163069544364e-07, "logits/chosen": -0.9397958517074585, "logits/rejected": -1.0308669805526733, "logps/chosen": -327.28485107421875, "logps/rejected": -373.20538330078125, "loss": 0.6133, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6638423800468445, "rewards/margins": 0.6587511897087097, "rewards/rejected": -1.3225935697555542, "step": 400 }, { "epoch": 0.09836852207293666, "grad_norm": 5.627853897652041, "learning_rate": 4.916067146282974e-07, "logits/chosen": -1.0697743892669678, "logits/rejected": -1.02415931224823, "logps/chosen": -302.128173828125, "logps/rejected": -373.0263366699219, "loss": 0.5737, "rewards/accuracies": 0.75, "rewards/chosen": -0.543385922908783, "rewards/margins": 0.5250757932662964, "rewards/rejected": -1.0684617757797241, "step": 410 }, { "epoch": 0.10076775431861804, "grad_norm": 7.180880498197233, "learning_rate": 4.999992108529978e-07, "logits/chosen": -0.9345219731330872, "logits/rejected": -0.9572717547416687, "logps/chosen": -444.15997314453125, "logps/rejected": -470.5298767089844, "loss": 0.6004, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.090321660041809, "rewards/margins": 0.609116792678833, "rewards/rejected": -1.6994386911392212, "step": 420 }, { "epoch": 0.10316698656429943, "grad_norm": 11.482376711377093, "learning_rate": 4.999851817115532e-07, "logits/chosen": -1.1403158903121948, "logits/rejected": -1.0577712059020996, "logps/chosen": -349.0794982910156, "logps/rejected": -424.08642578125, "loss": 0.598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9095686078071594, "rewards/margins": 0.863958477973938, "rewards/rejected": -1.773526906967163, "step": 430 }, { "epoch": 0.10556621880998081, "grad_norm": 5.363358928784856, "learning_rate": 4.999536171027889e-07, "logits/chosen": -0.8907009363174438, "logits/rejected": -0.991034984588623, "logps/chosen": -319.98046875, "logps/rejected": -347.8736877441406, "loss": 0.5812, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6185662150382996, "rewards/margins": 0.3348569869995117, "rewards/rejected": -0.9534232020378113, "step": 440 }, { "epoch": 0.10796545105566219, "grad_norm": 5.047279311794404, "learning_rate": 4.999045192408369e-07, "logits/chosen": -1.0067179203033447, "logits/rejected": -1.0287652015686035, "logps/chosen": -324.7417907714844, "logps/rejected": -358.2411804199219, "loss": 0.585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.852948784828186, "rewards/margins": 0.47164326906204224, "rewards/rejected": -1.324592113494873, "step": 450 }, { "epoch": 0.11036468330134357, "grad_norm": 11.251777016471365, "learning_rate": 4.998378915697171e-07, "logits/chosen": -0.9187090992927551, "logits/rejected": -0.9824856519699097, "logps/chosen": -357.00994873046875, "logps/rejected": -421.2936096191406, "loss": 0.5563, "rewards/accuracies": 0.75, "rewards/chosen": -0.7373157739639282, "rewards/margins": 0.7148237824440002, "rewards/rejected": -1.4521396160125732, "step": 460 }, { "epoch": 0.11276391554702495, "grad_norm": 8.58525552938624, "learning_rate": 4.997537387630958e-07, "logits/chosen": -1.0368196964263916, "logits/rejected": -1.0864078998565674, "logps/chosen": -331.54705810546875, "logps/rejected": -414.0245666503906, "loss": 0.5324, "rewards/accuracies": 0.75, "rewards/chosen": -1.0620375871658325, "rewards/margins": 0.7675203084945679, "rewards/rejected": -1.82955801486969, "step": 470 }, { "epoch": 0.11516314779270634, "grad_norm": 7.973906104493475, "learning_rate": 4.996520667239582e-07, "logits/chosen": -1.2711211442947388, "logits/rejected": -1.1430588960647583, "logps/chosen": -344.3868408203125, "logps/rejected": -475.3819885253906, "loss": 0.5449, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9513166546821594, "rewards/margins": 0.8125057220458984, "rewards/rejected": -1.7638225555419922, "step": 480 }, { "epoch": 0.11756238003838772, "grad_norm": 7.29273708493132, "learning_rate": 4.995328825841939e-07, "logits/chosen": -0.945563793182373, "logits/rejected": -0.9637011289596558, "logps/chosen": -333.838134765625, "logps/rejected": -502.4449157714844, "loss": 0.5398, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9119874238967896, "rewards/margins": 1.7599906921386719, "rewards/rejected": -2.671978235244751, "step": 490 }, { "epoch": 0.1199616122840691, "grad_norm": 8.408190304146231, "learning_rate": 4.993961947040967e-07, "logits/chosen": -0.9354039430618286, "logits/rejected": -1.008681297302246, "logps/chosen": -389.75677490234375, "logps/rejected": -416.6995544433594, "loss": 0.5543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0735552310943604, "rewards/margins": 0.6031023263931274, "rewards/rejected": -1.6766574382781982, "step": 500 }, { "epoch": 0.12236084452975048, "grad_norm": 7.547972255202871, "learning_rate": 4.992420126717784e-07, "logits/chosen": -1.0642093420028687, "logits/rejected": -1.0110970735549927, "logps/chosen": -349.2254333496094, "logps/rejected": -501.6767578125, "loss": 0.5416, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9027541875839233, "rewards/margins": 1.5682001113891602, "rewards/rejected": -2.470954418182373, "step": 510 }, { "epoch": 0.12476007677543186, "grad_norm": 6.11542459949028, "learning_rate": 4.990703473024958e-07, "logits/chosen": -0.8702675104141235, "logits/rejected": -1.03139066696167, "logps/chosen": -410.02264404296875, "logps/rejected": -504.4461364746094, "loss": 0.5536, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.188328742980957, "rewards/margins": 1.0218807458877563, "rewards/rejected": -2.210209608078003, "step": 520 }, { "epoch": 0.12715930902111325, "grad_norm": 9.452659503317662, "learning_rate": 4.98881210637893e-07, "logits/chosen": -1.1504271030426025, "logits/rejected": -1.0904886722564697, "logps/chosen": -299.77197265625, "logps/rejected": -411.39520263671875, "loss": 0.5477, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.7923839092254639, "rewards/margins": 0.7800144553184509, "rewards/rejected": -1.5723984241485596, "step": 530 }, { "epoch": 0.1295585412667946, "grad_norm": 15.54174844903523, "learning_rate": 4.986746159451553e-07, "logits/chosen": -1.0087683200836182, "logits/rejected": -1.0048372745513916, "logps/chosen": -348.1635437011719, "logps/rejected": -465.615234375, "loss": 0.5681, "rewards/accuracies": 0.625, "rewards/chosen": -0.949160099029541, "rewards/margins": 1.2487725019454956, "rewards/rejected": -2.197932720184326, "step": 540 }, { "epoch": 0.131957773512476, "grad_norm": 6.811166436392921, "learning_rate": 4.984505777160795e-07, "logits/chosen": -0.8339638710021973, "logits/rejected": -0.8557635545730591, "logps/chosen": -368.79998779296875, "logps/rejected": -455.84423828125, "loss": 0.5638, "rewards/accuracies": 0.75, "rewards/chosen": -0.7133311629295349, "rewards/margins": 0.8222800493240356, "rewards/rejected": -1.5356113910675049, "step": 550 }, { "epoch": 0.1343570057581574, "grad_norm": 8.38746715084373, "learning_rate": 4.982091116660574e-07, "logits/chosen": -0.9729937314987183, "logits/rejected": -1.1065596342086792, "logps/chosen": -269.25994873046875, "logps/rejected": -294.3720703125, "loss": 0.5681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6644600629806519, "rewards/margins": 0.42904800176620483, "rewards/rejected": -1.0935081243515015, "step": 560 }, { "epoch": 0.13675623800383876, "grad_norm": 24.438576955287925, "learning_rate": 4.979502347329732e-07, "logits/chosen": -0.8234192132949829, "logits/rejected": -0.8349748849868774, "logps/chosen": -419.22637939453125, "logps/rejected": -574.6170654296875, "loss": 0.5309, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4598208665847778, "rewards/margins": 1.224372148513794, "rewards/rejected": -2.6841928958892822, "step": 570 }, { "epoch": 0.13915547024952016, "grad_norm": 8.10566134085291, "learning_rate": 4.976739650760151e-07, "logits/chosen": -1.0753071308135986, "logits/rejected": -1.1201988458633423, "logps/chosen": -420.6094665527344, "logps/rejected": -514.7296142578125, "loss": 0.5764, "rewards/accuracies": 0.75, "rewards/chosen": -1.6122499704360962, "rewards/margins": 1.0379631519317627, "rewards/rejected": -2.6502132415771484, "step": 580 }, { "epoch": 0.14155470249520152, "grad_norm": 6.442606806291236, "learning_rate": 4.97380322074402e-07, "logits/chosen": -0.7010489702224731, "logits/rejected": -0.7692248225212097, "logps/chosen": -344.9544982910156, "logps/rejected": -453.3834533691406, "loss": 0.5796, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.21074640750885, "rewards/margins": 1.0821633338928223, "rewards/rejected": -2.292909860610962, "step": 590 }, { "epoch": 0.14395393474088292, "grad_norm": 8.67810882207825, "learning_rate": 4.970693263260237e-07, "logits/chosen": -0.9984515905380249, "logits/rejected": -1.0792922973632812, "logps/chosen": -385.07855224609375, "logps/rejected": -431.91748046875, "loss": 0.5352, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8377297520637512, "rewards/margins": 0.9412263631820679, "rewards/rejected": -1.7789561748504639, "step": 600 }, { "epoch": 0.1463531669865643, "grad_norm": 19.243246370458866, "learning_rate": 4.967409996459966e-07, "logits/chosen": -0.9357202649116516, "logits/rejected": -0.966041088104248, "logps/chosen": -379.83709716796875, "logps/rejected": -436.16021728515625, "loss": 0.531, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2088489532470703, "rewards/margins": 0.8029053807258606, "rewards/rejected": -2.011754274368286, "step": 610 }, { "epoch": 0.14875239923224567, "grad_norm": 15.946020238913784, "learning_rate": 4.963953650651326e-07, "logits/chosen": -0.836955726146698, "logits/rejected": -0.9188618659973145, "logps/chosen": -472.018798828125, "logps/rejected": -462.2901306152344, "loss": 0.5215, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2413372993469238, "rewards/margins": 0.8713501691818237, "rewards/rejected": -2.112687587738037, "step": 620 }, { "epoch": 0.15115163147792707, "grad_norm": 7.979287445807055, "learning_rate": 4.960324468283248e-07, "logits/chosen": -1.00057053565979, "logits/rejected": -1.0442100763320923, "logps/chosen": -290.08868408203125, "logps/rejected": -377.97027587890625, "loss": 0.515, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.6758447885513306, "rewards/margins": 0.8780001401901245, "rewards/rejected": -1.5538448095321655, "step": 630 }, { "epoch": 0.15355086372360843, "grad_norm": 10.945616218615863, "learning_rate": 4.956522703928451e-07, "logits/chosen": -0.978852391242981, "logits/rejected": -0.8796356916427612, "logps/chosen": -318.9521484375, "logps/rejected": -465.2935485839844, "loss": 0.5063, "rewards/accuracies": 0.75, "rewards/chosen": -0.8167131543159485, "rewards/margins": 1.364931344985962, "rewards/rejected": -2.1816444396972656, "step": 640 }, { "epoch": 0.15595009596928983, "grad_norm": 12.631776767430628, "learning_rate": 4.952548624265606e-07, "logits/chosen": -0.9039742350578308, "logits/rejected": -0.9406811594963074, "logps/chosen": -403.57562255859375, "logps/rejected": -478.9420471191406, "loss": 0.5638, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3750524520874023, "rewards/margins": 0.8406556248664856, "rewards/rejected": -2.215708017349243, "step": 650 }, { "epoch": 0.15834932821497122, "grad_norm": 6.1260020762155145, "learning_rate": 4.948402508060607e-07, "logits/chosen": -1.0212910175323486, "logits/rejected": -1.0476784706115723, "logps/chosen": -298.73028564453125, "logps/rejected": -382.78021240234375, "loss": 0.542, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6404946446418762, "rewards/margins": 0.9978824853897095, "rewards/rejected": -1.6383771896362305, "step": 660 }, { "epoch": 0.16074856046065258, "grad_norm": 9.288255831934693, "learning_rate": 4.944084646147038e-07, "logits/chosen": -0.869672417640686, "logits/rejected": -0.9254360198974609, "logps/chosen": -365.64019775390625, "logps/rejected": -378.85113525390625, "loss": 0.5793, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5646086931228638, "rewards/margins": 0.3554513156414032, "rewards/rejected": -0.9200600385665894, "step": 670 }, { "epoch": 0.16314779270633398, "grad_norm": 12.016810333669639, "learning_rate": 4.939595341405754e-07, "logits/chosen": -0.8810294270515442, "logits/rejected": -0.9110749363899231, "logps/chosen": -320.2789001464844, "logps/rejected": -362.8446960449219, "loss": 0.5238, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6605560183525085, "rewards/margins": 0.6321157217025757, "rewards/rejected": -1.2926716804504395, "step": 680 }, { "epoch": 0.16554702495201534, "grad_norm": 9.358664871218739, "learning_rate": 4.93493490874365e-07, "logits/chosen": -0.9154292941093445, "logits/rejected": -0.9222054481506348, "logps/chosen": -362.84686279296875, "logps/rejected": -441.8236389160156, "loss": 0.5367, "rewards/accuracies": 0.75, "rewards/chosen": -1.235999345779419, "rewards/margins": 0.7342005968093872, "rewards/rejected": -1.9701995849609375, "step": 690 }, { "epoch": 0.16794625719769674, "grad_norm": 9.152240736767983, "learning_rate": 4.93010367507156e-07, "logits/chosen": -1.0514498949050903, "logits/rejected": -1.0271055698394775, "logps/chosen": -301.6610107421875, "logps/rejected": -385.259521484375, "loss": 0.5134, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8431800603866577, "rewards/margins": 1.0639536380767822, "rewards/rejected": -1.90713369846344, "step": 700 }, { "epoch": 0.17034548944337813, "grad_norm": 12.567284708252304, "learning_rate": 4.925101979281332e-07, "logits/chosen": -0.9216286540031433, "logits/rejected": -1.0931814908981323, "logps/chosen": -433.27130126953125, "logps/rejected": -560.4276123046875, "loss": 0.5031, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2827928066253662, "rewards/margins": 1.7925609350204468, "rewards/rejected": -3.0753538608551025, "step": 710 }, { "epoch": 0.1727447216890595, "grad_norm": 11.741747616486963, "learning_rate": 4.919930172222054e-07, "logits/chosen": -0.9665408134460449, "logits/rejected": -1.0536162853240967, "logps/chosen": -417.403076171875, "logps/rejected": -570.5277099609375, "loss": 0.5003, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7012180089950562, "rewards/margins": 1.562795877456665, "rewards/rejected": -3.2640137672424316, "step": 720 }, { "epoch": 0.1751439539347409, "grad_norm": 11.733941979044587, "learning_rate": 4.914588616675445e-07, "logits/chosen": -1.0246481895446777, "logits/rejected": -1.0158023834228516, "logps/chosen": -348.80133056640625, "logps/rejected": -439.29608154296875, "loss": 0.5572, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0424585342407227, "rewards/margins": 1.09738290309906, "rewards/rejected": -2.1398415565490723, "step": 730 }, { "epoch": 0.17754318618042225, "grad_norm": 8.84337454037855, "learning_rate": 4.909077687330404e-07, "logits/chosen": -0.8301714658737183, "logits/rejected": -0.9135535359382629, "logps/chosen": -330.27264404296875, "logps/rejected": -350.2643127441406, "loss": 0.5051, "rewards/accuracies": 0.75, "rewards/chosen": -0.6734127402305603, "rewards/margins": 0.41529732942581177, "rewards/rejected": -1.088710069656372, "step": 740 }, { "epoch": 0.17994241842610365, "grad_norm": 8.880233466088285, "learning_rate": 4.903397770756729e-07, "logits/chosen": -1.0296955108642578, "logits/rejected": -1.0950233936309814, "logps/chosen": -355.98968505859375, "logps/rejected": -452.5968322753906, "loss": 0.5248, "rewards/accuracies": 0.75, "rewards/chosen": -0.9854777455329895, "rewards/margins": 0.9587169885635376, "rewards/rejected": -1.9441944360733032, "step": 750 }, { "epoch": 0.18234165067178504, "grad_norm": 10.737134261298277, "learning_rate": 4.897549265378004e-07, "logits/chosen": -0.9651594161987305, "logits/rejected": -1.0333675146102905, "logps/chosen": -473.6534729003906, "logps/rejected": -615.0003662109375, "loss": 0.4966, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.580303430557251, "rewards/margins": 1.4406054019927979, "rewards/rejected": -3.020908832550049, "step": 760 }, { "epoch": 0.1847408829174664, "grad_norm": 10.188381001048064, "learning_rate": 4.891532581443643e-07, "logits/chosen": -1.1801836490631104, "logits/rejected": -1.2248878479003906, "logps/chosen": -438.201171875, "logps/rejected": -577.3571166992188, "loss": 0.4856, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3580670356750488, "rewards/margins": 1.4186818599700928, "rewards/rejected": -2.7767486572265625, "step": 770 }, { "epoch": 0.1871401151631478, "grad_norm": 13.185729895925714, "learning_rate": 4.885348141000122e-07, "logits/chosen": -1.06368887424469, "logits/rejected": -1.0469920635223389, "logps/chosen": -363.1687316894531, "logps/rejected": -502.82684326171875, "loss": 0.4839, "rewards/accuracies": 0.75, "rewards/chosen": -1.2150341272354126, "rewards/margins": 1.3010752201080322, "rewards/rejected": -2.5161094665527344, "step": 780 }, { "epoch": 0.18953934740882916, "grad_norm": 13.52232678239933, "learning_rate": 4.878996377861367e-07, "logits/chosen": -1.0281708240509033, "logits/rejected": -1.094001054763794, "logps/chosen": -321.34814453125, "logps/rejected": -453.59417724609375, "loss": 0.5211, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0169535875320435, "rewards/margins": 1.2799100875854492, "rewards/rejected": -2.2968640327453613, "step": 790 }, { "epoch": 0.19193857965451055, "grad_norm": 8.497366103850682, "learning_rate": 4.872477737578327e-07, "logits/chosen": -1.005568504333496, "logits/rejected": -0.9401241540908813, "logps/chosen": -398.41546630859375, "logps/rejected": -612.4130249023438, "loss": 0.4468, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.1582279205322266, "rewards/margins": 2.2199604511260986, "rewards/rejected": -3.378188371658325, "step": 800 }, { "epoch": 0.19433781190019195, "grad_norm": 17.573875043998388, "learning_rate": 4.865792677407718e-07, "logits/chosen": -1.0809205770492554, "logits/rejected": -1.1499931812286377, "logps/chosen": -387.76202392578125, "logps/rejected": -490.18243408203125, "loss": 0.5666, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.46564781665802, "rewards/margins": 1.2134864330291748, "rewards/rejected": -2.6791341304779053, "step": 810 }, { "epoch": 0.1967370441458733, "grad_norm": 8.204522849107846, "learning_rate": 4.858941666279955e-07, "logits/chosen": -0.8947283029556274, "logits/rejected": -0.9740638732910156, "logps/chosen": -393.2839660644531, "logps/rejected": -441.0948791503906, "loss": 0.5435, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2646175622940063, "rewards/margins": 0.5960845947265625, "rewards/rejected": -1.8607019186019897, "step": 820 }, { "epoch": 0.1991362763915547, "grad_norm": 10.142292221516385, "learning_rate": 4.851925184766247e-07, "logits/chosen": -1.053379774093628, "logits/rejected": -1.1101844310760498, "logps/chosen": -351.99090576171875, "logps/rejected": -449.01751708984375, "loss": 0.4937, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9777849316596985, "rewards/margins": 1.14426851272583, "rewards/rejected": -2.122053623199463, "step": 830 }, { "epoch": 0.20153550863723607, "grad_norm": 12.034416836065418, "learning_rate": 4.844743725044897e-07, "logits/chosen": -1.0946062803268433, "logits/rejected": -1.2976312637329102, "logps/chosen": -378.5492248535156, "logps/rejected": -474.7021484375, "loss": 0.5084, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2278741598129272, "rewards/margins": 1.197396159172058, "rewards/rejected": -2.4252700805664062, "step": 840 }, { "epoch": 0.20393474088291746, "grad_norm": 8.290745524509466, "learning_rate": 4.837397790866774e-07, "logits/chosen": -1.2077043056488037, "logits/rejected": -1.2064851522445679, "logps/chosen": -398.6294250488281, "logps/rejected": -532.10009765625, "loss": 0.5617, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0644519329071045, "rewards/margins": 1.5777161121368408, "rewards/rejected": -2.642167806625366, "step": 850 }, { "epoch": 0.20633397312859886, "grad_norm": 9.121636380617016, "learning_rate": 4.829887897519974e-07, "logits/chosen": -1.2348374128341675, "logits/rejected": -1.2065662145614624, "logps/chosen": -323.52520751953125, "logps/rejected": -456.983642578125, "loss": 0.5042, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9337062835693359, "rewards/margins": 1.160902500152588, "rewards/rejected": -2.094609022140503, "step": 860 }, { "epoch": 0.20873320537428022, "grad_norm": 10.124524555819926, "learning_rate": 4.82221457179368e-07, "logits/chosen": -1.2232173681259155, "logits/rejected": -1.2059452533721924, "logps/chosen": -376.4814147949219, "logps/rejected": -523.2281494140625, "loss": 0.4462, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.0080255270004272, "rewards/margins": 1.6320230960845947, "rewards/rejected": -2.6400485038757324, "step": 870 }, { "epoch": 0.21113243761996162, "grad_norm": 15.383829882801075, "learning_rate": 4.814378351941206e-07, "logits/chosen": -1.1030246019363403, "logits/rejected": -1.165531873703003, "logps/chosen": -376.2377014160156, "logps/rejected": -447.36871337890625, "loss": 0.5179, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1712180376052856, "rewards/margins": 0.8365498781204224, "rewards/rejected": -2.007767915725708, "step": 880 }, { "epoch": 0.21353166986564298, "grad_norm": 8.924221297687437, "learning_rate": 4.806379787642241e-07, "logits/chosen": -1.13001549243927, "logits/rejected": -1.117497205734253, "logps/chosen": -358.25286865234375, "logps/rejected": -490.5638732910156, "loss": 0.5141, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0755687952041626, "rewards/margins": 1.3798635005950928, "rewards/rejected": -2.4554319381713867, "step": 890 }, { "epoch": 0.21593090211132437, "grad_norm": 9.091481860281236, "learning_rate": 4.798219439964293e-07, "logits/chosen": -1.1416652202606201, "logits/rejected": -1.2265089750289917, "logps/chosen": -366.56915283203125, "logps/rejected": -414.20721435546875, "loss": 0.476, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.119057059288025, "rewards/margins": 0.3352181911468506, "rewards/rejected": -1.454275131225586, "step": 900 }, { "epoch": 0.21833013435700577, "grad_norm": 17.10074312802646, "learning_rate": 4.78989788132333e-07, "logits/chosen": -1.1152770519256592, "logits/rejected": -1.1071698665618896, "logps/chosen": -342.0806884765625, "logps/rejected": -534.8381958007812, "loss": 0.4536, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.19199538230896, "rewards/margins": 1.8661339282989502, "rewards/rejected": -3.0581297874450684, "step": 910 }, { "epoch": 0.22072936660268713, "grad_norm": 9.85354436821844, "learning_rate": 4.781415695443631e-07, "logits/chosen": -1.1570137739181519, "logits/rejected": -1.1956650018692017, "logps/chosen": -509.591552734375, "logps/rejected": -693.6530151367188, "loss": 0.5105, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.156250476837158, "rewards/margins": 1.8081929683685303, "rewards/rejected": -3.9644439220428467, "step": 920 }, { "epoch": 0.22312859884836853, "grad_norm": 10.93257575281564, "learning_rate": 4.772773477316836e-07, "logits/chosen": -1.0778967142105103, "logits/rejected": -1.133569598197937, "logps/chosen": -377.20037841796875, "logps/rejected": -475.40997314453125, "loss": 0.4849, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1448915004730225, "rewards/margins": 0.9261860847473145, "rewards/rejected": -2.071077823638916, "step": 930 }, { "epoch": 0.2255278310940499, "grad_norm": 13.493444996052222, "learning_rate": 4.7639718331602117e-07, "logits/chosen": -1.0905169248580933, "logits/rejected": -1.1160600185394287, "logps/chosen": -434.937744140625, "logps/rejected": -650.6595458984375, "loss": 0.501, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.5431007146835327, "rewards/margins": 2.3286290168762207, "rewards/rejected": -3.871730327606201, "step": 940 }, { "epoch": 0.22792706333973128, "grad_norm": 17.479671128658037, "learning_rate": 4.7550113803741275e-07, "logits/chosen": -1.1507641077041626, "logits/rejected": -1.2998677492141724, "logps/chosen": -448.26251220703125, "logps/rejected": -497.29779052734375, "loss": 0.4886, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5616766214370728, "rewards/margins": 1.1937475204467773, "rewards/rejected": -2.7554240226745605, "step": 950 }, { "epoch": 0.23032629558541268, "grad_norm": 17.248167606427355, "learning_rate": 4.7458927474987454e-07, "logits/chosen": -1.0716644525527954, "logits/rejected": -1.1447921991348267, "logps/chosen": -426.6358947753906, "logps/rejected": -470.9082946777344, "loss": 0.4858, "rewards/accuracies": 0.75, "rewards/chosen": -1.1936933994293213, "rewards/margins": 1.0052495002746582, "rewards/rejected": -2.1989428997039795, "step": 960 }, { "epoch": 0.23272552783109404, "grad_norm": 14.729008337765277, "learning_rate": 4.7366165741699347e-07, "logits/chosen": -0.9885573387145996, "logits/rejected": -1.0534632205963135, "logps/chosen": -458.31378173828125, "logps/rejected": -540.1591186523438, "loss": 0.4747, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3210374116897583, "rewards/margins": 1.1498852968215942, "rewards/rejected": -2.4709229469299316, "step": 970 }, { "epoch": 0.23512476007677544, "grad_norm": 15.150583413082405, "learning_rate": 4.727183511074401e-07, "logits/chosen": -1.309410810470581, "logits/rejected": -1.3342196941375732, "logps/chosen": -416.07293701171875, "logps/rejected": -466.21923828125, "loss": 0.4945, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2828443050384521, "rewards/margins": 0.6490219235420227, "rewards/rejected": -1.9318662881851196, "step": 980 }, { "epoch": 0.2375239923224568, "grad_norm": 11.648714128278801, "learning_rate": 4.717594219904043e-07, "logits/chosen": -1.0633580684661865, "logits/rejected": -1.179321527481079, "logps/chosen": -393.45648193359375, "logps/rejected": -493.12603759765625, "loss": 0.5057, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2883937358856201, "rewards/margins": 1.364997148513794, "rewards/rejected": -2.653390884399414, "step": 990 }, { "epoch": 0.2399232245681382, "grad_norm": 10.483921422479957, "learning_rate": 4.7078493733095393e-07, "logits/chosen": -1.0601375102996826, "logits/rejected": -1.1300022602081299, "logps/chosen": -432.314208984375, "logps/rejected": -605.5133056640625, "loss": 0.4784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7640736103057861, "rewards/margins": 1.7046064138412476, "rewards/rejected": -3.4686806201934814, "step": 1000 }, { "epoch": 0.2423224568138196, "grad_norm": 17.060662316036694, "learning_rate": 4.6979496548531614e-07, "logits/chosen": -1.243939757347107, "logits/rejected": -1.223625898361206, "logps/chosen": -446.4366760253906, "logps/rejected": -638.4450073242188, "loss": 0.5126, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8022911548614502, "rewards/margins": 1.4770129919052124, "rewards/rejected": -3.279303789138794, "step": 1010 }, { "epoch": 0.24472168905950095, "grad_norm": 12.903823112622515, "learning_rate": 4.6878957589608293e-07, "logits/chosen": -1.1194379329681396, "logits/rejected": -1.0698894262313843, "logps/chosen": -409.68603515625, "logps/rejected": -603.4207763671875, "loss": 0.5214, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4294629096984863, "rewards/margins": 1.5245181322097778, "rewards/rejected": -2.9539809226989746, "step": 1020 }, { "epoch": 0.24712092130518235, "grad_norm": 10.384415695069938, "learning_rate": 4.6776883908733956e-07, "logits/chosen": -1.1999337673187256, "logits/rejected": -1.311231255531311, "logps/chosen": -397.4037780761719, "logps/rejected": -460.95269775390625, "loss": 0.4774, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9981710314750671, "rewards/margins": 1.341308832168579, "rewards/rejected": -2.339479684829712, "step": 1030 }, { "epoch": 0.2495201535508637, "grad_norm": 14.761052654024631, "learning_rate": 4.667328266597178e-07, "logits/chosen": -1.1137980222702026, "logits/rejected": -1.1541672945022583, "logps/chosen": -391.23504638671875, "logps/rejected": -516.2595825195312, "loss": 0.4633, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3899133205413818, "rewards/margins": 1.2868678569793701, "rewards/rejected": -2.676781177520752, "step": 1040 }, { "epoch": 0.2519193857965451, "grad_norm": 10.417201545289524, "learning_rate": 4.6568161128537354e-07, "logits/chosen": -1.0899343490600586, "logits/rejected": -1.247287631034851, "logps/chosen": -420.63836669921875, "logps/rejected": -514.6077270507812, "loss": 0.4964, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5773645639419556, "rewards/margins": 1.506489872932434, "rewards/rejected": -3.0838541984558105, "step": 1050 }, { "epoch": 0.2543186180422265, "grad_norm": 16.488420937432345, "learning_rate": 4.6461526670288877e-07, "logits/chosen": -1.1203404664993286, "logits/rejected": -1.1449543237686157, "logps/chosen": -407.5829162597656, "logps/rejected": -497.4512634277344, "loss": 0.493, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3892755508422852, "rewards/margins": 1.1714465618133545, "rewards/rejected": -2.5607221126556396, "step": 1060 }, { "epoch": 0.2567178502879079, "grad_norm": 9.466776525081485, "learning_rate": 4.635338677120994e-07, "logits/chosen": -1.3948705196380615, "logits/rejected": -1.3905651569366455, "logps/chosen": -379.10009765625, "logps/rejected": -558.11328125, "loss": 0.4566, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.116693139076233, "rewards/margins": 1.6451349258422852, "rewards/rejected": -2.7618279457092285, "step": 1070 }, { "epoch": 0.2591170825335892, "grad_norm": 14.2505130371337, "learning_rate": 4.6243749016884835e-07, "logits/chosen": -1.1959508657455444, "logits/rejected": -1.2822418212890625, "logps/chosen": -459.30999755859375, "logps/rejected": -783.4131469726562, "loss": 0.5047, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8917354345321655, "rewards/margins": 2.781670331954956, "rewards/rejected": -4.673405647277832, "step": 1080 }, { "epoch": 0.2615163147792706, "grad_norm": 16.476931783493725, "learning_rate": 4.613262109796645e-07, "logits/chosen": -1.2336928844451904, "logits/rejected": -1.1787619590759277, "logps/chosen": -439.15838623046875, "logps/rejected": -729.7471313476562, "loss": 0.4727, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9186598062515259, "rewards/margins": 2.601553440093994, "rewards/rejected": -4.5202131271362305, "step": 1090 }, { "epoch": 0.263915547024952, "grad_norm": 12.359629943883897, "learning_rate": 4.602001080963678e-07, "logits/chosen": -1.1227662563323975, "logits/rejected": -1.1912363767623901, "logps/chosen": -392.65057373046875, "logps/rejected": -619.9832763671875, "loss": 0.4556, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2552802562713623, "rewards/margins": 2.460552215576172, "rewards/rejected": -3.715832233428955, "step": 1100 }, { "epoch": 0.2663147792706334, "grad_norm": 19.000478036791087, "learning_rate": 4.590592605106017e-07, "logits/chosen": -1.1630818843841553, "logits/rejected": -1.2058923244476318, "logps/chosen": -423.0645446777344, "logps/rejected": -619.4017333984375, "loss": 0.5008, "rewards/accuracies": 0.875, "rewards/chosen": -1.3737356662750244, "rewards/margins": 2.270371198654175, "rewards/rejected": -3.6441073417663574, "step": 1110 }, { "epoch": 0.2687140115163148, "grad_norm": 11.988638837694287, "learning_rate": 4.5790374824829165e-07, "logits/chosen": -1.1366350650787354, "logits/rejected": -1.2177644968032837, "logps/chosen": -313.95233154296875, "logps/rejected": -553.4268798828125, "loss": 0.5165, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.2247995138168335, "rewards/margins": 2.2755985260009766, "rewards/rejected": -3.5003979206085205, "step": 1120 }, { "epoch": 0.27111324376199614, "grad_norm": 13.389751916846311, "learning_rate": 4.5673365236403216e-07, "logits/chosen": -1.1376798152923584, "logits/rejected": -1.2225024700164795, "logps/chosen": -418.84930419921875, "logps/rejected": -644.3150024414062, "loss": 0.4922, "rewards/accuracies": 0.75, "rewards/chosen": -2.1243631839752197, "rewards/margins": 1.9992843866348267, "rewards/rejected": -4.123647212982178, "step": 1130 }, { "epoch": 0.27351247600767753, "grad_norm": 12.066478642007997, "learning_rate": 4.5554905493540075e-07, "logits/chosen": -1.393936038017273, "logits/rejected": -1.4359791278839111, "logps/chosen": -366.07171630859375, "logps/rejected": -646.8983154296875, "loss": 0.4248, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4785449504852295, "rewards/margins": 2.75530743598938, "rewards/rejected": -4.233852386474609, "step": 1140 }, { "epoch": 0.2759117082533589, "grad_norm": 13.594931228595337, "learning_rate": 4.5435003905720074e-07, "logits/chosen": -1.3401994705200195, "logits/rejected": -1.4189189672470093, "logps/chosen": -555.1502685546875, "logps/rejected": -733.55908203125, "loss": 0.4941, "rewards/accuracies": 0.875, "rewards/chosen": -2.782365560531616, "rewards/margins": 2.0489964485168457, "rewards/rejected": -4.831361770629883, "step": 1150 }, { "epoch": 0.2783109404990403, "grad_norm": 10.959828441128586, "learning_rate": 4.531366888356324e-07, "logits/chosen": -1.3572887182235718, "logits/rejected": -1.316543459892273, "logps/chosen": -379.75048828125, "logps/rejected": -709.0888061523438, "loss": 0.421, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.861644983291626, "rewards/margins": 2.909771680831909, "rewards/rejected": -4.771416664123535, "step": 1160 }, { "epoch": 0.2807101727447217, "grad_norm": 11.9806643935228, "learning_rate": 4.519090893823931e-07, "logits/chosen": -1.2535573244094849, "logits/rejected": -1.2999234199523926, "logps/chosen": -466.22979736328125, "logps/rejected": -587.1083984375, "loss": 0.4706, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.177558422088623, "rewards/margins": 1.3151285648345947, "rewards/rejected": -3.4926867485046387, "step": 1170 }, { "epoch": 0.28310940499040305, "grad_norm": 15.64676518237564, "learning_rate": 4.5066732680870734e-07, "logits/chosen": -1.1491421461105347, "logits/rejected": -1.2824984788894653, "logps/chosen": -410.51129150390625, "logps/rejected": -574.32861328125, "loss": 0.4552, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5698862075805664, "rewards/margins": 2.0116872787475586, "rewards/rejected": -3.581573486328125, "step": 1180 }, { "epoch": 0.28550863723608444, "grad_norm": 14.866463710844934, "learning_rate": 4.494114882192862e-07, "logits/chosen": -1.1980191469192505, "logits/rejected": -1.2435463666915894, "logps/chosen": -425.57733154296875, "logps/rejected": -676.6328735351562, "loss": 0.456, "rewards/accuracies": 0.875, "rewards/chosen": -1.72724187374115, "rewards/margins": 2.7374391555786133, "rewards/rejected": -4.4646806716918945, "step": 1190 }, { "epoch": 0.28790786948176583, "grad_norm": 13.193324388633975, "learning_rate": 4.4814166170621735e-07, "logits/chosen": -1.4249297380447388, "logits/rejected": -1.497604489326477, "logps/chosen": -480.2257385253906, "logps/rejected": -622.1727294921875, "loss": 0.4656, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2823410034179688, "rewards/margins": 1.655609130859375, "rewards/rejected": -3.9379496574401855, "step": 1200 }, { "epoch": 0.2903071017274472, "grad_norm": 20.768443614327058, "learning_rate": 4.468579363427858e-07, "logits/chosen": -1.2781771421432495, "logits/rejected": -1.3352419137954712, "logps/chosen": -420.2730407714844, "logps/rejected": -640.3319091796875, "loss": 0.4615, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5896456241607666, "rewards/margins": 2.4936366081237793, "rewards/rejected": -4.083281517028809, "step": 1210 }, { "epoch": 0.2927063339731286, "grad_norm": 9.971205156379035, "learning_rate": 4.4556040217722555e-07, "logits/chosen": -1.236127257347107, "logits/rejected": -1.2078077793121338, "logps/chosen": -356.2068786621094, "logps/rejected": -533.8853759765625, "loss": 0.4674, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1397576332092285, "rewards/margins": 1.5811337232589722, "rewards/rejected": -2.7208914756774902, "step": 1220 }, { "epoch": 0.29510556621880996, "grad_norm": 12.163666083646278, "learning_rate": 4.442491502264033e-07, "logits/chosen": -1.1941020488739014, "logits/rejected": -1.2168632745742798, "logps/chosen": -363.7603454589844, "logps/rejected": -455.57867431640625, "loss": 0.4612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3447378873825073, "rewards/margins": 1.0612128973007202, "rewards/rejected": -2.4059505462646484, "step": 1230 }, { "epoch": 0.29750479846449135, "grad_norm": 9.273351878722064, "learning_rate": 4.429242724694338e-07, "logits/chosen": -1.3338253498077393, "logits/rejected": -1.336753010749817, "logps/chosen": -395.6485595703125, "logps/rejected": -632.2241821289062, "loss": 0.454, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4914841651916504, "rewards/margins": 2.230437994003296, "rewards/rejected": -3.7219223976135254, "step": 1240 }, { "epoch": 0.29990403071017274, "grad_norm": 25.05777651266455, "learning_rate": 4.4158586184122817e-07, "logits/chosen": -1.1749566793441772, "logits/rejected": -1.271209478378296, "logps/chosen": -429.14306640625, "logps/rejected": -604.9749755859375, "loss": 0.4598, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3869397640228271, "rewards/margins": 2.094210386276245, "rewards/rejected": -3.4811503887176514, "step": 1250 }, { "epoch": 0.30230326295585414, "grad_norm": 17.369219336254258, "learning_rate": 4.4023401222597443e-07, "logits/chosen": -1.027753233909607, "logits/rejected": -1.1785060167312622, "logps/chosen": -425.7173767089844, "logps/rejected": -544.3839721679688, "loss": 0.4667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5711647272109985, "rewards/margins": 1.3786556720733643, "rewards/rejected": -2.9498205184936523, "step": 1260 }, { "epoch": 0.30470249520153553, "grad_norm": 11.861504310327616, "learning_rate": 4.3886881845055235e-07, "logits/chosen": -1.1897116899490356, "logits/rejected": -1.299993872642517, "logps/chosen": -375.5953674316406, "logps/rejected": -659.8858032226562, "loss": 0.4618, "rewards/accuracies": 0.875, "rewards/chosen": -1.3118809461593628, "rewards/margins": 2.890842914581299, "rewards/rejected": -4.202723979949951, "step": 1270 }, { "epoch": 0.30710172744721687, "grad_norm": 10.8218616423801, "learning_rate": 4.374903762778814e-07, "logits/chosen": -1.3571842908859253, "logits/rejected": -1.385545253753662, "logps/chosen": -494.83709716796875, "logps/rejected": -658.3067016601562, "loss": 0.4837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.376838445663452, "rewards/margins": 1.8850828409194946, "rewards/rejected": -4.261920928955078, "step": 1280 }, { "epoch": 0.30950095969289826, "grad_norm": 10.14042165637657, "learning_rate": 4.3609878240020356e-07, "logits/chosen": -1.2036101818084717, "logits/rejected": -1.3167588710784912, "logps/chosen": -486.25811767578125, "logps/rejected": -651.5155639648438, "loss": 0.4531, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9182531833648682, "rewards/margins": 2.2668986320495605, "rewards/rejected": -4.18515157699585, "step": 1290 }, { "epoch": 0.31190019193857965, "grad_norm": 11.299969604518076, "learning_rate": 4.346941344323005e-07, "logits/chosen": -1.376947045326233, "logits/rejected": -1.4665632247924805, "logps/chosen": -451.6893615722656, "logps/rejected": -499.0320739746094, "loss": 0.4913, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.929490089416504, "rewards/margins": 1.0548813343048096, "rewards/rejected": -2.9843716621398926, "step": 1300 }, { "epoch": 0.31429942418426104, "grad_norm": 11.980383531649544, "learning_rate": 4.332765309046467e-07, "logits/chosen": -1.332617998123169, "logits/rejected": -1.3639962673187256, "logps/chosen": -426.499755859375, "logps/rejected": -626.6729736328125, "loss": 0.474, "rewards/accuracies": 0.75, "rewards/chosen": -1.678342580795288, "rewards/margins": 2.378535747528076, "rewards/rejected": -4.056878566741943, "step": 1310 }, { "epoch": 0.31669865642994244, "grad_norm": 14.013248425816212, "learning_rate": 4.3184607125649754e-07, "logits/chosen": -1.2517986297607422, "logits/rejected": -1.3113311529159546, "logps/chosen": -414.6436462402344, "logps/rejected": -657.0100708007812, "loss": 0.4847, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3406541347503662, "rewards/margins": 2.270404815673828, "rewards/rejected": -3.6110591888427734, "step": 1320 }, { "epoch": 0.3190978886756238, "grad_norm": 15.333659280580797, "learning_rate": 4.304028558289141e-07, "logits/chosen": -1.4721230268478394, "logits/rejected": -1.506519079208374, "logps/chosen": -451.8147888183594, "logps/rejected": -667.3796997070312, "loss": 0.448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6051162481307983, "rewards/margins": 2.3636975288391113, "rewards/rejected": -3.96881365776062, "step": 1330 }, { "epoch": 0.32149712092130517, "grad_norm": 12.587601683578118, "learning_rate": 4.28946985857725e-07, "logits/chosen": -1.5413029193878174, "logits/rejected": -1.5675899982452393, "logps/chosen": -508.3623046875, "logps/rejected": -787.8271484375, "loss": 0.4284, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.329979658126831, "rewards/margins": 2.8664588928222656, "rewards/rejected": -5.196438789367676, "step": 1340 }, { "epoch": 0.32389635316698656, "grad_norm": 11.206457061422094, "learning_rate": 4.2747856346642445e-07, "logits/chosen": -1.1610701084136963, "logits/rejected": -1.1690763235092163, "logps/chosen": -401.5397033691406, "logps/rejected": -605.7808227539062, "loss": 0.4051, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7380218505859375, "rewards/margins": 2.2142810821533203, "rewards/rejected": -3.952302932739258, "step": 1350 }, { "epoch": 0.32629558541266795, "grad_norm": 20.124180714207323, "learning_rate": 4.2599769165900933e-07, "logits/chosen": -1.1436104774475098, "logits/rejected": -1.193645715713501, "logps/chosen": -501.67706298828125, "logps/rejected": -850.97314453125, "loss": 0.4949, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.594907283782959, "rewards/margins": 3.6968417167663574, "rewards/rejected": -6.291749000549316, "step": 1360 }, { "epoch": 0.32869481765834935, "grad_norm": 8.824763672957205, "learning_rate": 4.245044743127535e-07, "logits/chosen": -1.2460205554962158, "logits/rejected": -1.1843246221542358, "logps/chosen": -402.1271667480469, "logps/rejected": -616.098388671875, "loss": 0.4785, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.439296007156372, "rewards/margins": 2.2133936882019043, "rewards/rejected": -3.6526896953582764, "step": 1370 }, { "epoch": 0.3310940499040307, "grad_norm": 14.598877417461923, "learning_rate": 4.229990161709214e-07, "logits/chosen": -1.2217421531677246, "logits/rejected": -1.130197286605835, "logps/chosen": -352.7733459472656, "logps/rejected": -632.5406494140625, "loss": 0.463, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.2021641731262207, "rewards/margins": 2.610610246658325, "rewards/rejected": -3.8127739429473877, "step": 1380 }, { "epoch": 0.3334932821497121, "grad_norm": 11.865002342507843, "learning_rate": 4.214814228354204e-07, "logits/chosen": -1.342158555984497, "logits/rejected": -1.390978217124939, "logps/chosen": -457.16705322265625, "logps/rejected": -833.06787109375, "loss": 0.4364, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.8238433599472046, "rewards/margins": 3.916942596435547, "rewards/rejected": -5.740786075592041, "step": 1390 }, { "epoch": 0.33589251439539347, "grad_norm": 12.72331809654516, "learning_rate": 4.1995180075939375e-07, "logits/chosen": -1.4785504341125488, "logits/rejected": -1.4822914600372314, "logps/chosen": -445.6946716308594, "logps/rejected": -639.58837890625, "loss": 0.4643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7562789916992188, "rewards/margins": 2.1691954135894775, "rewards/rejected": -3.925474166870117, "step": 1400 }, { "epoch": 0.33829174664107486, "grad_norm": 11.703724239093889, "learning_rate": 4.1841025723975297e-07, "logits/chosen": -1.1503616571426392, "logits/rejected": -1.1993227005004883, "logps/chosen": -395.007080078125, "logps/rejected": -638.5355224609375, "loss": 0.4344, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1847431659698486, "rewards/margins": 2.6262497901916504, "rewards/rejected": -3.810993194580078, "step": 1410 }, { "epoch": 0.34069097888675626, "grad_norm": 19.639469762609966, "learning_rate": 4.168569004096516e-07, "logits/chosen": -1.2111709117889404, "logits/rejected": -1.1946176290512085, "logps/chosen": -405.03558349609375, "logps/rejected": -638.8396606445312, "loss": 0.4405, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7199838161468506, "rewards/margins": 2.166039228439331, "rewards/rejected": -3.886023759841919, "step": 1420 }, { "epoch": 0.3430902111324376, "grad_norm": 12.824816837365054, "learning_rate": 4.152918392308997e-07, "logits/chosen": -1.4239578247070312, "logits/rejected": -1.4245679378509521, "logps/chosen": -430.7586364746094, "logps/rejected": -618.7017822265625, "loss": 0.4354, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7669651508331299, "rewards/margins": 1.9917770624160767, "rewards/rejected": -3.758742094039917, "step": 1430 }, { "epoch": 0.345489443378119, "grad_norm": 17.48149790816653, "learning_rate": 4.137151834863213e-07, "logits/chosen": -1.307308554649353, "logits/rejected": -1.2468369007110596, "logps/chosen": -443.11370849609375, "logps/rejected": -816.6782836914062, "loss": 0.5016, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0893607139587402, "rewards/margins": 3.403632640838623, "rewards/rejected": -5.492993354797363, "step": 1440 }, { "epoch": 0.3478886756238004, "grad_norm": 13.277912286181408, "learning_rate": 4.121270437720526e-07, "logits/chosen": -1.192663550376892, "logits/rejected": -1.184259295463562, "logps/chosen": -402.0312194824219, "logps/rejected": -540.152587890625, "loss": 0.4536, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9467941522598267, "rewards/margins": 0.9955675005912781, "rewards/rejected": -2.942361354827881, "step": 1450 }, { "epoch": 0.3502879078694818, "grad_norm": 12.024084039884944, "learning_rate": 4.105275314897852e-07, "logits/chosen": -1.3382481336593628, "logits/rejected": -1.3211191892623901, "logps/chosen": -403.6156005859375, "logps/rejected": -820.3019409179688, "loss": 0.4487, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7442741394042969, "rewards/margins": 3.908841609954834, "rewards/rejected": -5.653115272521973, "step": 1460 }, { "epoch": 0.35268714011516317, "grad_norm": 10.938380648082374, "learning_rate": 4.089167588389508e-07, "logits/chosen": -1.0116260051727295, "logits/rejected": -1.1302238702774048, "logps/chosen": -524.224853515625, "logps/rejected": -730.9097900390625, "loss": 0.4594, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8979120254516602, "rewards/margins": 2.4903345108032227, "rewards/rejected": -4.388247489929199, "step": 1470 }, { "epoch": 0.3550863723608445, "grad_norm": 17.953634346330745, "learning_rate": 4.072948388088515e-07, "logits/chosen": -1.1399272680282593, "logits/rejected": -1.185240387916565, "logps/chosen": -480.47869873046875, "logps/rejected": -711.10107421875, "loss": 0.4749, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0881600379943848, "rewards/margins": 2.2733359336853027, "rewards/rejected": -4.3614959716796875, "step": 1480 }, { "epoch": 0.3574856046065259, "grad_norm": 15.479182257867892, "learning_rate": 4.056618851707334e-07, "logits/chosen": -1.2174913883209229, "logits/rejected": -1.3078429698944092, "logps/chosen": -448.73626708984375, "logps/rejected": -778.0083618164062, "loss": 0.4081, "rewards/accuracies": 0.875, "rewards/chosen": -1.7068407535552979, "rewards/margins": 3.3156890869140625, "rewards/rejected": -5.022529602050781, "step": 1490 }, { "epoch": 0.3598848368522073, "grad_norm": 12.258518175047803, "learning_rate": 4.0401801246980675e-07, "logits/chosen": -1.3552016019821167, "logits/rejected": -1.4195324182510376, "logps/chosen": -483.03607177734375, "logps/rejected": -829.8291015625, "loss": 0.4493, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.638827323913574, "rewards/margins": 3.691737651824951, "rewards/rejected": -6.330564498901367, "step": 1500 }, { "epoch": 0.3622840690978887, "grad_norm": 12.234674147688299, "learning_rate": 4.0236333601721043e-07, "logits/chosen": -1.267978310585022, "logits/rejected": -1.2512781620025635, "logps/chosen": -463.58636474609375, "logps/rejected": -589.7911376953125, "loss": 0.4873, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8561378717422485, "rewards/margins": 1.1768932342529297, "rewards/rejected": -3.0330309867858887, "step": 1510 }, { "epoch": 0.3646833013435701, "grad_norm": 14.004595080556923, "learning_rate": 4.0069797188192364e-07, "logits/chosen": -1.1959376335144043, "logits/rejected": -1.1977354288101196, "logps/chosen": -557.3121337890625, "logps/rejected": -894.349609375, "loss": 0.4741, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.671919345855713, "rewards/margins": 3.7750651836395264, "rewards/rejected": -6.446984767913818, "step": 1520 }, { "epoch": 0.3670825335892514, "grad_norm": 12.107627085595226, "learning_rate": 3.9902203688262417e-07, "logits/chosen": -1.2063888311386108, "logits/rejected": -1.294390082359314, "logps/chosen": -402.6402282714844, "logps/rejected": -547.6096801757812, "loss": 0.4266, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.433250904083252, "rewards/margins": 1.5607125759124756, "rewards/rejected": -2.9939634799957275, "step": 1530 }, { "epoch": 0.3694817658349328, "grad_norm": 17.020506697194886, "learning_rate": 3.9733564857949365e-07, "logits/chosen": -1.2349357604980469, "logits/rejected": -1.3519177436828613, "logps/chosen": -479.85992431640625, "logps/rejected": -652.7718505859375, "loss": 0.4142, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8144111633300781, "rewards/margins": 2.1435914039611816, "rewards/rejected": -3.958002805709839, "step": 1540 }, { "epoch": 0.3718809980806142, "grad_norm": 17.340247743555018, "learning_rate": 3.9563892526597177e-07, "logits/chosen": -1.342357873916626, "logits/rejected": -1.2987167835235596, "logps/chosen": -361.8048400878906, "logps/rejected": -495.60577392578125, "loss": 0.4381, "rewards/accuracies": 0.75, "rewards/chosen": -1.3809950351715088, "rewards/margins": 0.9643322825431824, "rewards/rejected": -2.345327138900757, "step": 1550 }, { "epoch": 0.3742802303262956, "grad_norm": 12.57132648844664, "learning_rate": 3.9393198596045795e-07, "logits/chosen": -1.2646602392196655, "logits/rejected": -1.227052927017212, "logps/chosen": -390.56402587890625, "logps/rejected": -566.3883666992188, "loss": 0.4795, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.529387354850769, "rewards/margins": 1.646651029586792, "rewards/rejected": -3.176038980484009, "step": 1560 }, { "epoch": 0.376679462571977, "grad_norm": 9.769186078821475, "learning_rate": 3.922149503979628e-07, "logits/chosen": -1.0911178588867188, "logits/rejected": -1.1452914476394653, "logps/chosen": -446.0682678222656, "logps/rejected": -891.4172973632812, "loss": 0.421, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7286949157714844, "rewards/margins": 4.412930011749268, "rewards/rejected": -6.141624927520752, "step": 1570 }, { "epoch": 0.3790786948176583, "grad_norm": 15.13387973536505, "learning_rate": 3.904879390217095e-07, "logits/chosen": -1.2228879928588867, "logits/rejected": -1.2863072156906128, "logps/chosen": -414.58251953125, "logps/rejected": -585.94189453125, "loss": 0.4376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5823562145233154, "rewards/margins": 1.9467941522598267, "rewards/rejected": -3.529151201248169, "step": 1580 }, { "epoch": 0.3814779270633397, "grad_norm": 15.692773841294583, "learning_rate": 3.8875107297468463e-07, "logits/chosen": -1.1607332229614258, "logits/rejected": -1.1135156154632568, "logps/chosen": -394.6246032714844, "logps/rejected": -685.8594970703125, "loss": 0.4893, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5657716989517212, "rewards/margins": 2.4296836853027344, "rewards/rejected": -3.995455503463745, "step": 1590 }, { "epoch": 0.3838771593090211, "grad_norm": 12.22013342174461, "learning_rate": 3.87004474091141e-07, "logits/chosen": -1.0785077810287476, "logits/rejected": -1.145101547241211, "logps/chosen": -388.1426696777344, "logps/rejected": -562.6093139648438, "loss": 0.45, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6640561819076538, "rewards/margins": 1.593210220336914, "rewards/rejected": -3.2572665214538574, "step": 1600 }, { "epoch": 0.3862763915547025, "grad_norm": 12.125863891345588, "learning_rate": 3.8524826488805114e-07, "logits/chosen": -1.2912501096725464, "logits/rejected": -1.281894326210022, "logps/chosen": -448.44403076171875, "logps/rejected": -577.4244384765625, "loss": 0.4996, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6368385553359985, "rewards/margins": 1.7040477991104126, "rewards/rejected": -3.3408865928649902, "step": 1610 }, { "epoch": 0.3886756238003839, "grad_norm": 14.772144560930354, "learning_rate": 3.834825685565133e-07, "logits/chosen": -1.2755136489868164, "logits/rejected": -1.3778313398361206, "logps/chosen": -365.7793273925781, "logps/rejected": -445.6787109375, "loss": 0.4148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2803051471710205, "rewards/margins": 1.1510274410247803, "rewards/rejected": -2.43133282661438, "step": 1620 }, { "epoch": 0.39107485604606523, "grad_norm": 19.441822036099662, "learning_rate": 3.8170750895311007e-07, "logits/chosen": -1.168717622756958, "logits/rejected": -1.1627219915390015, "logps/chosen": -418.5533752441406, "logps/rejected": -577.6104736328125, "loss": 0.4001, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3612974882125854, "rewards/margins": 1.8237950801849365, "rewards/rejected": -3.1850924491882324, "step": 1630 }, { "epoch": 0.3934740882917466, "grad_norm": 15.432355872695538, "learning_rate": 3.7992321059122045e-07, "logits/chosen": -1.1575825214385986, "logits/rejected": -1.2952228784561157, "logps/chosen": -471.82476806640625, "logps/rejected": -670.2379150390625, "loss": 0.4553, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2020068168640137, "rewards/margins": 2.1685726642608643, "rewards/rejected": -4.370579242706299, "step": 1640 }, { "epoch": 0.395873320537428, "grad_norm": 12.344277319747519, "learning_rate": 3.7812979863228576e-07, "logits/chosen": -1.3181376457214355, "logits/rejected": -1.3618929386138916, "logps/chosen": -485.92034912109375, "logps/rejected": -651.4783935546875, "loss": 0.4458, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.687350034713745, "rewards/margins": 1.5618057250976562, "rewards/rejected": -4.2491559982299805, "step": 1650 }, { "epoch": 0.3982725527831094, "grad_norm": 15.718103189453073, "learning_rate": 3.763273988770296e-07, "logits/chosen": -1.1789578199386597, "logits/rejected": -1.2662181854248047, "logps/chosen": -411.53680419921875, "logps/rejected": -600.5362548828125, "loss": 0.4555, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7146021127700806, "rewards/margins": 1.884450912475586, "rewards/rejected": -3.599053144454956, "step": 1660 }, { "epoch": 0.4006717850287908, "grad_norm": 12.906974103488265, "learning_rate": 3.7451613775663405e-07, "logits/chosen": -1.1591131687164307, "logits/rejected": -1.1079394817352295, "logps/chosen": -392.81610107421875, "logps/rejected": -686.8541259765625, "loss": 0.4617, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5098912715911865, "rewards/margins": 2.9055066108703613, "rewards/rejected": -4.415398120880127, "step": 1670 }, { "epoch": 0.40307101727447214, "grad_norm": 17.744184430696986, "learning_rate": 3.726961423238706e-07, "logits/chosen": -1.2879854440689087, "logits/rejected": -1.2753901481628418, "logps/chosen": -382.4233703613281, "logps/rejected": -630.2597045898438, "loss": 0.4446, "rewards/accuracies": 0.75, "rewards/chosen": -1.5524839162826538, "rewards/margins": 2.3134608268737793, "rewards/rejected": -3.8659446239471436, "step": 1680 }, { "epoch": 0.40547024952015354, "grad_norm": 15.938147948338413, "learning_rate": 3.708675402441882e-07, "logits/chosen": -1.146555781364441, "logits/rejected": -1.3221074342727661, "logps/chosen": -438.2669372558594, "logps/rejected": -592.5673217773438, "loss": 0.4849, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6606773138046265, "rewards/margins": 1.8763577938079834, "rewards/rejected": -3.5370349884033203, "step": 1690 }, { "epoch": 0.40786948176583493, "grad_norm": 15.397723594978256, "learning_rate": 3.6903045978675775e-07, "logits/chosen": -1.1809333562850952, "logits/rejected": -1.2159626483917236, "logps/chosen": -393.14300537109375, "logps/rejected": -634.4575805664062, "loss": 0.4445, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5431654453277588, "rewards/margins": 2.6182663440704346, "rewards/rejected": -4.161431789398193, "step": 1700 }, { "epoch": 0.4102687140115163, "grad_norm": 12.520789981032921, "learning_rate": 3.6718502981547474e-07, "logits/chosen": -1.2585941553115845, "logits/rejected": -1.2499480247497559, "logps/chosen": -395.173828125, "logps/rejected": -598.7399291992188, "loss": 0.4262, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.478262186050415, "rewards/margins": 1.6322393417358398, "rewards/rejected": -3.110501527786255, "step": 1710 }, { "epoch": 0.4126679462571977, "grad_norm": 14.28113094156497, "learning_rate": 3.6533137977991986e-07, "logits/chosen": -1.1053855419158936, "logits/rejected": -1.1341431140899658, "logps/chosen": -424.7064514160156, "logps/rejected": -587.2310791015625, "loss": 0.5011, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4273463487625122, "rewards/margins": 1.3417994976043701, "rewards/rejected": -2.769145965576172, "step": 1720 }, { "epoch": 0.41506717850287905, "grad_norm": 11.265817004608927, "learning_rate": 3.6346963970627865e-07, "logits/chosen": -1.1037083864212036, "logits/rejected": -1.0224764347076416, "logps/chosen": -393.5522766113281, "logps/rejected": -614.2374877929688, "loss": 0.4456, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5364679098129272, "rewards/margins": 2.105459690093994, "rewards/rejected": -3.641927719116211, "step": 1730 }, { "epoch": 0.41746641074856045, "grad_norm": 11.977913591571605, "learning_rate": 3.615999401882207e-07, "logits/chosen": -1.3552181720733643, "logits/rejected": -1.3371044397354126, "logps/chosen": -418.01397705078125, "logps/rejected": -775.3851318359375, "loss": 0.453, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8135935068130493, "rewards/margins": 3.5035622119903564, "rewards/rejected": -5.317155361175537, "step": 1740 }, { "epoch": 0.41986564299424184, "grad_norm": 11.503365691015834, "learning_rate": 3.597224123777389e-07, "logits/chosen": -1.2816386222839355, "logits/rejected": -1.2878687381744385, "logps/chosen": -511.79852294921875, "logps/rejected": -887.9742431640625, "loss": 0.447, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.595597743988037, "rewards/margins": 3.664611339569092, "rewards/rejected": -6.260209083557129, "step": 1750 }, { "epoch": 0.42226487523992323, "grad_norm": 16.17377166072833, "learning_rate": 3.5783718797595e-07, "logits/chosen": -1.2984836101531982, "logits/rejected": -1.399364709854126, "logps/chosen": -505.53204345703125, "logps/rejected": -702.7299194335938, "loss": 0.4559, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.0332143306732178, "rewards/margins": 2.416463851928711, "rewards/rejected": -4.449678421020508, "step": 1760 }, { "epoch": 0.4246641074856046, "grad_norm": 12.49839489285334, "learning_rate": 3.559443992238558e-07, "logits/chosen": -1.3506227731704712, "logits/rejected": -1.409407615661621, "logps/chosen": -397.6769714355469, "logps/rejected": -840.98095703125, "loss": 0.4406, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.526439905166626, "rewards/margins": 4.196308135986328, "rewards/rejected": -5.722747802734375, "step": 1770 }, { "epoch": 0.42706333973128596, "grad_norm": 10.138858801029569, "learning_rate": 3.540441788930673e-07, "logits/chosen": -1.3410276174545288, "logits/rejected": -1.414222002029419, "logps/chosen": -491.10986328125, "logps/rejected": -747.819091796875, "loss": 0.4169, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9301306009292603, "rewards/margins": 2.9560461044311523, "rewards/rejected": -4.886176109313965, "step": 1780 }, { "epoch": 0.42946257197696736, "grad_norm": 14.446911222851618, "learning_rate": 3.5213666027649123e-07, "logits/chosen": -1.3879473209381104, "logits/rejected": -1.5012261867523193, "logps/chosen": -495.0065002441406, "logps/rejected": -607.099853515625, "loss": 0.4593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.185396194458008, "rewards/margins": 1.5175437927246094, "rewards/rejected": -3.702939510345459, "step": 1790 }, { "epoch": 0.43186180422264875, "grad_norm": 15.239290825165414, "learning_rate": 3.5022197717898017e-07, "logits/chosen": -1.2657089233398438, "logits/rejected": -1.4074336290359497, "logps/chosen": -394.63385009765625, "logps/rejected": -730.5569458007812, "loss": 0.3917, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7406047582626343, "rewards/margins": 3.7480709552764893, "rewards/rejected": -5.488675594329834, "step": 1800 }, { "epoch": 0.43426103646833014, "grad_norm": 18.539564378032264, "learning_rate": 3.4830026390794633e-07, "logits/chosen": -1.373583436012268, "logits/rejected": -1.449741244316101, "logps/chosen": -525.2714233398438, "logps/rejected": -705.874755859375, "loss": 0.4023, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3716514110565186, "rewards/margins": 2.285885810852051, "rewards/rejected": -4.657536506652832, "step": 1810 }, { "epoch": 0.43666026871401153, "grad_norm": 15.392536337629252, "learning_rate": 3.4637165526394104e-07, "logits/chosen": -1.4993913173675537, "logits/rejected": -1.5507056713104248, "logps/chosen": -438.17315673828125, "logps/rejected": -669.3710327148438, "loss": 0.4354, "rewards/accuracies": 0.875, "rewards/chosen": -2.0398783683776855, "rewards/margins": 2.2368016242980957, "rewards/rejected": -4.276679515838623, "step": 1820 }, { "epoch": 0.43905950095969287, "grad_norm": 10.083282846912516, "learning_rate": 3.4443628653119814e-07, "logits/chosen": -1.2605036497116089, "logits/rejected": -1.272882103919983, "logps/chosen": -459.35870361328125, "logps/rejected": -793.3179931640625, "loss": 0.4781, "rewards/accuracies": 0.875, "rewards/chosen": -1.9037319421768188, "rewards/margins": 2.950524091720581, "rewards/rejected": -4.854256629943848, "step": 1830 }, { "epoch": 0.44145873320537427, "grad_norm": 17.777066285729536, "learning_rate": 3.424942934681453e-07, "logits/chosen": -1.2640819549560547, "logits/rejected": -1.4133803844451904, "logps/chosen": -365.7422790527344, "logps/rejected": -581.9953002929688, "loss": 0.4291, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3039973974227905, "rewards/margins": 2.1755900382995605, "rewards/rejected": -3.4795870780944824, "step": 1840 }, { "epoch": 0.44385796545105566, "grad_norm": 21.190089730860404, "learning_rate": 3.405458122978804e-07, "logits/chosen": -1.2760121822357178, "logits/rejected": -1.3037431240081787, "logps/chosen": -427.647216796875, "logps/rejected": -589.5700073242188, "loss": 0.4056, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.554579496383667, "rewards/margins": 1.9800221920013428, "rewards/rejected": -3.5346016883850098, "step": 1850 }, { "epoch": 0.44625719769673705, "grad_norm": 19.52879089362984, "learning_rate": 3.3859097969861633e-07, "logits/chosen": -1.2224397659301758, "logits/rejected": -1.2577435970306396, "logps/chosen": -464.634033203125, "logps/rejected": -659.4881591796875, "loss": 0.4462, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7432029247283936, "rewards/margins": 2.287348508834839, "rewards/rejected": -4.030551433563232, "step": 1860 }, { "epoch": 0.44865642994241844, "grad_norm": 15.71172140020582, "learning_rate": 3.366299327940936e-07, "logits/chosen": -1.254504680633545, "logits/rejected": -1.188957929611206, "logps/chosen": -473.3033142089844, "logps/rejected": -726.52392578125, "loss": 0.4147, "rewards/accuracies": 0.875, "rewards/chosen": -1.8812453746795654, "rewards/margins": 2.339358329772949, "rewards/rejected": -4.2206034660339355, "step": 1870 }, { "epoch": 0.4510556621880998, "grad_norm": 13.22641361891775, "learning_rate": 3.3466280914396117e-07, "logits/chosen": -1.2382128238677979, "logits/rejected": -1.2535500526428223, "logps/chosen": -443.01007080078125, "logps/rejected": -681.6619262695312, "loss": 0.4319, "rewards/accuracies": 0.75, "rewards/chosen": -2.062727212905884, "rewards/margins": 2.200326919555664, "rewards/rejected": -4.2630534172058105, "step": 1880 }, { "epoch": 0.4534548944337812, "grad_norm": 13.21370692192793, "learning_rate": 3.326897467341281e-07, "logits/chosen": -1.234937310218811, "logits/rejected": -1.3355581760406494, "logps/chosen": -445.6819763183594, "logps/rejected": -728.594970703125, "loss": 0.4254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.3737926483154297, "rewards/margins": 2.7364139556884766, "rewards/rejected": -5.110206127166748, "step": 1890 }, { "epoch": 0.45585412667946257, "grad_norm": 13.78564096806611, "learning_rate": 3.3071088396708335e-07, "logits/chosen": -1.2770905494689941, "logits/rejected": -1.240928292274475, "logps/chosen": -391.79736328125, "logps/rejected": -723.361083984375, "loss": 0.4619, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.881150484085083, "rewards/margins": 3.0526633262634277, "rewards/rejected": -4.93381404876709, "step": 1900 }, { "epoch": 0.45825335892514396, "grad_norm": 13.853814670591044, "learning_rate": 3.2872635965218824e-07, "logits/chosen": -1.2004590034484863, "logits/rejected": -1.2370150089263916, "logps/chosen": -512.8619995117188, "logps/rejected": -702.7752685546875, "loss": 0.4878, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5109477043151855, "rewards/margins": 1.8365228176116943, "rewards/rejected": -4.347470283508301, "step": 1910 }, { "epoch": 0.46065259117082535, "grad_norm": 10.692863270526209, "learning_rate": 3.2673631299593905e-07, "logits/chosen": -1.173227310180664, "logits/rejected": -1.3409078121185303, "logps/chosen": -474.7144470214844, "logps/rejected": -680.1087036132812, "loss": 0.4399, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.028782606124878, "rewards/margins": 2.1353182792663574, "rewards/rejected": -4.164100646972656, "step": 1920 }, { "epoch": 0.4630518234165067, "grad_norm": 14.233116791502368, "learning_rate": 3.247408835922024e-07, "logits/chosen": -1.320565104484558, "logits/rejected": -1.2968575954437256, "logps/chosen": -573.8089599609375, "logps/rejected": -786.044189453125, "loss": 0.4368, "rewards/accuracies": 0.75, "rewards/chosen": -2.6720101833343506, "rewards/margins": 2.062215566635132, "rewards/rejected": -4.734226226806641, "step": 1930 }, { "epoch": 0.4654510556621881, "grad_norm": 15.685451696438584, "learning_rate": 3.2274021141242306e-07, "logits/chosen": -1.255118489265442, "logits/rejected": -1.320111870765686, "logps/chosen": -484.41583251953125, "logps/rejected": -714.25244140625, "loss": 0.4476, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.30792236328125, "rewards/margins": 2.1702935695648193, "rewards/rejected": -4.47821569442749, "step": 1940 }, { "epoch": 0.4678502879078695, "grad_norm": 16.009709587203158, "learning_rate": 3.2073443679580613e-07, "logits/chosen": -1.112594723701477, "logits/rejected": -1.2054228782653809, "logps/chosen": -423.2367248535156, "logps/rejected": -543.6708984375, "loss": 0.4427, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6221225261688232, "rewards/margins": 1.2181508541107178, "rewards/rejected": -2.840273380279541, "step": 1950 }, { "epoch": 0.47024952015355087, "grad_norm": 11.690186448497471, "learning_rate": 3.1872370043947194e-07, "logits/chosen": -1.2797791957855225, "logits/rejected": -1.3246345520019531, "logps/chosen": -414.15234375, "logps/rejected": -695.158203125, "loss": 0.4033, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4832648038864136, "rewards/margins": 2.9206230640411377, "rewards/rejected": -4.40388822555542, "step": 1960 }, { "epoch": 0.47264875239923226, "grad_norm": 20.104770187290267, "learning_rate": 3.167081433885874e-07, "logits/chosen": -1.030788779258728, "logits/rejected": -1.1015684604644775, "logps/chosen": -513.7273559570312, "logps/rejected": -811.976318359375, "loss": 0.3855, "rewards/accuracies": 0.75, "rewards/chosen": -2.102285623550415, "rewards/margins": 2.593528985977173, "rewards/rejected": -4.695814609527588, "step": 1970 }, { "epoch": 0.4750479846449136, "grad_norm": 22.35702689198335, "learning_rate": 3.14687907026472e-07, "logits/chosen": -1.1791191101074219, "logits/rejected": -1.2959524393081665, "logps/chosen": -441.61846923828125, "logps/rejected": -704.8765869140625, "loss": 0.409, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.2501838207244873, "rewards/margins": 2.3681230545043945, "rewards/rejected": -4.618307113647461, "step": 1980 }, { "epoch": 0.477447216890595, "grad_norm": 16.66080453274856, "learning_rate": 3.126631330646801e-07, "logits/chosen": -1.2385426759719849, "logits/rejected": -1.3393757343292236, "logps/chosen": -572.3016357421875, "logps/rejected": -789.0521850585938, "loss": 0.4564, "rewards/accuracies": 0.75, "rewards/chosen": -2.771270275115967, "rewards/margins": 2.124401807785034, "rewards/rejected": -4.895671844482422, "step": 1990 }, { "epoch": 0.4798464491362764, "grad_norm": 13.605651174190195, "learning_rate": 3.1063396353306097e-07, "logits/chosen": -1.2395048141479492, "logits/rejected": -1.3828445672988892, "logps/chosen": -438.51312255859375, "logps/rejected": -621.2369384765625, "loss": 0.4486, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7979936599731445, "rewards/margins": 2.1893625259399414, "rewards/rejected": -3.987356185913086, "step": 2000 }, { "epoch": 0.4822456813819578, "grad_norm": 13.924076058423626, "learning_rate": 3.0860054076979535e-07, "logits/chosen": -1.2428590059280396, "logits/rejected": -1.236566185951233, "logps/chosen": -467.281005859375, "logps/rejected": -621.5033569335938, "loss": 0.4229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.916424036026001, "rewards/margins": 1.8854621648788452, "rewards/rejected": -3.8018863201141357, "step": 2010 }, { "epoch": 0.4846449136276392, "grad_norm": 10.659549298550333, "learning_rate": 3.065630074114115e-07, "logits/chosen": -1.2107280492782593, "logits/rejected": -1.2906858921051025, "logps/chosen": -437.7901916503906, "logps/rejected": -730.5772705078125, "loss": 0.4458, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5543664693832397, "rewards/margins": 3.3551418781280518, "rewards/rejected": -4.90950870513916, "step": 2020 }, { "epoch": 0.4870441458733205, "grad_norm": 11.89640175081092, "learning_rate": 3.0452150638277947e-07, "logits/chosen": -1.0864282846450806, "logits/rejected": -1.0442813634872437, "logps/chosen": -426.6461486816406, "logps/rejected": -630.3248291015625, "loss": 0.4587, "rewards/accuracies": 0.75, "rewards/chosen": -2.0740158557891846, "rewards/margins": 1.9162708520889282, "rewards/rejected": -3.9902865886688232, "step": 2030 }, { "epoch": 0.4894433781190019, "grad_norm": 11.776367189768084, "learning_rate": 3.024761808870856e-07, "logits/chosen": -1.307191014289856, "logits/rejected": -1.312293291091919, "logps/chosen": -370.92791748046875, "logps/rejected": -650.8178100585938, "loss": 0.3818, "rewards/accuracies": 0.875, "rewards/chosen": -1.276861310005188, "rewards/margins": 2.857835531234741, "rewards/rejected": -4.1346964836120605, "step": 2040 }, { "epoch": 0.4918426103646833, "grad_norm": 25.099913631834486, "learning_rate": 3.004271743957875e-07, "logits/chosen": -1.1278274059295654, "logits/rejected": -1.1356306076049805, "logps/chosen": -495.9942321777344, "logps/rejected": -648.2416381835938, "loss": 0.4424, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.38657808303833, "rewards/margins": 1.207850694656372, "rewards/rejected": -3.594428539276123, "step": 2050 }, { "epoch": 0.4942418426103647, "grad_norm": 12.705340632883466, "learning_rate": 2.983746306385499e-07, "logits/chosen": -1.3075058460235596, "logits/rejected": -1.2933059930801392, "logps/chosen": -434.21820068359375, "logps/rejected": -625.4400024414062, "loss": 0.4253, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8787765502929688, "rewards/margins": 1.7934147119522095, "rewards/rejected": -3.6721911430358887, "step": 2060 }, { "epoch": 0.4966410748560461, "grad_norm": 13.755685476664564, "learning_rate": 2.963186935931628e-07, "logits/chosen": -1.215529203414917, "logits/rejected": -1.2133440971374512, "logps/chosen": -403.08465576171875, "logps/rejected": -577.1477661132812, "loss": 0.3959, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.499050259590149, "rewards/margins": 1.7851158380508423, "rewards/rejected": -3.284165859222412, "step": 2070 }, { "epoch": 0.4990403071017274, "grad_norm": 13.292089170044676, "learning_rate": 2.9425950747544176e-07, "logits/chosen": -1.1674937009811401, "logits/rejected": -1.3074105978012085, "logps/chosen": -525.9224243164062, "logps/rejected": -763.0833740234375, "loss": 0.423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.349907398223877, "rewards/margins": 2.648500919342041, "rewards/rejected": -4.998408317565918, "step": 2080 }, { "epoch": 0.5014395393474088, "grad_norm": 18.455895560701023, "learning_rate": 2.921972167291119e-07, "logits/chosen": -1.1788341999053955, "logits/rejected": -1.2539576292037964, "logps/chosen": -472.1956481933594, "logps/rejected": -670.5008544921875, "loss": 0.4338, "rewards/accuracies": 0.75, "rewards/chosen": -1.9172461032867432, "rewards/margins": 2.0367343425750732, "rewards/rejected": -3.9539802074432373, "step": 2090 }, { "epoch": 0.5038387715930902, "grad_norm": 13.471771310006998, "learning_rate": 2.9013196601567567e-07, "logits/chosen": -1.1498690843582153, "logits/rejected": -1.1755589246749878, "logps/chosen": -407.24163818359375, "logps/rejected": -575.2977905273438, "loss": 0.5056, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5899887084960938, "rewards/margins": 1.5084218978881836, "rewards/rejected": -3.0984106063842773, "step": 2100 }, { "epoch": 0.5062380038387716, "grad_norm": 15.761320703617217, "learning_rate": 2.8806390020426555e-07, "logits/chosen": -1.139108419418335, "logits/rejected": -1.1348917484283447, "logps/chosen": -422.84893798828125, "logps/rejected": -555.7877807617188, "loss": 0.4274, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4848568439483643, "rewards/margins": 1.3276598453521729, "rewards/rejected": -2.812516689300537, "step": 2110 }, { "epoch": 0.508637236084453, "grad_norm": 17.997023645014245, "learning_rate": 2.8599316436148187e-07, "logits/chosen": -1.2534973621368408, "logits/rejected": -1.2950940132141113, "logps/chosen": -413.71490478515625, "logps/rejected": -538.5707397460938, "loss": 0.446, "rewards/accuracies": 0.75, "rewards/chosen": -1.67266047000885, "rewards/margins": 1.2678474187850952, "rewards/rejected": -2.9405078887939453, "step": 2120 }, { "epoch": 0.5110364683301344, "grad_norm": 12.638370779776375, "learning_rate": 2.8391990374121723e-07, "logits/chosen": -1.288747787475586, "logits/rejected": -1.3166449069976807, "logps/chosen": -454.18359375, "logps/rejected": -724.0892333984375, "loss": 0.4079, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.0871493816375732, "rewards/margins": 2.4171881675720215, "rewards/rejected": -4.504337310791016, "step": 2130 }, { "epoch": 0.5134357005758158, "grad_norm": 19.25266010844361, "learning_rate": 2.818442637744669e-07, "logits/chosen": -1.332960844039917, "logits/rejected": -1.3756061792373657, "logps/chosen": -447.2140197753906, "logps/rejected": -657.7930297851562, "loss": 0.4256, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.023087978363037, "rewards/margins": 2.0285849571228027, "rewards/rejected": -4.05167293548584, "step": 2140 }, { "epoch": 0.5158349328214972, "grad_norm": 17.434973094318284, "learning_rate": 2.797663900591284e-07, "logits/chosen": -1.320966124534607, "logits/rejected": -1.4106026887893677, "logps/chosen": -476.49267578125, "logps/rejected": -632.2120361328125, "loss": 0.3841, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.1607255935668945, "rewards/margins": 1.7891597747802734, "rewards/rejected": -3.949885129928589, "step": 2150 }, { "epoch": 0.5182341650671785, "grad_norm": 17.474868196021465, "learning_rate": 2.776864283497874e-07, "logits/chosen": -1.2660505771636963, "logits/rejected": -1.3750221729278564, "logps/chosen": -442.905029296875, "logps/rejected": -755.1177978515625, "loss": 0.4058, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1417970657348633, "rewards/margins": 3.1892342567443848, "rewards/rejected": -5.331031322479248, "step": 2160 }, { "epoch": 0.5206333973128598, "grad_norm": 14.837534970558897, "learning_rate": 2.756045245474943e-07, "logits/chosen": -1.1648900508880615, "logits/rejected": -1.1385093927383423, "logps/chosen": -472.541015625, "logps/rejected": -685.985107421875, "loss": 0.4347, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.109835147857666, "rewards/margins": 2.0320799350738525, "rewards/rejected": -4.1419148445129395, "step": 2170 }, { "epoch": 0.5230326295585412, "grad_norm": 12.732468692827648, "learning_rate": 2.7352082468952977e-07, "logits/chosen": -1.1894464492797852, "logits/rejected": -1.2552679777145386, "logps/chosen": -484.76409912109375, "logps/rejected": -814.096435546875, "loss": 0.4487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.386613130569458, "rewards/margins": 3.1459081172943115, "rewards/rejected": -5.532520771026611, "step": 2180 }, { "epoch": 0.5254318618042226, "grad_norm": 15.991158601320864, "learning_rate": 2.7143547493916e-07, "logits/chosen": -1.2773798704147339, "logits/rejected": -1.2588646411895752, "logps/chosen": -438.006591796875, "logps/rejected": -779.8575439453125, "loss": 0.4461, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8076789379119873, "rewards/margins": 3.383274793624878, "rewards/rejected": -5.190953254699707, "step": 2190 }, { "epoch": 0.527831094049904, "grad_norm": 15.0413189840848, "learning_rate": 2.693486215753853e-07, "logits/chosen": -1.2713805437088013, "logits/rejected": -1.2931923866271973, "logps/chosen": -474.3173828125, "logps/rejected": -771.8619384765625, "loss": 0.4418, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2440552711486816, "rewards/margins": 3.1775963306427, "rewards/rejected": -5.421651840209961, "step": 2200 }, { "epoch": 0.5302303262955854, "grad_norm": 12.016086648025393, "learning_rate": 2.6726041098267805e-07, "logits/chosen": -1.1932638883590698, "logits/rejected": -1.2385377883911133, "logps/chosen": -468.5335998535156, "logps/rejected": -564.7030029296875, "loss": 0.4914, "rewards/accuracies": 0.625, "rewards/chosen": -1.7891786098480225, "rewards/margins": 1.3260656595230103, "rewards/rejected": -3.1152443885803223, "step": 2210 }, { "epoch": 0.5326295585412668, "grad_norm": 15.752188949724715, "learning_rate": 2.6517098964071507e-07, "logits/chosen": -1.2558810710906982, "logits/rejected": -1.3106260299682617, "logps/chosen": -388.71038818359375, "logps/rejected": -497.370361328125, "loss": 0.4766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3813974857330322, "rewards/margins": 0.8968814015388489, "rewards/rejected": -2.2782788276672363, "step": 2220 }, { "epoch": 0.5350287907869482, "grad_norm": 17.371358487239537, "learning_rate": 2.630805041141023e-07, "logits/chosen": -1.3759686946868896, "logits/rejected": -1.3909227848052979, "logps/chosen": -353.22833251953125, "logps/rejected": -674.2232666015625, "loss": 0.4387, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.275880217552185, "rewards/margins": 2.9993698596954346, "rewards/rejected": -4.275249481201172, "step": 2230 }, { "epoch": 0.5374280230326296, "grad_norm": 15.68257354415993, "learning_rate": 2.609891010420941e-07, "logits/chosen": -1.3854516744613647, "logits/rejected": -1.3968112468719482, "logps/chosen": -449.6177673339844, "logps/rejected": -670.1103515625, "loss": 0.4195, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7667739391326904, "rewards/margins": 2.2145369052886963, "rewards/rejected": -3.981311082839966, "step": 2240 }, { "epoch": 0.539827255278311, "grad_norm": 17.255086501798605, "learning_rate": 2.5889692712830674e-07, "logits/chosen": -1.211531400680542, "logits/rejected": -1.283849835395813, "logps/chosen": -389.5179138183594, "logps/rejected": -602.1033325195312, "loss": 0.3883, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6021864414215088, "rewards/margins": 2.1839447021484375, "rewards/rejected": -3.786130905151367, "step": 2250 }, { "epoch": 0.5422264875239923, "grad_norm": 23.697806480913663, "learning_rate": 2.5680412913042843e-07, "logits/chosen": -1.4293580055236816, "logits/rejected": -1.4167674779891968, "logps/chosen": -482.10028076171875, "logps/rejected": -786.3612060546875, "loss": 0.4341, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2943575382232666, "rewards/margins": 3.070589065551758, "rewards/rejected": -5.3649468421936035, "step": 2260 }, { "epoch": 0.5446257197696737, "grad_norm": 20.5376691734358, "learning_rate": 2.5471085384992404e-07, "logits/chosen": -1.3646225929260254, "logits/rejected": -1.3295977115631104, "logps/chosen": -490.397705078125, "logps/rejected": -880.2039184570312, "loss": 0.4117, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3713932037353516, "rewards/margins": 3.905015468597412, "rewards/rejected": -6.2764081954956055, "step": 2270 }, { "epoch": 0.5470249520153551, "grad_norm": 33.70464819485353, "learning_rate": 2.526172481217381e-07, "logits/chosen": -1.345577597618103, "logits/rejected": -1.318164587020874, "logps/chosen": -443.73492431640625, "logps/rejected": -621.4215698242188, "loss": 0.4513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.3964602947235107, "rewards/margins": 1.5979655981063843, "rewards/rejected": -3.9944260120391846, "step": 2280 }, { "epoch": 0.5494241842610365, "grad_norm": 14.933194013231358, "learning_rate": 2.5052345880399456e-07, "logits/chosen": -1.3429863452911377, "logits/rejected": -1.419154405593872, "logps/chosen": -432.6609802246094, "logps/rejected": -600.9601440429688, "loss": 0.423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0265350341796875, "rewards/margins": 1.6890513896942139, "rewards/rejected": -3.7155869007110596, "step": 2290 }, { "epoch": 0.5518234165067178, "grad_norm": 16.16343931231014, "learning_rate": 2.4842963276769555e-07, "logits/chosen": -1.3177053928375244, "logits/rejected": -1.2920299768447876, "logps/chosen": -412.7960510253906, "logps/rejected": -659.1737060546875, "loss": 0.4355, "rewards/accuracies": 0.875, "rewards/chosen": -1.99508535861969, "rewards/margins": 2.033306360244751, "rewards/rejected": -4.0283918380737305, "step": 2300 }, { "epoch": 0.5542226487523992, "grad_norm": 15.4620970327947, "learning_rate": 2.463359168864189e-07, "logits/chosen": -1.1979784965515137, "logits/rejected": -1.3762614727020264, "logps/chosen": -500.4812927246094, "logps/rejected": -641.75537109375, "loss": 0.4751, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1022884845733643, "rewards/margins": 1.735335111618042, "rewards/rejected": -3.8376235961914062, "step": 2310 }, { "epoch": 0.5566218809980806, "grad_norm": 17.931731909002746, "learning_rate": 2.4424245802601555e-07, "logits/chosen": -1.2692848443984985, "logits/rejected": -1.2660033702850342, "logps/chosen": -370.6279296875, "logps/rejected": -573.5665283203125, "loss": 0.4162, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5542004108428955, "rewards/margins": 1.4760851860046387, "rewards/rejected": -3.0302860736846924, "step": 2320 }, { "epoch": 0.559021113243762, "grad_norm": 17.293463950337944, "learning_rate": 2.421494030343072e-07, "logits/chosen": -1.2170095443725586, "logits/rejected": -1.3737263679504395, "logps/chosen": -454.30224609375, "logps/rejected": -541.3650512695312, "loss": 0.5052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.897562026977539, "rewards/margins": 1.3505643606185913, "rewards/rejected": -3.248126268386841, "step": 2330 }, { "epoch": 0.5614203454894434, "grad_norm": 14.543185750157452, "learning_rate": 2.400568987307861e-07, "logits/chosen": -1.2438604831695557, "logits/rejected": -1.3291311264038086, "logps/chosen": -413.92144775390625, "logps/rejected": -483.96240234375, "loss": 0.3981, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.790085792541504, "rewards/margins": 0.866573691368103, "rewards/rejected": -2.6566593647003174, "step": 2340 }, { "epoch": 0.5638195777351248, "grad_norm": 12.815153330284657, "learning_rate": 2.379650918963156e-07, "logits/chosen": -1.3083336353302002, "logits/rejected": -1.3296959400177002, "logps/chosen": -408.14068603515625, "logps/rejected": -637.8969116210938, "loss": 0.4107, "rewards/accuracies": 0.875, "rewards/chosen": -2.137500524520874, "rewards/margins": 2.198113203048706, "rewards/rejected": -4.33561372756958, "step": 2350 }, { "epoch": 0.5662188099808061, "grad_norm": 19.62558947066615, "learning_rate": 2.3587412926283438e-07, "logits/chosen": -1.3259559869766235, "logits/rejected": -1.3641198873519897, "logps/chosen": -511.7887268066406, "logps/rejected": -747.3419189453125, "loss": 0.4316, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.02677845954895, "rewards/margins": 2.806626796722412, "rewards/rejected": -4.833405017852783, "step": 2360 }, { "epoch": 0.5686180422264875, "grad_norm": 21.00689689996569, "learning_rate": 2.337841575030642e-07, "logits/chosen": -1.1748692989349365, "logits/rejected": -1.220413327217102, "logps/chosen": -463.1639099121094, "logps/rejected": -683.6637573242188, "loss": 0.3981, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8594157695770264, "rewards/margins": 2.1025776863098145, "rewards/rejected": -3.96199369430542, "step": 2370 }, { "epoch": 0.5710172744721689, "grad_norm": 16.379604246595516, "learning_rate": 2.316953232202206e-07, "logits/chosen": -1.2951033115386963, "logits/rejected": -1.4975831508636475, "logps/chosen": -450.65155029296875, "logps/rejected": -539.8964233398438, "loss": 0.4165, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0984275341033936, "rewards/margins": 1.536707878112793, "rewards/rejected": -3.6351349353790283, "step": 2380 }, { "epoch": 0.5734165067178503, "grad_norm": 13.015590564989903, "learning_rate": 2.2960777293772958e-07, "logits/chosen": -1.2712581157684326, "logits/rejected": -1.3997230529785156, "logps/chosen": -406.917236328125, "logps/rejected": -671.5166625976562, "loss": 0.4366, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.939073920249939, "rewards/margins": 2.8481147289276123, "rewards/rejected": -4.787188529968262, "step": 2390 }, { "epoch": 0.5758157389635317, "grad_norm": 13.201413526903568, "learning_rate": 2.2752165308894974e-07, "logits/chosen": -1.2536894083023071, "logits/rejected": -1.2781312465667725, "logps/chosen": -368.1941833496094, "logps/rejected": -593.6776123046875, "loss": 0.4295, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6818746328353882, "rewards/margins": 2.3582541942596436, "rewards/rejected": -4.040129661560059, "step": 2400 }, { "epoch": 0.5782149712092131, "grad_norm": 12.990669417950757, "learning_rate": 2.254371100069005e-07, "logits/chosen": -1.1515527963638306, "logits/rejected": -1.1078431606292725, "logps/chosen": -375.81732177734375, "logps/rejected": -583.7494506835938, "loss": 0.3986, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.465872049331665, "rewards/margins": 1.888287901878357, "rewards/rejected": -3.3541598320007324, "step": 2410 }, { "epoch": 0.5806142034548945, "grad_norm": 17.818623986164003, "learning_rate": 2.2335428991399725e-07, "logits/chosen": -1.242234468460083, "logits/rejected": -1.2769407033920288, "logps/chosen": -519.7703857421875, "logps/rejected": -948.0362548828125, "loss": 0.403, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.072312116622925, "rewards/margins": 4.26052188873291, "rewards/rejected": -7.332834720611572, "step": 2420 }, { "epoch": 0.5830134357005758, "grad_norm": 14.48204764987916, "learning_rate": 2.2127333891179458e-07, "logits/chosen": -1.3385140895843506, "logits/rejected": -1.3878307342529297, "logps/chosen": -430.9832458496094, "logps/rejected": -745.4675903320312, "loss": 0.4398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.236307144165039, "rewards/margins": 2.929969310760498, "rewards/rejected": -5.166275978088379, "step": 2430 }, { "epoch": 0.5854126679462572, "grad_norm": 27.82308149849405, "learning_rate": 2.1919440297073782e-07, "logits/chosen": -1.271857500076294, "logits/rejected": -1.3383657932281494, "logps/chosen": -462.377685546875, "logps/rejected": -775.3133544921875, "loss": 0.4672, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.4519903659820557, "rewards/margins": 3.0875110626220703, "rewards/rejected": -5.539501190185547, "step": 2440 }, { "epoch": 0.5878119001919386, "grad_norm": 13.320769149199382, "learning_rate": 2.1711762791992368e-07, "logits/chosen": -1.2566578388214111, "logits/rejected": -1.29204261302948, "logps/chosen": -504.62371826171875, "logps/rejected": -647.8597412109375, "loss": 0.4486, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.186826705932617, "rewards/margins": 1.7466872930526733, "rewards/rejected": -3.9335131645202637, "step": 2450 }, { "epoch": 0.5902111324376199, "grad_norm": 15.89239004487952, "learning_rate": 2.1504315943687114e-07, "logits/chosen": -1.135602355003357, "logits/rejected": -1.110710859298706, "logps/chosen": -410.08441162109375, "logps/rejected": -693.5352783203125, "loss": 0.4104, "rewards/accuracies": 0.875, "rewards/chosen": -1.8059812784194946, "rewards/margins": 2.415012836456299, "rewards/rejected": -4.220993995666504, "step": 2460 }, { "epoch": 0.5926103646833013, "grad_norm": 19.370614206098328, "learning_rate": 2.1297114303730248e-07, "logits/chosen": -1.1299896240234375, "logits/rejected": -1.0449109077453613, "logps/chosen": -411.85040283203125, "logps/rejected": -708.7200927734375, "loss": 0.4912, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.996572494506836, "rewards/margins": 2.44885516166687, "rewards/rejected": -4.445427894592285, "step": 2470 }, { "epoch": 0.5950095969289827, "grad_norm": 16.656657262376168, "learning_rate": 2.1090172406493616e-07, "logits/chosen": -1.0786526203155518, "logits/rejected": -1.0406323671340942, "logps/chosen": -351.46417236328125, "logps/rejected": -575.1397705078125, "loss": 0.3825, "rewards/accuracies": 0.875, "rewards/chosen": -1.2345200777053833, "rewards/margins": 2.170752763748169, "rewards/rejected": -3.4052727222442627, "step": 2480 }, { "epoch": 0.5974088291746641, "grad_norm": 19.934123908947278, "learning_rate": 2.0883504768129146e-07, "logits/chosen": -1.242959976196289, "logits/rejected": -1.2493782043457031, "logps/chosen": -472.5721130371094, "logps/rejected": -716.5457763671875, "loss": 0.431, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.007760763168335, "rewards/margins": 2.496676206588745, "rewards/rejected": -4.504437446594238, "step": 2490 }, { "epoch": 0.5998080614203455, "grad_norm": 15.746281719328787, "learning_rate": 2.0677125885550571e-07, "logits/chosen": -1.1213797330856323, "logits/rejected": -1.3027660846710205, "logps/chosen": -431.63641357421875, "logps/rejected": -593.3057861328125, "loss": 0.4323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9971039295196533, "rewards/margins": 1.9194910526275635, "rewards/rejected": -3.9165947437286377, "step": 2500 }, { "epoch": 0.6022072936660269, "grad_norm": 23.759422926525374, "learning_rate": 2.0471050235416587e-07, "logits/chosen": -1.0919904708862305, "logits/rejected": -1.2857704162597656, "logps/chosen": -511.79132080078125, "logps/rejected": -708.4951171875, "loss": 0.3853, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.4641222953796387, "rewards/margins": 2.413705348968506, "rewards/rejected": -4.8778276443481445, "step": 2510 }, { "epoch": 0.6046065259117083, "grad_norm": 25.281497673378116, "learning_rate": 2.026529227311532e-07, "logits/chosen": -1.229898452758789, "logits/rejected": -1.2374264001846313, "logps/chosen": -431.0499572753906, "logps/rejected": -689.706298828125, "loss": 0.468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.1411490440368652, "rewards/margins": 2.5051817893981934, "rewards/rejected": -4.6463303565979, "step": 2520 }, { "epoch": 0.6070057581573897, "grad_norm": 14.693269652927464, "learning_rate": 2.005986643175036e-07, "logits/chosen": -1.187809944152832, "logits/rejected": -1.1687113046646118, "logps/chosen": -414.11407470703125, "logps/rejected": -739.9503173828125, "loss": 0.3685, "rewards/accuracies": 0.875, "rewards/chosen": -1.609575867652893, "rewards/margins": 3.377927780151367, "rewards/rejected": -4.9875030517578125, "step": 2530 }, { "epoch": 0.6094049904030711, "grad_norm": 18.59133617851427, "learning_rate": 1.9854787121128328e-07, "logits/chosen": -1.203471302986145, "logits/rejected": -1.3611973524093628, "logps/chosen": -412.1192321777344, "logps/rejected": -546.4465942382812, "loss": 0.4775, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.861853837966919, "rewards/margins": 1.7769527435302734, "rewards/rejected": -3.6388065814971924, "step": 2540 }, { "epoch": 0.6118042226487524, "grad_norm": 13.920438386133542, "learning_rate": 1.9650068726748106e-07, "logits/chosen": -1.1817299127578735, "logits/rejected": -1.3077366352081299, "logps/chosen": -465.7212829589844, "logps/rejected": -663.9733276367188, "loss": 0.4559, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.955004334449768, "rewards/margins": 2.0206496715545654, "rewards/rejected": -3.975654125213623, "step": 2550 }, { "epoch": 0.6142034548944337, "grad_norm": 13.161703470683147, "learning_rate": 1.9445725608791718e-07, "logits/chosen": -1.1682353019714355, "logits/rejected": -1.2265560626983643, "logps/chosen": -466.87713623046875, "logps/rejected": -877.6561279296875, "loss": 0.4242, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.123385190963745, "rewards/margins": 4.002751350402832, "rewards/rejected": -6.126136779785156, "step": 2560 }, { "epoch": 0.6166026871401151, "grad_norm": 15.36104260148359, "learning_rate": 1.924177210111705e-07, "logits/chosen": -1.2761640548706055, "logits/rejected": -1.3670276403427124, "logps/chosen": -424.89581298828125, "logps/rejected": -750.1470947265625, "loss": 0.4409, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8831450939178467, "rewards/margins": 3.1273887157440186, "rewards/rejected": -5.010534286499023, "step": 2570 }, { "epoch": 0.6190019193857965, "grad_norm": 10.821617135187617, "learning_rate": 1.9038222510252364e-07, "logits/chosen": -1.2255038022994995, "logits/rejected": -1.249638319015503, "logps/chosen": -406.5605773925781, "logps/rejected": -569.8077392578125, "loss": 0.4144, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.5970432758331299, "rewards/margins": 1.7124197483062744, "rewards/rejected": -3.3094630241394043, "step": 2580 }, { "epoch": 0.6214011516314779, "grad_norm": 20.79834881298733, "learning_rate": 1.883509111439277e-07, "logits/chosen": -1.2082470655441284, "logits/rejected": -1.2324997186660767, "logps/chosen": -405.4540710449219, "logps/rejected": -772.509765625, "loss": 0.4118, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.863959550857544, "rewards/margins": 3.0209383964538574, "rewards/rejected": -4.8848981857299805, "step": 2590 }, { "epoch": 0.6238003838771593, "grad_norm": 14.594348842540208, "learning_rate": 1.8632392162398665e-07, "logits/chosen": -1.1460466384887695, "logits/rejected": -1.148008108139038, "logps/chosen": -481.5874938964844, "logps/rejected": -762.729736328125, "loss": 0.3784, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.904550313949585, "rewards/margins": 2.8737847805023193, "rewards/rejected": -4.778334617614746, "step": 2600 }, { "epoch": 0.6261996161228407, "grad_norm": 18.405220413400123, "learning_rate": 1.84301398727962e-07, "logits/chosen": -1.2919515371322632, "logits/rejected": -1.226064682006836, "logps/chosen": -384.24078369140625, "logps/rejected": -750.8485107421875, "loss": 0.4252, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.0034337043762207, "rewards/margins": 3.35615611076355, "rewards/rejected": -5.359589576721191, "step": 2610 }, { "epoch": 0.6285988483685221, "grad_norm": 20.54137342064033, "learning_rate": 1.8228348432779966e-07, "logits/chosen": -1.2917633056640625, "logits/rejected": -1.313946008682251, "logps/chosen": -463.4065856933594, "logps/rejected": -708.0303955078125, "loss": 0.4268, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.310145854949951, "rewards/margins": 2.4632420539855957, "rewards/rejected": -4.773387432098389, "step": 2620 }, { "epoch": 0.6309980806142035, "grad_norm": 12.212262130926057, "learning_rate": 1.8027031997217773e-07, "logits/chosen": -1.3770760297775269, "logits/rejected": -1.3869761228561401, "logps/chosen": -496.34136962890625, "logps/rejected": -1004.2086791992188, "loss": 0.3796, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.659231662750244, "rewards/margins": 4.979175090789795, "rewards/rejected": -7.638407230377197, "step": 2630 }, { "epoch": 0.6333973128598849, "grad_norm": 16.20411120653905, "learning_rate": 1.7826204687657758e-07, "logits/chosen": -1.1182453632354736, "logits/rejected": -1.1282155513763428, "logps/chosen": -475.62335205078125, "logps/rejected": -599.8681030273438, "loss": 0.4083, "rewards/accuracies": 0.75, "rewards/chosen": -1.9853700399398804, "rewards/margins": 1.6344798803329468, "rewards/rejected": -3.619849681854248, "step": 2640 }, { "epoch": 0.6357965451055663, "grad_norm": 22.936528003077683, "learning_rate": 1.762588059133781e-07, "logits/chosen": -1.1725223064422607, "logits/rejected": -1.3193824291229248, "logps/chosen": -507.3565368652344, "logps/rejected": -717.995849609375, "loss": 0.4443, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.1068882942199707, "rewards/margins": 2.464982509613037, "rewards/rejected": -4.57187032699585, "step": 2650 }, { "epoch": 0.6381957773512476, "grad_norm": 18.6077043898828, "learning_rate": 1.7426073760197406e-07, "logits/chosen": -1.1054435968399048, "logits/rejected": -1.0801939964294434, "logps/chosen": -478.20074462890625, "logps/rejected": -859.6195068359375, "loss": 0.4259, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2413601875305176, "rewards/margins": 3.528954029083252, "rewards/rejected": -5.7703142166137695, "step": 2660 }, { "epoch": 0.6405950095969289, "grad_norm": 14.701850577255247, "learning_rate": 1.7226798209891935e-07, "logits/chosen": -1.1424671411514282, "logits/rejected": -1.3594285249710083, "logps/chosen": -486.80755615234375, "logps/rejected": -663.4140625, "loss": 0.3759, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.2926859855651855, "rewards/margins": 2.290337085723877, "rewards/rejected": -4.583022594451904, "step": 2670 }, { "epoch": 0.6429942418426103, "grad_norm": 23.82001052972649, "learning_rate": 1.7028067918809535e-07, "logits/chosen": -1.220568299293518, "logits/rejected": -1.2531940937042236, "logps/chosen": -426.9730529785156, "logps/rejected": -829.8860473632812, "loss": 0.4311, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0204787254333496, "rewards/margins": 3.661693572998047, "rewards/rejected": -5.6821722984313965, "step": 2680 }, { "epoch": 0.6453934740882917, "grad_norm": 20.92197517678509, "learning_rate": 1.6829896827090584e-07, "logits/chosen": -1.3699915409088135, "logits/rejected": -1.415359377861023, "logps/chosen": -504.123291015625, "logps/rejected": -585.438232421875, "loss": 0.4529, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.390109062194824, "rewards/margins": 1.1103547811508179, "rewards/rejected": -3.5004639625549316, "step": 2690 }, { "epoch": 0.6477927063339731, "grad_norm": 11.90165561763133, "learning_rate": 1.6632298835649844e-07, "logits/chosen": -1.2418677806854248, "logits/rejected": -1.2179887294769287, "logps/chosen": -498.97607421875, "logps/rejected": -764.77587890625, "loss": 0.3979, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3297247886657715, "rewards/margins": 2.4172799587249756, "rewards/rejected": -4.747004508972168, "step": 2700 }, { "epoch": 0.6501919385796545, "grad_norm": 13.166436756002133, "learning_rate": 1.6435287805201364e-07, "logits/chosen": -1.3559385538101196, "logits/rejected": -1.3355977535247803, "logps/chosen": -489.180419921875, "logps/rejected": -652.1635131835938, "loss": 0.4155, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2472357749938965, "rewards/margins": 1.647270917892456, "rewards/rejected": -3.8945069313049316, "step": 2710 }, { "epoch": 0.6525911708253359, "grad_norm": 19.18769450791739, "learning_rate": 1.6238877555286207e-07, "logits/chosen": -1.3151136636734009, "logits/rejected": -1.3525760173797607, "logps/chosen": -442.0126953125, "logps/rejected": -712.2786865234375, "loss": 0.3659, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7712196111679077, "rewards/margins": 2.6433427333831787, "rewards/rejected": -4.414562702178955, "step": 2720 }, { "epoch": 0.6549904030710173, "grad_norm": 16.006816349741847, "learning_rate": 1.60430818633031e-07, "logits/chosen": -1.1541482210159302, "logits/rejected": -1.1810917854309082, "logps/chosen": -428.27117919921875, "logps/rejected": -661.9808959960938, "loss": 0.3728, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8128198385238647, "rewards/margins": 2.3960225582122803, "rewards/rejected": -4.2088422775268555, "step": 2730 }, { "epoch": 0.6573896353166987, "grad_norm": 15.843743360837514, "learning_rate": 1.5847914463541939e-07, "logits/chosen": -1.2659448385238647, "logits/rejected": -1.3333221673965454, "logps/chosen": -380.4425354003906, "logps/rejected": -661.7493896484375, "loss": 0.3829, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8144347667694092, "rewards/margins": 2.581631898880005, "rewards/rejected": -4.396066665649414, "step": 2740 }, { "epoch": 0.6597888675623801, "grad_norm": 12.902781532875105, "learning_rate": 1.5653389046220427e-07, "logits/chosen": -1.202580451965332, "logits/rejected": -1.2428548336029053, "logps/chosen": -393.21392822265625, "logps/rejected": -585.6798095703125, "loss": 0.4472, "rewards/accuracies": 0.75, "rewards/chosen": -1.5873596668243408, "rewards/margins": 1.7750890254974365, "rewards/rejected": -3.3624484539031982, "step": 2750 }, { "epoch": 0.6621880998080614, "grad_norm": 15.497438468614629, "learning_rate": 1.545951925652375e-07, "logits/chosen": -1.1852418184280396, "logits/rejected": -1.3214651346206665, "logps/chosen": -513.821044921875, "logps/rejected": -740.9768676757812, "loss": 0.4159, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.0691330432891846, "rewards/margins": 2.799996852874756, "rewards/rejected": -4.8691301345825195, "step": 2760 }, { "epoch": 0.6645873320537428, "grad_norm": 25.023616546810636, "learning_rate": 1.5266318693647423e-07, "logits/chosen": -1.2369322776794434, "logits/rejected": -1.2678784132003784, "logps/chosen": -495.04425048828125, "logps/rejected": -637.2513427734375, "loss": 0.4159, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1835405826568604, "rewards/margins": 1.4997859001159668, "rewards/rejected": -3.6833267211914062, "step": 2770 }, { "epoch": 0.6669865642994242, "grad_norm": 13.945033201288798, "learning_rate": 1.5073800909843353e-07, "logits/chosen": -1.1865065097808838, "logits/rejected": -1.3370563983917236, "logps/chosen": -465.9248046875, "logps/rejected": -663.2384033203125, "loss": 0.4064, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0080044269561768, "rewards/margins": 2.541308879852295, "rewards/rejected": -4.549313545227051, "step": 2780 }, { "epoch": 0.6693857965451055, "grad_norm": 16.335549570588938, "learning_rate": 1.488197940946922e-07, "logits/chosen": -1.090932846069336, "logits/rejected": -1.1426491737365723, "logps/chosen": -454.6915588378906, "logps/rejected": -645.7820434570312, "loss": 0.3909, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7976535558700562, "rewards/margins": 2.447206735610962, "rewards/rejected": -4.244860649108887, "step": 2790 }, { "epoch": 0.6717850287907869, "grad_norm": 19.689368835083336, "learning_rate": 1.4690867648041167e-07, "logits/chosen": -1.0205479860305786, "logits/rejected": -1.2015823125839233, "logps/chosen": -444.68115234375, "logps/rejected": -664.7821044921875, "loss": 0.4133, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8803367614746094, "rewards/margins": 2.4233222007751465, "rewards/rejected": -4.303658485412598, "step": 2800 }, { "epoch": 0.6741842610364683, "grad_norm": 15.922653221989023, "learning_rate": 1.4500479031289987e-07, "logits/chosen": -1.121628999710083, "logits/rejected": -1.2732969522476196, "logps/chosen": -426.40631103515625, "logps/rejected": -628.9437255859375, "loss": 0.4774, "rewards/accuracies": 0.875, "rewards/chosen": -1.6420685052871704, "rewards/margins": 2.0467495918273926, "rewards/rejected": -3.6888179779052734, "step": 2810 }, { "epoch": 0.6765834932821497, "grad_norm": 10.688957500109868, "learning_rate": 1.4310826914220747e-07, "logits/chosen": -1.1144278049468994, "logits/rejected": -1.2004985809326172, "logps/chosen": -465.1630859375, "logps/rejected": -635.3046264648438, "loss": 0.4528, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7992807626724243, "rewards/margins": 1.7720779180526733, "rewards/rejected": -3.5713589191436768, "step": 2820 }, { "epoch": 0.6789827255278311, "grad_norm": 11.408686793770782, "learning_rate": 1.412192460017597e-07, "logits/chosen": -1.2038311958312988, "logits/rejected": -1.162626028060913, "logps/chosen": -476.5159606933594, "logps/rejected": -705.89990234375, "loss": 0.4191, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.387838840484619, "rewards/margins": 2.2411282062530518, "rewards/rejected": -4.628966331481934, "step": 2830 }, { "epoch": 0.6813819577735125, "grad_norm": 12.118691511541517, "learning_rate": 1.3933785339902504e-07, "logits/chosen": -1.2565038204193115, "logits/rejected": -1.167474389076233, "logps/chosen": -400.8456115722656, "logps/rejected": -641.7380981445312, "loss": 0.4456, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9326709508895874, "rewards/margins": 1.984291672706604, "rewards/rejected": -3.9169623851776123, "step": 2840 }, { "epoch": 0.6837811900191939, "grad_norm": 13.932235400427288, "learning_rate": 1.374642233062197e-07, "logits/chosen": -1.1590187549591064, "logits/rejected": -1.3027136325836182, "logps/chosen": -474.519287109375, "logps/rejected": -681.8683471679688, "loss": 0.4318, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.970078468322754, "rewards/margins": 2.387824535369873, "rewards/rejected": -4.357902526855469, "step": 2850 }, { "epoch": 0.6861804222648752, "grad_norm": 16.723737190217008, "learning_rate": 1.355984871510511e-07, "logits/chosen": -1.1410120725631714, "logits/rejected": -1.1267549991607666, "logps/chosen": -505.823974609375, "logps/rejected": -715.5396118164062, "loss": 0.3923, "rewards/accuracies": 0.875, "rewards/chosen": -2.1921565532684326, "rewards/margins": 2.031604290008545, "rewards/rejected": -4.223761081695557, "step": 2860 }, { "epoch": 0.6885796545105566, "grad_norm": 21.50806215898013, "learning_rate": 1.3374077580749783e-07, "logits/chosen": -1.3120447397232056, "logits/rejected": -1.281110405921936, "logps/chosen": -368.84185791015625, "logps/rejected": -601.4373168945312, "loss": 0.4163, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7995599508285522, "rewards/margins": 2.158607006072998, "rewards/rejected": -3.958167314529419, "step": 2870 }, { "epoch": 0.690978886756238, "grad_norm": 27.638375107613143, "learning_rate": 1.3189121958663024e-07, "logits/chosen": -1.1347416639328003, "logits/rejected": -1.3342390060424805, "logps/chosen": -551.9705810546875, "logps/rejected": -637.0390625, "loss": 0.4529, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7998149394989014, "rewards/margins": 1.0880420207977295, "rewards/rejected": -3.8878567218780518, "step": 2880 }, { "epoch": 0.6933781190019194, "grad_norm": 14.692498350185678, "learning_rate": 1.3004994822746895e-07, "logits/chosen": -1.2893893718719482, "logits/rejected": -1.3418468236923218, "logps/chosen": -442.1363220214844, "logps/rejected": -626.1585083007812, "loss": 0.4302, "rewards/accuracies": 0.75, "rewards/chosen": -1.9635156393051147, "rewards/margins": 1.7742655277252197, "rewards/rejected": -3.737781047821045, "step": 2890 }, { "epoch": 0.6957773512476008, "grad_norm": 13.677081786503933, "learning_rate": 1.2821709088788434e-07, "logits/chosen": -1.0876823663711548, "logits/rejected": -1.1584521532058716, "logps/chosen": -408.7326965332031, "logps/rejected": -630.5333862304688, "loss": 0.4069, "rewards/accuracies": 0.875, "rewards/chosen": -2.062255620956421, "rewards/margins": 2.1946072578430176, "rewards/rejected": -4.256862163543701, "step": 2900 }, { "epoch": 0.6981765834932822, "grad_norm": 15.872787924761283, "learning_rate": 1.2639277613553736e-07, "logits/chosen": -1.3518760204315186, "logits/rejected": -1.3280441761016846, "logps/chosen": -398.5044860839844, "logps/rejected": -600.9732666015625, "loss": 0.4195, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9100825786590576, "rewards/margins": 2.0270023345947266, "rewards/rejected": -3.9370853900909424, "step": 2910 }, { "epoch": 0.7005758157389635, "grad_norm": 13.346767559623201, "learning_rate": 1.2457713193885975e-07, "logits/chosen": -1.1873770952224731, "logits/rejected": -1.1855942010879517, "logps/chosen": -413.6412048339844, "logps/rejected": -724.1486206054688, "loss": 0.3688, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3553316593170166, "rewards/margins": 2.758777379989624, "rewards/rejected": -5.114109516143799, "step": 2920 }, { "epoch": 0.7029750479846449, "grad_norm": 22.448762027485316, "learning_rate": 1.2277028565807838e-07, "logits/chosen": -1.2834995985031128, "logits/rejected": -1.3559261560440063, "logps/chosen": -453.78985595703125, "logps/rejected": -671.7575073242188, "loss": 0.4267, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.049879550933838, "rewards/margins": 2.2892394065856934, "rewards/rejected": -4.339118957519531, "step": 2930 }, { "epoch": 0.7053742802303263, "grad_norm": 16.129648043941792, "learning_rate": 1.209723640362815e-07, "logits/chosen": -1.1959068775177002, "logits/rejected": -1.2405879497528076, "logps/chosen": -505.00726318359375, "logps/rejected": -860.0382690429688, "loss": 0.4607, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3982629776000977, "rewards/margins": 3.6283717155456543, "rewards/rejected": -6.026634693145752, "step": 2940 }, { "epoch": 0.7077735124760077, "grad_norm": 14.433857430526624, "learning_rate": 1.191834931905277e-07, "logits/chosen": -1.1443761587142944, "logits/rejected": -1.1771003007888794, "logps/chosen": -516.5400390625, "logps/rejected": -709.7727661132812, "loss": 0.4379, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.209808826446533, "rewards/margins": 1.9394118785858154, "rewards/rejected": -4.1492204666137695, "step": 2950 }, { "epoch": 0.710172744721689, "grad_norm": 14.063698311506704, "learning_rate": 1.1740379860299988e-07, "logits/chosen": -1.1812469959259033, "logits/rejected": -1.1932779550552368, "logps/chosen": -486.75848388671875, "logps/rejected": -688.697509765625, "loss": 0.4358, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0185728073120117, "rewards/margins": 1.8188155889511108, "rewards/rejected": -3.837387800216675, "step": 2960 }, { "epoch": 0.7125719769673704, "grad_norm": 12.25394441206106, "learning_rate": 1.1563340511220254e-07, "logits/chosen": -1.1238670349121094, "logits/rejected": -1.2288951873779297, "logps/chosen": -479.03082275390625, "logps/rejected": -721.0484008789062, "loss": 0.4231, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9672508239746094, "rewards/margins": 2.5986568927764893, "rewards/rejected": -4.5659074783325195, "step": 2970 }, { "epoch": 0.7149712092130518, "grad_norm": 14.233686065176078, "learning_rate": 1.1387243690420556e-07, "logits/chosen": -1.1306841373443604, "logits/rejected": -1.2058961391448975, "logps/chosen": -530.5966796875, "logps/rejected": -791.8387451171875, "loss": 0.4661, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.097524881362915, "rewards/margins": 2.7319130897521973, "rewards/rejected": -4.829438209533691, "step": 2980 }, { "epoch": 0.7173704414587332, "grad_norm": 19.693131800311516, "learning_rate": 1.1212101750393235e-07, "logits/chosen": -1.255438208580017, "logits/rejected": -1.3434498310089111, "logps/chosen": -474.38916015625, "logps/rejected": -754.1934814453125, "loss": 0.4109, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2069497108459473, "rewards/margins": 2.954502582550049, "rewards/rejected": -5.161452293395996, "step": 2990 }, { "epoch": 0.7197696737044146, "grad_norm": 16.97782338199475, "learning_rate": 1.1037926976649562e-07, "logits/chosen": -1.1937129497528076, "logits/rejected": -1.2495759725570679, "logps/chosen": -488.2049865722656, "logps/rejected": -799.8907470703125, "loss": 0.4527, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2300727367401123, "rewards/margins": 2.9182958602905273, "rewards/rejected": -5.148368835449219, "step": 3000 }, { "epoch": 0.722168905950096, "grad_norm": 18.393491636386905, "learning_rate": 1.0864731586857936e-07, "logits/chosen": -1.1326544284820557, "logits/rejected": -1.2836530208587646, "logps/chosen": -472.63739013671875, "logps/rejected": -724.0669555664062, "loss": 0.3849, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8869655132293701, "rewards/margins": 2.8039302825927734, "rewards/rejected": -4.6908955574035645, "step": 3010 }, { "epoch": 0.7245681381957774, "grad_norm": 20.305303573951132, "learning_rate": 1.0692527729986839e-07, "logits/chosen": -1.117078423500061, "logits/rejected": -1.2159783840179443, "logps/chosen": -495.0431213378906, "logps/rejected": -729.7164306640625, "loss": 0.3795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3929858207702637, "rewards/margins": 2.559861660003662, "rewards/rejected": -4.952847003936768, "step": 3020 }, { "epoch": 0.7269673704414588, "grad_norm": 18.149863459319363, "learning_rate": 1.0521327485452692e-07, "logits/chosen": -1.2258936166763306, "logits/rejected": -1.2951385974884033, "logps/chosen": -507.068359375, "logps/rejected": -806.07421875, "loss": 0.4145, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.737776279449463, "rewards/margins": 3.1096084117889404, "rewards/rejected": -5.847384452819824, "step": 3030 }, { "epoch": 0.7293666026871402, "grad_norm": 22.723618390263812, "learning_rate": 1.0351142862272468e-07, "logits/chosen": -1.1216206550598145, "logits/rejected": -1.2867326736450195, "logps/chosen": -473.12408447265625, "logps/rejected": -896.2477416992188, "loss": 0.4208, "rewards/accuracies": 0.875, "rewards/chosen": -2.6412878036499023, "rewards/margins": 4.341336727142334, "rewards/rejected": -6.982624053955078, "step": 3040 }, { "epoch": 0.7317658349328215, "grad_norm": 19.527492956725595, "learning_rate": 1.0181985798221343e-07, "logits/chosen": -1.1790878772735596, "logits/rejected": -1.1905752420425415, "logps/chosen": -470.6045837402344, "logps/rejected": -782.1622924804688, "loss": 0.4267, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.268146514892578, "rewards/margins": 3.100985527038574, "rewards/rejected": -5.369132041931152, "step": 3050 }, { "epoch": 0.7341650671785028, "grad_norm": 17.486447949626083, "learning_rate": 1.0013868158995329e-07, "logits/chosen": -1.1854205131530762, "logits/rejected": -1.268317461013794, "logps/chosen": -508.628173828125, "logps/rejected": -689.4181518554688, "loss": 0.4241, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.384272575378418, "rewards/margins": 2.0019237995147705, "rewards/rejected": -4.386197090148926, "step": 3060 }, { "epoch": 0.7365642994241842, "grad_norm": 11.876302215183998, "learning_rate": 9.84680173737887e-08, "logits/chosen": -1.2807575464248657, "logits/rejected": -1.3796226978302002, "logps/chosen": -456.07244873046875, "logps/rejected": -617.9744873046875, "loss": 0.4315, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.917088508605957, "rewards/margins": 2.016084909439087, "rewards/rejected": -3.933173418045044, "step": 3070 }, { "epoch": 0.7389635316698656, "grad_norm": 13.157616210751735, "learning_rate": 9.680798252417713e-08, "logits/chosen": -1.3769404888153076, "logits/rejected": -1.4445879459381104, "logps/chosen": -420.9883728027344, "logps/rejected": -629.974853515625, "loss": 0.4059, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0882909297943115, "rewards/margins": 1.72428297996521, "rewards/rejected": -3.8125743865966797, "step": 3080 }, { "epoch": 0.741362763915547, "grad_norm": 15.730225571388548, "learning_rate": 9.515869348596808e-08, "logits/chosen": -1.1365947723388672, "logits/rejected": -1.275075912475586, "logps/chosen": -472.18408203125, "logps/rejected": -672.9304809570312, "loss": 0.4284, "rewards/accuracies": 0.875, "rewards/chosen": -1.8900762796401978, "rewards/margins": 2.209829807281494, "rewards/rejected": -4.099905967712402, "step": 3090 }, { "epoch": 0.7437619961612284, "grad_norm": 35.145335130920884, "learning_rate": 9.352026595023493e-08, "logits/chosen": -1.1994316577911377, "logits/rejected": -1.2499196529388428, "logps/chosen": -472.410888671875, "logps/rejected": -594.9708251953125, "loss": 0.4289, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9588143825531006, "rewards/margins": 1.4960181713104248, "rewards/rejected": -3.4548325538635254, "step": 3100 }, { "epoch": 0.7461612284069098, "grad_norm": 15.811033639076967, "learning_rate": 9.189281484616004e-08, "logits/chosen": -1.2232351303100586, "logits/rejected": -1.2345741987228394, "logps/chosen": -410.38848876953125, "logps/rejected": -693.65673828125, "loss": 0.4377, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.075925588607788, "rewards/margins": 2.4454538822174072, "rewards/rejected": -4.521379470825195, "step": 3110 }, { "epoch": 0.7485604606525912, "grad_norm": 22.73378264594113, "learning_rate": 9.027645433297249e-08, "logits/chosen": -1.103428602218628, "logits/rejected": -1.1875020265579224, "logps/chosen": -571.9773559570312, "logps/rejected": -778.9340209960938, "loss": 0.4763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.662141799926758, "rewards/margins": 2.4210867881774902, "rewards/rejected": -5.083228588104248, "step": 3120 }, { "epoch": 0.7509596928982726, "grad_norm": 25.696268435535774, "learning_rate": 8.867129779194066e-08, "logits/chosen": -1.244616150856018, "logits/rejected": -1.353212594985962, "logps/chosen": -394.8204040527344, "logps/rejected": -692.91357421875, "loss": 0.4442, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8012546300888062, "rewards/margins": 2.963744640350342, "rewards/rejected": -4.764999866485596, "step": 3130 }, { "epoch": 0.753358925143954, "grad_norm": 18.050945946975368, "learning_rate": 8.707745781841866e-08, "logits/chosen": -1.1310231685638428, "logits/rejected": -1.2564256191253662, "logps/chosen": -449.65362548828125, "logps/rejected": -771.5709838867188, "loss": 0.4261, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.2246077060699463, "rewards/margins": 3.2700328826904297, "rewards/rejected": -5.494640350341797, "step": 3140 }, { "epoch": 0.7557581573896354, "grad_norm": 9.799407812333103, "learning_rate": 8.549504621394831e-08, "logits/chosen": -1.2643756866455078, "logits/rejected": -1.2907123565673828, "logps/chosen": -408.44683837890625, "logps/rejected": -748.5787353515625, "loss": 0.3526, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.775541067123413, "rewards/margins": 3.4335663318634033, "rewards/rejected": -5.209107398986816, "step": 3150 }, { "epoch": 0.7581573896353166, "grad_norm": 19.977109925350412, "learning_rate": 8.392417397841703e-08, "logits/chosen": -1.216590166091919, "logits/rejected": -1.3006147146224976, "logps/chosen": -467.8597717285156, "logps/rejected": -650.8781127929688, "loss": 0.4394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0549581050872803, "rewards/margins": 1.685307502746582, "rewards/rejected": -3.7402656078338623, "step": 3160 }, { "epoch": 0.760556621880998, "grad_norm": 17.465911639708498, "learning_rate": 8.236495130227083e-08, "logits/chosen": -1.2152674198150635, "logits/rejected": -1.3853117227554321, "logps/chosen": -531.36865234375, "logps/rejected": -838.9884033203125, "loss": 0.4623, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.4217429161071777, "rewards/margins": 3.4808337688446045, "rewards/rejected": -5.9025774002075195, "step": 3170 }, { "epoch": 0.7629558541266794, "grad_norm": 18.484754479900506, "learning_rate": 8.081748755878612e-08, "logits/chosen": -1.2535191774368286, "logits/rejected": -1.3943572044372559, "logps/chosen": -503.76898193359375, "logps/rejected": -699.9238891601562, "loss": 0.4074, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2457404136657715, "rewards/margins": 2.4922969341278076, "rewards/rejected": -4.738037109375, "step": 3180 }, { "epoch": 0.7653550863723608, "grad_norm": 13.480118265934307, "learning_rate": 7.928189129639632e-08, "logits/chosen": -1.1623866558074951, "logits/rejected": -1.1990493535995483, "logps/chosen": -440.8330993652344, "logps/rejected": -671.6934814453125, "loss": 0.4079, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2158052921295166, "rewards/margins": 2.1327455043792725, "rewards/rejected": -4.348550796508789, "step": 3190 }, { "epoch": 0.7677543186180422, "grad_norm": 18.218781531194935, "learning_rate": 7.775827023107834e-08, "logits/chosen": -1.2353599071502686, "logits/rejected": -1.2978723049163818, "logps/chosen": -485.01837158203125, "logps/rejected": -695.3224487304688, "loss": 0.4193, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.558899164199829, "rewards/margins": 1.8993895053863525, "rewards/rejected": -4.458288669586182, "step": 3200 }, { "epoch": 0.7701535508637236, "grad_norm": 23.87409903985215, "learning_rate": 7.624673123879682e-08, "logits/chosen": -1.1395564079284668, "logits/rejected": -1.2785004377365112, "logps/chosen": -447.79144287109375, "logps/rejected": -616.0286254882812, "loss": 0.435, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.080073356628418, "rewards/margins": 1.8376333713531494, "rewards/rejected": -3.917706251144409, "step": 3210 }, { "epoch": 0.772552783109405, "grad_norm": 20.86151843851289, "learning_rate": 7.474738034800663e-08, "logits/chosen": -1.2566897869110107, "logits/rejected": -1.2571423053741455, "logps/chosen": -419.4219665527344, "logps/rejected": -851.1385498046875, "loss": 0.4659, "rewards/accuracies": 0.75, "rewards/chosen": -2.008873701095581, "rewards/margins": 4.522359371185303, "rewards/rejected": -6.5312323570251465, "step": 3220 }, { "epoch": 0.7749520153550864, "grad_norm": 13.169040672352676, "learning_rate": 7.326032273221606e-08, "logits/chosen": -1.3603243827819824, "logits/rejected": -1.3747715950012207, "logps/chosen": -500.2078552246094, "logps/rejected": -785.7582397460938, "loss": 0.4021, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.25557279586792, "rewards/margins": 3.053633213043213, "rewards/rejected": -5.309205532073975, "step": 3230 }, { "epoch": 0.7773512476007678, "grad_norm": 16.05314488934455, "learning_rate": 7.178566270260872e-08, "logits/chosen": -1.318904995918274, "logits/rejected": -1.3817777633666992, "logps/chosen": -521.3937377929688, "logps/rejected": -800.31103515625, "loss": 0.4506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.6105079650878906, "rewards/margins": 2.589186668395996, "rewards/rejected": -5.199694633483887, "step": 3240 }, { "epoch": 0.7797504798464492, "grad_norm": 14.261695457548557, "learning_rate": 7.032350370072709e-08, "logits/chosen": -1.2088521718978882, "logits/rejected": -1.2967567443847656, "logps/chosen": -463.45794677734375, "logps/rejected": -668.6795654296875, "loss": 0.4028, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.936281442642212, "rewards/margins": 2.147418975830078, "rewards/rejected": -4.083700180053711, "step": 3250 }, { "epoch": 0.7821497120921305, "grad_norm": 12.79782870680645, "learning_rate": 6.887394829121596e-08, "logits/chosen": -1.2623844146728516, "logits/rejected": -1.3965818881988525, "logps/chosen": -520.1638793945312, "logps/rejected": -901.8109130859375, "loss": 0.3906, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.5400707721710205, "rewards/margins": 4.0481061935424805, "rewards/rejected": -6.588177680969238, "step": 3260 }, { "epoch": 0.7845489443378119, "grad_norm": 16.59225411377178, "learning_rate": 6.743709815462833e-08, "logits/chosen": -1.2642148733139038, "logits/rejected": -1.3561595678329468, "logps/chosen": -521.087890625, "logps/rejected": -813.130615234375, "loss": 0.4099, "rewards/accuracies": 0.75, "rewards/chosen": -2.508223056793213, "rewards/margins": 3.2700836658477783, "rewards/rejected": -5.7783074378967285, "step": 3270 }, { "epoch": 0.7869481765834933, "grad_norm": 14.993047424273575, "learning_rate": 6.601305408029287e-08, "logits/chosen": -1.2957048416137695, "logits/rejected": -1.418428659439087, "logps/chosen": -458.10076904296875, "logps/rejected": -786.9931640625, "loss": 0.3831, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.213374137878418, "rewards/margins": 3.280181407928467, "rewards/rejected": -5.493556022644043, "step": 3280 }, { "epoch": 0.7893474088291746, "grad_norm": 15.085926001703353, "learning_rate": 6.460191595924366e-08, "logits/chosen": -1.2087162733078003, "logits/rejected": -1.2654608488082886, "logps/chosen": -449.6219177246094, "logps/rejected": -705.5459594726562, "loss": 0.385, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.1014938354492188, "rewards/margins": 2.5265586376190186, "rewards/rejected": -4.628052711486816, "step": 3290 }, { "epoch": 0.791746641074856, "grad_norm": 13.169922978355546, "learning_rate": 6.320378277721342e-08, "logits/chosen": -1.3274714946746826, "logits/rejected": -1.3772881031036377, "logps/chosen": -465.1910095214844, "logps/rejected": -624.9302368164062, "loss": 0.4013, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.1250336170196533, "rewards/margins": 1.715985894203186, "rewards/rejected": -3.841019868850708, "step": 3300 }, { "epoch": 0.7941458733205374, "grad_norm": 27.82691688189915, "learning_rate": 6.181875260769032e-08, "logits/chosen": -1.2468100786209106, "logits/rejected": -1.4290482997894287, "logps/chosen": -521.2555541992188, "logps/rejected": -781.6026000976562, "loss": 0.416, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.3911871910095215, "rewards/margins": 3.2505409717559814, "rewards/rejected": -5.641728401184082, "step": 3310 }, { "epoch": 0.7965451055662188, "grad_norm": 15.737223970644676, "learning_rate": 6.044692260503797e-08, "logits/chosen": -1.1637942790985107, "logits/rejected": -1.2922910451889038, "logps/chosen": -529.7451171875, "logps/rejected": -853.4421997070312, "loss": 0.3675, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.4741718769073486, "rewards/margins": 3.4323413372039795, "rewards/rejected": -5.906513214111328, "step": 3320 }, { "epoch": 0.7989443378119002, "grad_norm": 13.061674320090848, "learning_rate": 5.9088388997680984e-08, "logits/chosen": -1.1912027597427368, "logits/rejected": -1.3324553966522217, "logps/chosen": -545.3333740234375, "logps/rejected": -860.1539306640625, "loss": 0.394, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.197711944580078, "rewards/margins": 3.962009906768799, "rewards/rejected": -6.159722328186035, "step": 3330 }, { "epoch": 0.8013435700575816, "grad_norm": 19.85934874401157, "learning_rate": 5.774324708135439e-08, "logits/chosen": -1.3464608192443848, "logits/rejected": -1.4455270767211914, "logps/chosen": -404.1613464355469, "logps/rejected": -649.46044921875, "loss": 0.4296, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9079539775848389, "rewards/margins": 2.655972480773926, "rewards/rejected": -4.563926696777344, "step": 3340 }, { "epoch": 0.803742802303263, "grad_norm": 10.622344250001166, "learning_rate": 5.641159121241953e-08, "logits/chosen": -1.340012788772583, "logits/rejected": -1.3097373247146606, "logps/chosen": -495.234375, "logps/rejected": -826.9255981445312, "loss": 0.4075, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.605710744857788, "rewards/margins": 3.028745174407959, "rewards/rejected": -5.634456157684326, "step": 3350 }, { "epoch": 0.8061420345489443, "grad_norm": 15.132572988644833, "learning_rate": 5.5093514801245106e-08, "logits/chosen": -1.2543226480484009, "logits/rejected": -1.3029847145080566, "logps/chosen": -481.427734375, "logps/rejected": -770.0972900390625, "loss": 0.4025, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3580222129821777, "rewards/margins": 2.690171241760254, "rewards/rejected": -5.04819393157959, "step": 3360 }, { "epoch": 0.8085412667946257, "grad_norm": 14.038257828539548, "learning_rate": 5.378911030565453e-08, "logits/chosen": -1.1603384017944336, "logits/rejected": -1.213220238685608, "logps/chosen": -554.8784790039062, "logps/rejected": -800.203369140625, "loss": 0.4264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.7545363903045654, "rewards/margins": 2.177790403366089, "rewards/rejected": -4.932326793670654, "step": 3370 }, { "epoch": 0.8109404990403071, "grad_norm": 14.786725967482855, "learning_rate": 5.249846922444101e-08, "logits/chosen": -1.3400354385375977, "logits/rejected": -1.4116084575653076, "logps/chosen": -469.63165283203125, "logps/rejected": -946.8123779296875, "loss": 0.3953, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.487299680709839, "rewards/margins": 4.986563682556152, "rewards/rejected": -7.473863124847412, "step": 3380 }, { "epoch": 0.8133397312859885, "grad_norm": 21.495135238551697, "learning_rate": 5.122168209094865e-08, "logits/chosen": -1.2585008144378662, "logits/rejected": -1.3433778285980225, "logps/chosen": -421.2335510253906, "logps/rejected": -548.8929443359375, "loss": 0.4028, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.097749948501587, "rewards/margins": 1.2572873830795288, "rewards/rejected": -3.355037212371826, "step": 3390 }, { "epoch": 0.8157389635316699, "grad_norm": 15.584784008274491, "learning_rate": 4.995883846672222e-08, "logits/chosen": -1.1034057140350342, "logits/rejected": -1.2848924398422241, "logps/chosen": -590.9141845703125, "logps/rejected": -737.7780151367188, "loss": 0.4146, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.305182933807373, "rewards/margins": 2.196429491043091, "rewards/rejected": -4.501612663269043, "step": 3400 }, { "epoch": 0.8181381957773513, "grad_norm": 12.577138913457466, "learning_rate": 4.871002693522486e-08, "logits/chosen": -1.2216746807098389, "logits/rejected": -1.2335078716278076, "logps/chosen": -490.0227966308594, "logps/rejected": -685.375732421875, "loss": 0.4083, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2040867805480957, "rewards/margins": 2.2901692390441895, "rewards/rejected": -4.494256496429443, "step": 3410 }, { "epoch": 0.8205374280230326, "grad_norm": 12.780423712977628, "learning_rate": 4.7475335095623956e-08, "logits/chosen": -1.2985970973968506, "logits/rejected": -1.2781016826629639, "logps/chosen": -527.1070556640625, "logps/rejected": -797.3029174804688, "loss": 0.4308, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7410807609558105, "rewards/margins": 2.8199234008789062, "rewards/rejected": -5.561004638671875, "step": 3420 }, { "epoch": 0.822936660268714, "grad_norm": 25.453128411840094, "learning_rate": 4.6254849556646714e-08, "logits/chosen": -1.1118319034576416, "logits/rejected": -1.207648515701294, "logps/chosen": -549.860595703125, "logps/rejected": -970.7677612304688, "loss": 0.4555, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.535156488418579, "rewards/margins": 4.580714702606201, "rewards/rejected": -7.115871429443359, "step": 3430 }, { "epoch": 0.8253358925143954, "grad_norm": 14.697372336337441, "learning_rate": 4.504865593050483e-08, "logits/chosen": -1.2074692249298096, "logits/rejected": -1.22970449924469, "logps/chosen": -493.60443115234375, "logps/rejected": -733.1605224609375, "loss": 0.4486, "rewards/accuracies": 0.75, "rewards/chosen": -2.2459022998809814, "rewards/margins": 2.387206792831421, "rewards/rejected": -4.633109092712402, "step": 3440 }, { "epoch": 0.8277351247600768, "grad_norm": 19.145579102229355, "learning_rate": 4.385683882688895e-08, "logits/chosen": -1.0796090364456177, "logits/rejected": -1.2019519805908203, "logps/chosen": -480.3756408691406, "logps/rejected": -558.4412841796875, "loss": 0.4614, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0972506999969482, "rewards/margins": 1.18562912940979, "rewards/rejected": -3.282879590988159, "step": 3450 }, { "epoch": 0.8301343570057581, "grad_norm": 17.204779447658726, "learning_rate": 4.2679481847033985e-08, "logits/chosen": -1.2189116477966309, "logits/rejected": -1.2864820957183838, "logps/chosen": -446.82867431640625, "logps/rejected": -702.0382080078125, "loss": 0.4466, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8683035373687744, "rewards/margins": 2.547211170196533, "rewards/rejected": -4.415513515472412, "step": 3460 }, { "epoch": 0.8325335892514395, "grad_norm": 14.516749938972369, "learning_rate": 4.151666757785435e-08, "logits/chosen": -1.1446878910064697, "logits/rejected": -1.2078421115875244, "logps/chosen": -391.42791748046875, "logps/rejected": -725.2740478515625, "loss": 0.397, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.532698631286621, "rewards/margins": 3.396210193634033, "rewards/rejected": -4.928908348083496, "step": 3470 }, { "epoch": 0.8349328214971209, "grad_norm": 13.196683844151892, "learning_rate": 4.036847758615136e-08, "logits/chosen": -1.091759443283081, "logits/rejected": -1.2269001007080078, "logps/chosen": -532.867431640625, "logps/rejected": -730.4796142578125, "loss": 0.4464, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.9313502311706543, "rewards/margins": 1.9276530742645264, "rewards/rejected": -4.859004020690918, "step": 3480 }, { "epoch": 0.8373320537428023, "grad_norm": 12.89862131476654, "learning_rate": 3.923499241289113e-08, "logits/chosen": -1.1648555994033813, "logits/rejected": -1.3147923946380615, "logps/chosen": -521.7337646484375, "logps/rejected": -674.8948364257812, "loss": 0.4302, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.260921001434326, "rewards/margins": 2.081796646118164, "rewards/rejected": -4.34271764755249, "step": 3490 }, { "epoch": 0.8397312859884837, "grad_norm": 14.173962004603878, "learning_rate": 3.811629156755541e-08, "logits/chosen": -1.160355567932129, "logits/rejected": -1.185987949371338, "logps/chosen": -477.21728515625, "logps/rejected": -679.4395751953125, "loss": 0.4274, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9270728826522827, "rewards/margins": 2.1360950469970703, "rewards/rejected": -4.063167572021484, "step": 3500 }, { "epoch": 0.8421305182341651, "grad_norm": 11.942462848386326, "learning_rate": 3.701245352256391e-08, "logits/chosen": -1.2002038955688477, "logits/rejected": -1.324733018875122, "logps/chosen": -478.535400390625, "logps/rejected": -557.5496826171875, "loss": 0.4283, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8901561498641968, "rewards/margins": 1.0154350996017456, "rewards/rejected": -2.9055914878845215, "step": 3510 }, { "epoch": 0.8445297504798465, "grad_norm": 20.5827327935098, "learning_rate": 3.592355570776984e-08, "logits/chosen": -1.173332929611206, "logits/rejected": -1.2609224319458008, "logps/chosen": -360.0851745605469, "logps/rejected": -583.3133544921875, "loss": 0.4195, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4041774272918701, "rewards/margins": 2.1920089721679688, "rewards/rejected": -3.5961861610412598, "step": 3520 }, { "epoch": 0.8469289827255279, "grad_norm": 10.471012375510664, "learning_rate": 3.484967450502904e-08, "logits/chosen": -1.1107840538024902, "logits/rejected": -1.2250497341156006, "logps/chosen": -374.8684997558594, "logps/rejected": -664.1340942382812, "loss": 0.4038, "rewards/accuracies": 0.875, "rewards/chosen": -1.7912622690200806, "rewards/margins": 2.5201168060302734, "rewards/rejected": -4.3113789558410645, "step": 3530 }, { "epoch": 0.8493282149712092, "grad_norm": 20.111970761124727, "learning_rate": 3.3790885242841296e-08, "logits/chosen": -1.0951917171478271, "logits/rejected": -1.1834380626678467, "logps/chosen": -451.64190673828125, "logps/rejected": -770.8523559570312, "loss": 0.3789, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.0603652000427246, "rewards/margins": 3.2534384727478027, "rewards/rejected": -5.313803672790527, "step": 3540 }, { "epoch": 0.8517274472168906, "grad_norm": 15.121338349842414, "learning_rate": 3.274726219106677e-08, "logits/chosen": -1.051578402519226, "logits/rejected": -1.135506272315979, "logps/chosen": -485.54840087890625, "logps/rejected": -721.75439453125, "loss": 0.4442, "rewards/accuracies": 0.75, "rewards/chosen": -2.1253767013549805, "rewards/margins": 2.3960883617401123, "rewards/rejected": -4.521464824676514, "step": 3550 }, { "epoch": 0.8541266794625719, "grad_norm": 15.152890802383466, "learning_rate": 3.171887855571642e-08, "logits/chosen": -1.2348651885986328, "logits/rejected": -1.2165499925613403, "logps/chosen": -395.697509765625, "logps/rejected": -543.8568115234375, "loss": 0.3826, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7078378200531006, "rewards/margins": 1.5240461826324463, "rewards/rejected": -3.231884002685547, "step": 3560 }, { "epoch": 0.8565259117082533, "grad_norm": 24.44597593565566, "learning_rate": 3.070580647381643e-08, "logits/chosen": -1.1522376537322998, "logits/rejected": -1.2483961582183838, "logps/chosen": -406.069091796875, "logps/rejected": -749.8062133789062, "loss": 0.4548, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.753334403038025, "rewards/margins": 3.3889122009277344, "rewards/rejected": -5.142246246337891, "step": 3570 }, { "epoch": 0.8589251439539347, "grad_norm": 15.075419291770242, "learning_rate": 2.9708117008348576e-08, "logits/chosen": -1.2388461828231812, "logits/rejected": -1.3630428314208984, "logps/chosen": -477.43585205078125, "logps/rejected": -610.2297973632812, "loss": 0.3969, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.715201735496521, "rewards/margins": 1.8291780948638916, "rewards/rejected": -3.544379711151123, "step": 3580 }, { "epoch": 0.8613243761996161, "grad_norm": 11.602973232421764, "learning_rate": 2.8725880143264992e-08, "logits/chosen": -1.19898521900177, "logits/rejected": -1.2185986042022705, "logps/chosen": -449.32598876953125, "logps/rejected": -633.5596923828125, "loss": 0.465, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.1087489128112793, "rewards/margins": 1.4358699321746826, "rewards/rejected": -3.544618606567383, "step": 3590 }, { "epoch": 0.8637236084452975, "grad_norm": 21.3005453219283, "learning_rate": 2.775916477857948e-08, "logits/chosen": -1.1370158195495605, "logits/rejected": -1.1749649047851562, "logps/chosen": -402.0398254394531, "logps/rejected": -587.6260375976562, "loss": 0.413, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9864225387573242, "rewards/margins": 1.8402111530303955, "rewards/rejected": -3.8266334533691406, "step": 3600 }, { "epoch": 0.8661228406909789, "grad_norm": 15.387678499543219, "learning_rate": 2.680803872553408e-08, "logits/chosen": -1.2096471786499023, "logits/rejected": -1.3197107315063477, "logps/chosen": -445.0570373535156, "logps/rejected": -827.9861450195312, "loss": 0.4225, "rewards/accuracies": 0.875, "rewards/chosen": -1.9052565097808838, "rewards/margins": 4.059884071350098, "rewards/rejected": -5.965140342712402, "step": 3610 }, { "epoch": 0.8685220729366603, "grad_norm": 24.644670995274364, "learning_rate": 2.5872568701842706e-08, "logits/chosen": -1.2481223344802856, "logits/rejected": -1.3282666206359863, "logps/chosen": -388.14715576171875, "logps/rejected": -630.1165161132812, "loss": 0.4617, "rewards/accuracies": 0.75, "rewards/chosen": -1.7502208948135376, "rewards/margins": 2.1968894004821777, "rewards/rejected": -3.9471099376678467, "step": 3620 }, { "epoch": 0.8709213051823417, "grad_norm": 20.103907535930773, "learning_rate": 2.495282032701096e-08, "logits/chosen": -1.1463677883148193, "logits/rejected": -1.3338615894317627, "logps/chosen": -351.9326477050781, "logps/rejected": -538.684814453125, "loss": 0.4117, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.522869348526001, "rewards/margins": 2.161909580230713, "rewards/rejected": -3.684778928756714, "step": 3630 }, { "epoch": 0.8733205374280231, "grad_norm": 14.949192367966893, "learning_rate": 2.4048858117733133e-08, "logits/chosen": -1.2300177812576294, "logits/rejected": -1.345139741897583, "logps/chosen": -454.37628173828125, "logps/rejected": -743.2384033203125, "loss": 0.3729, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8625742197036743, "rewards/margins": 3.337106227874756, "rewards/rejected": -5.199681282043457, "step": 3640 }, { "epoch": 0.8757197696737045, "grad_norm": 17.259398265317675, "learning_rate": 2.3160745483366938e-08, "logits/chosen": -1.2201354503631592, "logits/rejected": -1.259948492050171, "logps/chosen": -431.505126953125, "logps/rejected": -647.8377685546875, "loss": 0.4208, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0006277561187744, "rewards/margins": 1.8061168193817139, "rewards/rejected": -3.806744337081909, "step": 3650 }, { "epoch": 0.8781190019193857, "grad_norm": 26.593038878856742, "learning_rate": 2.2288544721485197e-08, "logits/chosen": -1.1473274230957031, "logits/rejected": -1.1531964540481567, "logps/chosen": -367.77691650390625, "logps/rejected": -670.8995361328125, "loss": 0.4029, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.509234070777893, "rewards/margins": 2.8308663368225098, "rewards/rejected": -4.340100288391113, "step": 3660 }, { "epoch": 0.8805182341650671, "grad_norm": 17.09126226618081, "learning_rate": 2.1432317013506117e-08, "logits/chosen": -1.2680007219314575, "logits/rejected": -1.3855565786361694, "logps/chosen": -459.4864807128906, "logps/rejected": -617.2786254882812, "loss": 0.4376, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9778245687484741, "rewards/margins": 1.9757041931152344, "rewards/rejected": -3.953528881072998, "step": 3670 }, { "epoch": 0.8829174664107485, "grad_norm": 22.964518655362124, "learning_rate": 2.0592122420401704e-08, "logits/chosen": -1.0826637744903564, "logits/rejected": -1.218787431716919, "logps/chosen": -442.7049865722656, "logps/rejected": -622.1304931640625, "loss": 0.4426, "rewards/accuracies": 0.75, "rewards/chosen": -2.082751750946045, "rewards/margins": 1.7740720510482788, "rewards/rejected": -3.8568243980407715, "step": 3680 }, { "epoch": 0.8853166986564299, "grad_norm": 16.601339844348978, "learning_rate": 1.976801987848459e-08, "logits/chosen": -1.2238892316818237, "logits/rejected": -1.2646934986114502, "logps/chosen": -454.4891052246094, "logps/rejected": -779.9896850585938, "loss": 0.4293, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8976598978042603, "rewards/margins": 3.114412784576416, "rewards/rejected": -5.012072563171387, "step": 3690 }, { "epoch": 0.8877159309021113, "grad_norm": 17.11872715651324, "learning_rate": 1.8960067195273987e-08, "logits/chosen": -1.2767010927200317, "logits/rejected": -1.3807860612869263, "logps/chosen": -399.5985412597656, "logps/rejected": -693.5151977539062, "loss": 0.3976, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8493973016738892, "rewards/margins": 2.9295060634613037, "rewards/rejected": -4.778903484344482, "step": 3700 }, { "epoch": 0.8901151631477927, "grad_norm": 16.6413432588174, "learning_rate": 1.816832104544072e-08, "logits/chosen": -1.098815679550171, "logits/rejected": -1.1592333316802979, "logps/chosen": -470.98553466796875, "logps/rejected": -625.4569091796875, "loss": 0.3886, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.982060432434082, "rewards/margins": 1.7682058811187744, "rewards/rejected": -3.7502663135528564, "step": 3710 }, { "epoch": 0.8925143953934741, "grad_norm": 11.891797748013655, "learning_rate": 1.7392836966831553e-08, "logits/chosen": -1.0697181224822998, "logits/rejected": -1.194319486618042, "logps/chosen": -475.51898193359375, "logps/rejected": -717.6954345703125, "loss": 0.3918, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.0813956260681152, "rewards/margins": 2.7703046798706055, "rewards/rejected": -4.851700782775879, "step": 3720 }, { "epoch": 0.8949136276391555, "grad_norm": 17.887300709912445, "learning_rate": 1.663366935657373e-08, "logits/chosen": -1.2668213844299316, "logits/rejected": -1.4020304679870605, "logps/chosen": -410.6656188964844, "logps/rejected": -631.67041015625, "loss": 0.4465, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.830963373184204, "rewards/margins": 2.1420297622680664, "rewards/rejected": -3.9729931354522705, "step": 3730 }, { "epoch": 0.8973128598848369, "grad_norm": 21.650200131935442, "learning_rate": 1.5890871467258898e-08, "logits/chosen": -1.0380961894989014, "logits/rejected": -1.1259255409240723, "logps/chosen": -538.32568359375, "logps/rejected": -709.1238403320312, "loss": 0.4203, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.206559896469116, "rewards/margins": 2.0453429222106934, "rewards/rejected": -4.2519025802612305, "step": 3740 }, { "epoch": 0.8997120921305183, "grad_norm": 12.405877408803793, "learning_rate": 1.5164495403207967e-08, "logits/chosen": -1.2166404724121094, "logits/rejected": -1.2211335897445679, "logps/chosen": -480.22528076171875, "logps/rejected": -792.5941162109375, "loss": 0.3954, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.314863443374634, "rewards/margins": 2.8861892223358154, "rewards/rejected": -5.201052665710449, "step": 3750 }, { "epoch": 0.9021113243761996, "grad_norm": 12.728466226110546, "learning_rate": 1.4454592116815962e-08, "logits/chosen": -1.1239063739776611, "logits/rejected": -1.1524550914764404, "logps/chosen": -412.11602783203125, "logps/rejected": -646.6449584960938, "loss": 0.3604, "rewards/accuracies": 0.875, "rewards/chosen": -1.66254460811615, "rewards/margins": 2.2538654804229736, "rewards/rejected": -3.916410446166992, "step": 3760 }, { "epoch": 0.904510556621881, "grad_norm": 11.308755733612493, "learning_rate": 1.3761211404977934e-08, "logits/chosen": -1.2462382316589355, "logits/rejected": -1.2617241144180298, "logps/chosen": -481.52642822265625, "logps/rejected": -763.2169189453125, "loss": 0.3466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5226385593414307, "rewards/margins": 2.925380229949951, "rewards/rejected": -5.448019504547119, "step": 3770 }, { "epoch": 0.9069097888675623, "grad_norm": 20.409261229736146, "learning_rate": 1.3084401905596177e-08, "logits/chosen": -1.1374547481536865, "logits/rejected": -1.3039876222610474, "logps/chosen": -499.5015563964844, "logps/rejected": -709.859375, "loss": 0.4434, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0284790992736816, "rewards/margins": 2.706104040145874, "rewards/rejected": -4.734583377838135, "step": 3780 }, { "epoch": 0.9093090211132437, "grad_norm": 17.1782781112593, "learning_rate": 1.2424211094168053e-08, "logits/chosen": -1.1101362705230713, "logits/rejected": -1.2460038661956787, "logps/chosen": -517.1344604492188, "logps/rejected": -742.2302856445312, "loss": 0.4041, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.067178726196289, "rewards/margins": 2.329007625579834, "rewards/rejected": -4.396185874938965, "step": 3790 }, { "epoch": 0.9117082533589251, "grad_norm": 30.065326876665342, "learning_rate": 1.1780685280456143e-08, "logits/chosen": -1.237917184829712, "logits/rejected": -1.3036408424377441, "logps/chosen": -539.2681274414062, "logps/rejected": -925.9050903320312, "loss": 0.4439, "rewards/accuracies": 0.875, "rewards/chosen": -2.540894031524658, "rewards/margins": 3.8281638622283936, "rewards/rejected": -6.369057655334473, "step": 3800 }, { "epoch": 0.9141074856046065, "grad_norm": 20.319800037171195, "learning_rate": 1.1153869605239564e-08, "logits/chosen": -1.231994390487671, "logits/rejected": -1.3546103239059448, "logps/chosen": -441.52520751953125, "logps/rejected": -568.5699462890625, "loss": 0.4091, "rewards/accuracies": 0.875, "rewards/chosen": -1.8595590591430664, "rewards/margins": 1.6289126873016357, "rewards/rejected": -3.488471508026123, "step": 3810 }, { "epoch": 0.9165067178502879, "grad_norm": 17.420902765437226, "learning_rate": 1.0543808037147606e-08, "logits/chosen": -1.2463701963424683, "logits/rejected": -1.2932254076004028, "logps/chosen": -475.19195556640625, "logps/rejected": -830.6253051757812, "loss": 0.394, "rewards/accuracies": 0.875, "rewards/chosen": -2.176619052886963, "rewards/margins": 3.5470027923583984, "rewards/rejected": -5.7236223220825195, "step": 3820 }, { "epoch": 0.9189059500959693, "grad_norm": 13.877437771957279, "learning_rate": 9.95054336957557e-09, "logits/chosen": -1.245715618133545, "logits/rejected": -1.245986819267273, "logps/chosen": -431.06024169921875, "logps/rejected": -607.4310302734375, "loss": 0.4043, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.873225450515747, "rewards/margins": 1.7056442499160767, "rewards/rejected": -3.578869581222534, "step": 3830 }, { "epoch": 0.9213051823416507, "grad_norm": 22.780099057725195, "learning_rate": 9.37411721768286e-09, "logits/chosen": -1.3027960062026978, "logits/rejected": -1.3543469905853271, "logps/chosen": -500.34527587890625, "logps/rejected": -797.4141845703125, "loss": 0.3995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.248213291168213, "rewards/margins": 2.695244312286377, "rewards/rejected": -4.943457126617432, "step": 3840 }, { "epoch": 0.9237044145873321, "grad_norm": 18.879115732312695, "learning_rate": 8.81457001547392e-09, "logits/chosen": -1.179421067237854, "logits/rejected": -1.2008545398712158, "logps/chosen": -445.45611572265625, "logps/rejected": -636.3636474609375, "loss": 0.3706, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.106031894683838, "rewards/margins": 1.8034794330596924, "rewards/rejected": -3.909511089324951, "step": 3850 }, { "epoch": 0.9261036468330134, "grad_norm": 13.946836764722176, "learning_rate": 8.271941012961942e-09, "logits/chosen": -1.1962236166000366, "logits/rejected": -1.1757996082305908, "logps/chosen": -434.50238037109375, "logps/rejected": -851.7151489257812, "loss": 0.4029, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2081894874572754, "rewards/margins": 3.6091766357421875, "rewards/rejected": -5.817366600036621, "step": 3860 }, { "epoch": 0.9285028790786948, "grad_norm": 21.809897538914896, "learning_rate": 7.746268273415568e-09, "logits/chosen": -1.3298779726028442, "logits/rejected": -1.2550171613693237, "logps/chosen": -448.08612060546875, "logps/rejected": -611.9649658203125, "loss": 0.3966, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8825676441192627, "rewards/margins": 1.1555382013320923, "rewards/rejected": -3.0381054878234863, "step": 3870 }, { "epoch": 0.9309021113243762, "grad_norm": 13.467439618498904, "learning_rate": 7.237588670689076e-09, "logits/chosen": -1.2014961242675781, "logits/rejected": -1.348229169845581, "logps/chosen": -491.67352294921875, "logps/rejected": -760.8187866210938, "loss": 0.3867, "rewards/accuracies": 0.75, "rewards/chosen": -2.231884479522705, "rewards/margins": 3.0948691368103027, "rewards/rejected": -5.32675313949585, "step": 3880 }, { "epoch": 0.9333013435700576, "grad_norm": 17.106585504517614, "learning_rate": 6.745937886635606e-09, "logits/chosen": -1.239768624305725, "logits/rejected": -1.2870354652404785, "logps/chosen": -483.9998474121094, "logps/rejected": -888.3824462890625, "loss": 0.4068, "rewards/accuracies": 0.875, "rewards/chosen": -2.029411554336548, "rewards/margins": 4.014785289764404, "rewards/rejected": -6.044196128845215, "step": 3890 }, { "epoch": 0.935700575815739, "grad_norm": 17.65443575752541, "learning_rate": 6.271350408604409e-09, "logits/chosen": -1.2722991704940796, "logits/rejected": -1.2935806512832642, "logps/chosen": -389.75628662109375, "logps/rejected": -633.9022216796875, "loss": 0.3956, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.7770601511001587, "rewards/margins": 2.264246702194214, "rewards/rejected": -4.041306495666504, "step": 3900 }, { "epoch": 0.9380998080614203, "grad_norm": 12.612133126164304, "learning_rate": 5.813859527021487e-09, "logits/chosen": -1.22406804561615, "logits/rejected": -1.323305606842041, "logps/chosen": -508.7425842285156, "logps/rejected": -782.00830078125, "loss": 0.338, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.409405469894409, "rewards/margins": 3.0094313621520996, "rewards/rejected": -5.418837070465088, "step": 3910 }, { "epoch": 0.9404990403071017, "grad_norm": 13.294898316790011, "learning_rate": 5.373497333054616e-09, "logits/chosen": -1.2985012531280518, "logits/rejected": -1.3380292654037476, "logps/chosen": -493.2757873535156, "logps/rejected": -623.1602783203125, "loss": 0.4384, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2983577251434326, "rewards/margins": 1.3695790767669678, "rewards/rejected": -3.6679370403289795, "step": 3920 }, { "epoch": 0.9428982725527831, "grad_norm": 15.284137135135785, "learning_rate": 4.950294716362213e-09, "logits/chosen": -1.2158329486846924, "logits/rejected": -1.3253077268600464, "logps/chosen": -508.41424560546875, "logps/rejected": -638.3739624023438, "loss": 0.4209, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2684988975524902, "rewards/margins": 1.3235927820205688, "rewards/rejected": -3.5920920372009277, "step": 3930 }, { "epoch": 0.9452975047984645, "grad_norm": 15.219945748745712, "learning_rate": 4.544281362926422e-09, "logits/chosen": -1.1814640760421753, "logits/rejected": -1.213180422782898, "logps/chosen": -480.6612854003906, "logps/rejected": -714.3478393554688, "loss": 0.443, "rewards/accuracies": 0.875, "rewards/chosen": -1.8659789562225342, "rewards/margins": 2.4482996463775635, "rewards/rejected": -4.314279079437256, "step": 3940 }, { "epoch": 0.9476967370441459, "grad_norm": 12.199705253251745, "learning_rate": 4.15548575297095e-09, "logits/chosen": -1.1539068222045898, "logits/rejected": -1.2610228061676025, "logps/chosen": -470.3968811035156, "logps/rejected": -807.7662353515625, "loss": 0.3508, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2147834300994873, "rewards/margins": 3.456162214279175, "rewards/rejected": -5.6709465980529785, "step": 3950 }, { "epoch": 0.9500959692898272, "grad_norm": 9.529613600157132, "learning_rate": 3.7839351589631366e-09, "logits/chosen": -1.2287517786026, "logits/rejected": -1.1175611019134521, "logps/chosen": -470.9747009277344, "logps/rejected": -738.1583251953125, "loss": 0.4044, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.675825595855713, "rewards/margins": 2.222598075866699, "rewards/rejected": -4.89842414855957, "step": 3960 }, { "epoch": 0.9524952015355086, "grad_norm": 18.605428038684874, "learning_rate": 3.4296556437010405e-09, "logits/chosen": -1.2765130996704102, "logits/rejected": -1.2980254888534546, "logps/chosen": -460.11541748046875, "logps/rejected": -656.4251098632812, "loss": 0.4182, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.518393039703369, "rewards/margins": 2.0247464179992676, "rewards/rejected": -4.543139457702637, "step": 3970 }, { "epoch": 0.95489443378119, "grad_norm": 14.363883415576543, "learning_rate": 3.092672058485124e-09, "logits/chosen": -1.3632985353469849, "logits/rejected": -1.3597389459609985, "logps/chosen": -513.4880981445312, "logps/rejected": -859.6558837890625, "loss": 0.4086, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.856234550476074, "rewards/margins": 3.326939821243286, "rewards/rejected": -6.183174133300781, "step": 3980 }, { "epoch": 0.9572936660268714, "grad_norm": 24.170474346453393, "learning_rate": 2.7730080413750356e-09, "logits/chosen": -1.1727163791656494, "logits/rejected": -1.296337604522705, "logps/chosen": -467.4671325683594, "logps/rejected": -634.9149169921875, "loss": 0.4019, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.021527051925659, "rewards/margins": 1.6675231456756592, "rewards/rejected": -3.6890506744384766, "step": 3990 }, { "epoch": 0.9596928982725528, "grad_norm": 15.145186840475311, "learning_rate": 2.4706860155316033e-09, "logits/chosen": -1.1688404083251953, "logits/rejected": -1.2660518884658813, "logps/chosen": -569.9762573242188, "logps/rejected": -844.1318359375, "loss": 0.3981, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3475515842437744, "rewards/margins": 2.774146556854248, "rewards/rejected": -5.121697902679443, "step": 4000 }, { "epoch": 0.9620921305182342, "grad_norm": 20.980446798559257, "learning_rate": 2.185727187643843e-09, "logits/chosen": -1.2156826257705688, "logits/rejected": -1.2687807083129883, "logps/chosen": -427.87774658203125, "logps/rejected": -791.0867309570312, "loss": 0.4573, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1100106239318848, "rewards/margins": 3.6505370140075684, "rewards/rejected": -5.7605485916137695, "step": 4010 }, { "epoch": 0.9644913627639156, "grad_norm": 26.09657808054691, "learning_rate": 1.9181515464413434e-09, "logits/chosen": -1.0822070837020874, "logits/rejected": -1.1274657249450684, "logps/chosen": -565.4129028320312, "logps/rejected": -877.5753784179688, "loss": 0.3914, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3460755348205566, "rewards/margins": 3.1658713817596436, "rewards/rejected": -5.511946678161621, "step": 4020 }, { "epoch": 0.966890595009597, "grad_norm": 20.358263015517405, "learning_rate": 1.6679778612923302e-09, "logits/chosen": -1.2023911476135254, "logits/rejected": -1.3535006046295166, "logps/chosen": -511.30792236328125, "logps/rejected": -691.3760375976562, "loss": 0.3992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.373955011367798, "rewards/margins": 1.693084716796875, "rewards/rejected": -4.0670390129089355, "step": 4030 }, { "epoch": 0.9692898272552783, "grad_norm": 17.09466339113468, "learning_rate": 1.43522368088686e-09, "logits/chosen": -1.2254010438919067, "logits/rejected": -1.3339544534683228, "logps/chosen": -497.1849670410156, "logps/rejected": -889.703125, "loss": 0.48, "rewards/accuracies": 0.75, "rewards/chosen": -2.4439101219177246, "rewards/margins": 3.9405925273895264, "rewards/rejected": -6.384502410888672, "step": 4040 }, { "epoch": 0.9716890595009597, "grad_norm": 17.680515818931642, "learning_rate": 1.2199053320059993e-09, "logits/chosen": -1.2316696643829346, "logits/rejected": -1.2435214519500732, "logps/chosen": -472.69000244140625, "logps/rejected": -706.5465087890625, "loss": 0.3977, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.054659605026245, "rewards/margins": 2.194859504699707, "rewards/rejected": -4.249519348144531, "step": 4050 }, { "epoch": 0.974088291746641, "grad_norm": 19.737454721261717, "learning_rate": 1.0220379183764338e-09, "logits/chosen": -1.2599509954452515, "logits/rejected": -1.2703198194503784, "logps/chosen": -364.81658935546875, "logps/rejected": -650.0493774414062, "loss": 0.3815, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5390068292617798, "rewards/margins": 2.7725987434387207, "rewards/rejected": -4.311606407165527, "step": 4060 }, { "epoch": 0.9764875239923224, "grad_norm": 20.629402096960852, "learning_rate": 8.416353196111503e-10, "logits/chosen": -1.2555500268936157, "logits/rejected": -1.2429850101470947, "logps/chosen": -496.2998046875, "logps/rejected": -772.2160034179688, "loss": 0.4317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5762507915496826, "rewards/margins": 2.843451499938965, "rewards/rejected": -5.419702053070068, "step": 4070 }, { "epoch": 0.9788867562380038, "grad_norm": 21.54194458199129, "learning_rate": 6.787101902356873e-10, "logits/chosen": -1.3214588165283203, "logits/rejected": -1.3182449340820312, "logps/chosen": -515.1784057617188, "logps/rejected": -786.1739501953125, "loss": 0.4275, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.5369973182678223, "rewards/margins": 2.521944999694824, "rewards/rejected": -5.058941841125488, "step": 4080 }, { "epoch": 0.9812859884836852, "grad_norm": 22.64402950848799, "learning_rate": 5.332739588005953e-10, "logits/chosen": -1.2550867795944214, "logits/rejected": -1.322644591331482, "logps/chosen": -376.3187561035156, "logps/rejected": -727.2735595703125, "loss": 0.4059, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7436511516571045, "rewards/margins": 3.295630693435669, "rewards/rejected": -5.039282321929932, "step": 4090 }, { "epoch": 0.9836852207293666, "grad_norm": 22.391181670910232, "learning_rate": 4.053368270797164e-10, "logits/chosen": -1.2337408065795898, "logits/rejected": -1.2769973278045654, "logps/chosen": -468.5732421875, "logps/rejected": -767.9414672851562, "loss": 0.4017, "rewards/accuracies": 0.75, "rewards/chosen": -2.362514019012451, "rewards/margins": 3.0355606079101562, "rewards/rejected": -5.398074150085449, "step": 4100 }, { "epoch": 0.986084452975048, "grad_norm": 14.32266939448474, "learning_rate": 2.949077693545354e-10, "logits/chosen": -1.1863398551940918, "logits/rejected": -1.2951616048812866, "logps/chosen": -504.52801513671875, "logps/rejected": -719.0045166015625, "loss": 0.4678, "rewards/accuracies": 0.75, "rewards/chosen": -2.333911895751953, "rewards/margins": 1.8691895008087158, "rewards/rejected": -4.203102111816406, "step": 4110 }, { "epoch": 0.9884836852207294, "grad_norm": 23.976635069300592, "learning_rate": 2.0199453178471047e-10, "logits/chosen": -1.126481294631958, "logits/rejected": -1.2799094915390015, "logps/chosen": -538.2355346679688, "logps/rejected": -603.18994140625, "loss": 0.4332, "rewards/accuracies": 0.875, "rewards/chosen": -2.318129062652588, "rewards/margins": 1.0544296503067017, "rewards/rejected": -3.372559070587158, "step": 4120 }, { "epoch": 0.9908829174664108, "grad_norm": 40.95175275330808, "learning_rate": 1.266036318647301e-10, "logits/chosen": -1.2519207000732422, "logits/rejected": -1.3095006942749023, "logps/chosen": -540.7717895507812, "logps/rejected": -778.435791015625, "loss": 0.4474, "rewards/accuracies": 0.875, "rewards/chosen": -2.4916446208953857, "rewards/margins": 2.5904107093811035, "rewards/rejected": -5.08205509185791, "step": 4130 }, { "epoch": 0.9932821497120922, "grad_norm": 17.066173818194596, "learning_rate": 6.874035796672339e-11, "logits/chosen": -1.2024726867675781, "logits/rejected": -1.2957074642181396, "logps/chosen": -512.9348754882812, "logps/rejected": -842.3238525390625, "loss": 0.4125, "rewards/accuracies": 0.875, "rewards/chosen": -2.164402961730957, "rewards/margins": 3.843069076538086, "rewards/rejected": -6.007472515106201, "step": 4140 }, { "epoch": 0.9956813819577736, "grad_norm": 20.618667625472924, "learning_rate": 2.8408768969423458e-11, "logits/chosen": -1.2141971588134766, "logits/rejected": -1.2368704080581665, "logps/chosen": -452.35345458984375, "logps/rejected": -660.5663452148438, "loss": 0.3909, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.867018699645996, "rewards/margins": 2.0285303592681885, "rewards/rejected": -3.8955492973327637, "step": 4150 }, { "epoch": 0.9980806142034548, "grad_norm": 21.52118343490649, "learning_rate": 5.611693973617271e-12, "logits/chosen": -1.2614226341247559, "logits/rejected": -1.27662193775177, "logps/chosen": -454.05596923828125, "logps/rejected": -670.5745239257812, "loss": 0.4453, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.251619338989258, "rewards/margins": 2.0149483680725098, "rewards/rejected": -4.266567707061768, "step": 4160 }, { "epoch": 1.0, "step": 4168, "total_flos": 0.0, "train_loss": 0.4679888197991303, "train_runtime": 14179.2142, "train_samples_per_second": 9.406, "train_steps_per_second": 0.294 } ], "logging_steps": 10, "max_steps": 4168, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }