diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6297 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 5000, + "global_step": 4168, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002399232245681382, + "grad_norm": 3.9086405364003305, + "learning_rate": 1.199040767386091e-09, + "logits/chosen": -0.9392852187156677, + "logits/rejected": -0.9925774335861206, + "logps/chosen": -164.85171508789062, + "logps/rejected": -169.34266662597656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0023992322456813818, + "grad_norm": 4.318184225673836, + "learning_rate": 1.199040767386091e-08, + "logits/chosen": -0.8653285503387451, + "logits/rejected": -1.0646977424621582, + "logps/chosen": -367.5494384765625, + "logps/rejected": -308.0057067871094, + "loss": 0.6931, + "rewards/accuracies": 0.3611111044883728, + "rewards/chosen": 0.00055171106941998, + "rewards/margins": 0.00021127487707417458, + "rewards/rejected": 0.0003404362651053816, + "step": 10 + }, + { + "epoch": 0.0047984644913627635, + "grad_norm": 4.384399942785772, + "learning_rate": 2.398081534772182e-08, + "logits/chosen": -0.9145099520683289, + "logits/rejected": -0.9615824818611145, + "logps/chosen": -254.70645141601562, + "logps/rejected": -225.65023803710938, + "loss": 0.6933, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0004928931593894958, + "rewards/margins": 0.0004294395330362022, + "rewards/rejected": 6.345368456095457e-05, + "step": 20 + }, + { + "epoch": 0.007197696737044146, + "grad_norm": 4.1919489271249395, + "learning_rate": 3.597122302158273e-08, + "logits/chosen": -1.0393908023834229, + "logits/rejected": -1.1211938858032227, + "logps/chosen": -247.6179962158203, + "logps/rejected": -250.74832153320312, + "loss": 0.6931, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0005728682735934854, + "rewards/margins": -0.0005012283218093216, + "rewards/rejected": -7.164000999182463e-05, + "step": 30 + }, + { + "epoch": 0.009596928982725527, + "grad_norm": 4.043349234918003, + "learning_rate": 4.796163069544364e-08, + "logits/chosen": -1.0382745265960693, + "logits/rejected": -1.1404989957809448, + "logps/chosen": -246.5960693359375, + "logps/rejected": -238.99038696289062, + "loss": 0.6933, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0003935880376957357, + "rewards/margins": 0.0007454471779055893, + "rewards/rejected": -0.001139035215601325, + "step": 40 + }, + { + "epoch": 0.01199616122840691, + "grad_norm": 4.337621377747828, + "learning_rate": 5.995203836930455e-08, + "logits/chosen": -0.9566876292228699, + "logits/rejected": -1.0265729427337646, + "logps/chosen": -273.5587463378906, + "logps/rejected": -238.2271728515625, + "loss": 0.6931, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0002776079345494509, + "rewards/margins": -0.0013704797020182014, + "rewards/rejected": 0.0010928716510534286, + "step": 50 + }, + { + "epoch": 0.014395393474088292, + "grad_norm": 4.332693802131573, + "learning_rate": 7.194244604316546e-08, + "logits/chosen": -1.14139723777771, + "logits/rejected": -1.063253402709961, + "logps/chosen": -291.4471130371094, + "logps/rejected": -265.26800537109375, + "loss": 0.6931, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0003930005768779665, + "rewards/margins": 0.00029442697996273637, + "rewards/rejected": 9.857374243438244e-05, + "step": 60 + }, + { + "epoch": 0.016794625719769675, + "grad_norm": 3.9392376744722797, + "learning_rate": 8.393285371702638e-08, + "logits/chosen": -0.7830671072006226, + "logits/rejected": -0.8284071087837219, + "logps/chosen": -280.4967346191406, + "logps/rejected": -269.8634033203125, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00010640527762006968, + "rewards/margins": -0.00039604370249435306, + "rewards/rejected": 0.00028963852673768997, + "step": 70 + }, + { + "epoch": 0.019193857965451054, + "grad_norm": 4.275834970816185, + "learning_rate": 9.592326139088728e-08, + "logits/chosen": -1.1247626543045044, + "logits/rejected": -0.8464676141738892, + "logps/chosen": -203.01101684570312, + "logps/rejected": -241.64547729492188, + "loss": 0.6931, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.00033823197009041905, + "rewards/margins": 0.0008404625696130097, + "rewards/rejected": -0.0005022305413149297, + "step": 80 + }, + { + "epoch": 0.021593090211132437, + "grad_norm": 4.009980090025205, + "learning_rate": 1.0791366906474819e-07, + "logits/chosen": -1.128251552581787, + "logits/rejected": -1.1966060400009155, + "logps/chosen": -348.4684143066406, + "logps/rejected": -300.92156982421875, + "loss": 0.6931, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.00025963489315472543, + "rewards/margins": 0.0006502953474409878, + "rewards/rejected": -0.0003906603087671101, + "step": 90 + }, + { + "epoch": 0.02399232245681382, + "grad_norm": 4.278362574458803, + "learning_rate": 1.199040767386091e-07, + "logits/chosen": -0.8752719759941101, + "logits/rejected": -0.7615184783935547, + "logps/chosen": -262.26171875, + "logps/rejected": -279.4682312011719, + "loss": 0.6929, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.0001237258838955313, + "rewards/margins": -0.00013651838526129723, + "rewards/rejected": 1.2792646884918213e-05, + "step": 100 + }, + { + "epoch": 0.026391554702495202, + "grad_norm": 3.7292949735641874, + "learning_rate": 1.3189448441247004e-07, + "logits/chosen": -1.054966688156128, + "logits/rejected": -1.089815616607666, + "logps/chosen": -232.7165069580078, + "logps/rejected": -230.30648803710938, + "loss": 0.6926, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0005149660282768309, + "rewards/margins": 0.0010742491576820612, + "rewards/rejected": -0.0015892151277512312, + "step": 110 + }, + { + "epoch": 0.028790786948176585, + "grad_norm": 4.185972544798998, + "learning_rate": 1.4388489208633092e-07, + "logits/chosen": -0.9251031875610352, + "logits/rejected": -1.0560011863708496, + "logps/chosen": -302.79620361328125, + "logps/rejected": -279.6351013183594, + "loss": 0.6928, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0033786073327064514, + "rewards/margins": -0.0015979796880856156, + "rewards/rejected": -0.0017806284595280886, + "step": 120 + }, + { + "epoch": 0.031190019193857964, + "grad_norm": 3.7577616139381282, + "learning_rate": 1.5587529976019183e-07, + "logits/chosen": -1.1069813966751099, + "logits/rejected": -1.0163028240203857, + "logps/chosen": -225.87887573242188, + "logps/rejected": -308.16943359375, + "loss": 0.6922, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0021240042988210917, + "rewards/margins": 0.0019912621937692165, + "rewards/rejected": -0.004115266725420952, + "step": 130 + }, + { + "epoch": 0.03358925143953935, + "grad_norm": 3.9602176902490616, + "learning_rate": 1.6786570743405277e-07, + "logits/chosen": -0.8096126317977905, + "logits/rejected": -0.844383716583252, + "logps/chosen": -278.711181640625, + "logps/rejected": -270.23455810546875, + "loss": 0.6913, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0016858462477102876, + "rewards/margins": 0.004555505700409412, + "rewards/rejected": -0.0062413522973656654, + "step": 140 + }, + { + "epoch": 0.03598848368522073, + "grad_norm": 4.198772547754269, + "learning_rate": 1.7985611510791365e-07, + "logits/chosen": -1.0384037494659424, + "logits/rejected": -1.0555726289749146, + "logps/chosen": -231.3898468017578, + "logps/rejected": -225.4952392578125, + "loss": 0.6919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0014209033688530326, + "rewards/margins": 0.00509048905223608, + "rewards/rejected": -0.006511392537504435, + "step": 150 + }, + { + "epoch": 0.03838771593090211, + "grad_norm": 4.167240538791073, + "learning_rate": 1.9184652278177456e-07, + "logits/chosen": -0.8518667221069336, + "logits/rejected": -0.9568248987197876, + "logps/chosen": -296.21734619140625, + "logps/rejected": -231.2320098876953, + "loss": 0.6906, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.004124562256038189, + "rewards/margins": 0.004930226132273674, + "rewards/rejected": -0.009054789319634438, + "step": 160 + }, + { + "epoch": 0.040786948176583494, + "grad_norm": 3.692310549028015, + "learning_rate": 2.038369304556355e-07, + "logits/chosen": -0.8354592323303223, + "logits/rejected": -0.8758047819137573, + "logps/chosen": -342.7477111816406, + "logps/rejected": -333.38189697265625, + "loss": 0.6903, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004680985119193792, + "rewards/margins": 0.004498300142586231, + "rewards/rejected": -0.009179284796118736, + "step": 170 + }, + { + "epoch": 0.04318618042226487, + "grad_norm": 4.32405896978214, + "learning_rate": 2.1582733812949638e-07, + "logits/chosen": -1.1283237934112549, + "logits/rejected": -1.1168252229690552, + "logps/chosen": -238.8912353515625, + "logps/rejected": -229.00265502929688, + "loss": 0.6905, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.004386520944535732, + "rewards/margins": 0.007528006099164486, + "rewards/rejected": -0.011914527975022793, + "step": 180 + }, + { + "epoch": 0.04558541266794626, + "grad_norm": 4.666604305995101, + "learning_rate": 2.278177458033573e-07, + "logits/chosen": -0.9106446504592896, + "logits/rejected": -0.9879466891288757, + "logps/chosen": -306.4612121582031, + "logps/rejected": -249.0087890625, + "loss": 0.6894, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.006806717719882727, + "rewards/margins": 0.007770798169076443, + "rewards/rejected": -0.014577515423297882, + "step": 190 + }, + { + "epoch": 0.04798464491362764, + "grad_norm": 3.9918872126977574, + "learning_rate": 2.398081534772182e-07, + "logits/chosen": -0.9901522397994995, + "logits/rejected": -0.928848385810852, + "logps/chosen": -313.17681884765625, + "logps/rejected": -297.6922302246094, + "loss": 0.6885, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.010936335660517216, + "rewards/margins": 0.006564898882061243, + "rewards/rejected": -0.017501235008239746, + "step": 200 + }, + { + "epoch": 0.05038387715930902, + "grad_norm": 4.040620494299144, + "learning_rate": 2.517985611510791e-07, + "logits/chosen": -0.9027220606803894, + "logits/rejected": -0.922700047492981, + "logps/chosen": -230.9945831298828, + "logps/rejected": -255.6648712158203, + "loss": 0.6876, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.010718774050474167, + "rewards/margins": 0.009615534916520119, + "rewards/rejected": -0.020334308966994286, + "step": 210 + }, + { + "epoch": 0.052783109404990404, + "grad_norm": 4.002893355744946, + "learning_rate": 2.637889688249401e-07, + "logits/chosen": -0.8777509927749634, + "logits/rejected": -0.9541767239570618, + "logps/chosen": -312.22064208984375, + "logps/rejected": -314.44476318359375, + "loss": 0.6874, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.018286144360899925, + "rewards/margins": 0.010097065940499306, + "rewards/rejected": -0.02838321030139923, + "step": 220 + }, + { + "epoch": 0.05518234165067178, + "grad_norm": 4.397529514704007, + "learning_rate": 2.7577937649880093e-07, + "logits/chosen": -0.8843205571174622, + "logits/rejected": -0.7930720448493958, + "logps/chosen": -240.90969848632812, + "logps/rejected": -279.2537841796875, + "loss": 0.684, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.013729272410273552, + "rewards/margins": 0.02465725876390934, + "rewards/rejected": -0.03838653117418289, + "step": 230 + }, + { + "epoch": 0.05758157389635317, + "grad_norm": 4.676021615108152, + "learning_rate": 2.8776978417266184e-07, + "logits/chosen": -1.0245535373687744, + "logits/rejected": -1.0780936479568481, + "logps/chosen": -303.7603454589844, + "logps/rejected": -259.3138732910156, + "loss": 0.6808, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.010991424322128296, + "rewards/margins": 0.02684735879302025, + "rewards/rejected": -0.037838783115148544, + "step": 240 + }, + { + "epoch": 0.05998080614203455, + "grad_norm": 4.463678598202064, + "learning_rate": 2.997601918465228e-07, + "logits/chosen": -0.9612535238265991, + "logits/rejected": -1.0222301483154297, + "logps/chosen": -241.61404418945312, + "logps/rejected": -236.07644653320312, + "loss": 0.68, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.021920276805758476, + "rewards/margins": 0.018481746315956116, + "rewards/rejected": -0.04040202870965004, + "step": 250 + }, + { + "epoch": 0.06238003838771593, + "grad_norm": 4.115268408123004, + "learning_rate": 3.1175059952038366e-07, + "logits/chosen": -1.0142881870269775, + "logits/rejected": -0.8710586428642273, + "logps/chosen": -263.0195617675781, + "logps/rejected": -259.115478515625, + "loss": 0.6769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.021990353241562843, + "rewards/margins": 0.03786135092377663, + "rewards/rejected": -0.05985169857740402, + "step": 260 + }, + { + "epoch": 0.0647792706333973, + "grad_norm": 4.201440552672297, + "learning_rate": 3.2374100719424457e-07, + "logits/chosen": -0.9422982931137085, + "logits/rejected": -1.1441442966461182, + "logps/chosen": -290.52044677734375, + "logps/rejected": -235.48049926757812, + "loss": 0.6748, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.028027933090925217, + "rewards/margins": 0.016811534762382507, + "rewards/rejected": -0.044839464128017426, + "step": 270 + }, + { + "epoch": 0.0671785028790787, + "grad_norm": 4.458570754919466, + "learning_rate": 3.3573141486810554e-07, + "logits/chosen": -1.0597388744354248, + "logits/rejected": -1.0107687711715698, + "logps/chosen": -299.5600891113281, + "logps/rejected": -287.019287109375, + "loss": 0.6676, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.03363611549139023, + "rewards/margins": 0.046159304678440094, + "rewards/rejected": -0.07979541271924973, + "step": 280 + }, + { + "epoch": 0.06957773512476008, + "grad_norm": 3.937791865544782, + "learning_rate": 3.477218225419664e-07, + "logits/chosen": -0.9224345088005066, + "logits/rejected": -0.8189966082572937, + "logps/chosen": -291.11492919921875, + "logps/rejected": -267.08172607421875, + "loss": 0.6674, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04258224740624428, + "rewards/margins": 0.05275702476501465, + "rewards/rejected": -0.09533928334712982, + "step": 290 + }, + { + "epoch": 0.07197696737044146, + "grad_norm": 4.881987417549259, + "learning_rate": 3.597122302158273e-07, + "logits/chosen": -1.013051152229309, + "logits/rejected": -1.0528075695037842, + "logps/chosen": -260.43798828125, + "logps/rejected": -280.9106750488281, + "loss": 0.6638, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.061515528708696365, + "rewards/margins": 0.059713393449783325, + "rewards/rejected": -0.12122891843318939, + "step": 300 + }, + { + "epoch": 0.07437619961612284, + "grad_norm": 3.887121301077397, + "learning_rate": 3.7170263788968827e-07, + "logits/chosen": -0.8997815847396851, + "logits/rejected": -0.9898494482040405, + "logps/chosen": -277.71112060546875, + "logps/rejected": -239.8883514404297, + "loss": 0.6659, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.058370210230350494, + "rewards/margins": 0.08987968415021896, + "rewards/rejected": -0.14824989438056946, + "step": 310 + }, + { + "epoch": 0.07677543186180422, + "grad_norm": 3.767825032617774, + "learning_rate": 3.836930455635491e-07, + "logits/chosen": -1.0142638683319092, + "logits/rejected": -1.0835198163986206, + "logps/chosen": -283.21722412109375, + "logps/rejected": -256.9454040527344, + "loss": 0.6599, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0824267640709877, + "rewards/margins": 0.06453671306371689, + "rewards/rejected": -0.1469634771347046, + "step": 320 + }, + { + "epoch": 0.07917466410748561, + "grad_norm": 4.101417109334993, + "learning_rate": 3.9568345323741003e-07, + "logits/chosen": -0.8915877342224121, + "logits/rejected": -0.7875005006790161, + "logps/chosen": -256.7735595703125, + "logps/rejected": -309.68743896484375, + "loss": 0.648, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13278505206108093, + "rewards/margins": 0.12722721695899963, + "rewards/rejected": -0.26001226902008057, + "step": 330 + }, + { + "epoch": 0.08157389635316699, + "grad_norm": 4.184212161663267, + "learning_rate": 4.07673860911271e-07, + "logits/chosen": -0.8056583404541016, + "logits/rejected": -0.8734966516494751, + "logps/chosen": -257.9537353515625, + "logps/rejected": -290.716552734375, + "loss": 0.6385, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15207326412200928, + "rewards/margins": 0.15626882016658783, + "rewards/rejected": -0.3083421289920807, + "step": 340 + }, + { + "epoch": 0.08397312859884837, + "grad_norm": 5.07860606766039, + "learning_rate": 4.1966426858513185e-07, + "logits/chosen": -1.1134240627288818, + "logits/rejected": -1.1105023622512817, + "logps/chosen": -308.1408386230469, + "logps/rejected": -322.1429748535156, + "loss": 0.6317, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37560468912124634, + "rewards/margins": 0.14869387447834015, + "rewards/rejected": -0.5242985486984253, + "step": 350 + }, + { + "epoch": 0.08637236084452975, + "grad_norm": 6.681443438336797, + "learning_rate": 4.3165467625899276e-07, + "logits/chosen": -0.9821847677230835, + "logits/rejected": -1.1448824405670166, + "logps/chosen": -324.4971618652344, + "logps/rejected": -285.352294921875, + "loss": 0.6325, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5944857001304626, + "rewards/margins": 0.16079989075660706, + "rewards/rejected": -0.7552856206893921, + "step": 360 + }, + { + "epoch": 0.08877159309021113, + "grad_norm": 5.547129015189758, + "learning_rate": 4.436450839328537e-07, + "logits/chosen": -0.9680454134941101, + "logits/rejected": -0.8957147598266602, + "logps/chosen": -285.62225341796875, + "logps/rejected": -329.6009826660156, + "loss": 0.6155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.507738471031189, + "rewards/margins": 0.350941002368927, + "rewards/rejected": -0.858679473400116, + "step": 370 + }, + { + "epoch": 0.09117082533589252, + "grad_norm": 5.005904357466744, + "learning_rate": 4.556354916067146e-07, + "logits/chosen": -1.0903767347335815, + "logits/rejected": -1.0195515155792236, + "logps/chosen": -279.7075500488281, + "logps/rejected": -327.0304870605469, + "loss": 0.5837, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.42896413803100586, + "rewards/margins": 0.4645144045352936, + "rewards/rejected": -0.8934786915779114, + "step": 380 + }, + { + "epoch": 0.0935700575815739, + "grad_norm": 6.3368430821651085, + "learning_rate": 4.676258992805755e-07, + "logits/chosen": -0.9331681132316589, + "logits/rejected": -0.974704384803772, + "logps/chosen": -358.29052734375, + "logps/rejected": -354.2314147949219, + "loss": 0.6125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9378200769424438, + "rewards/margins": 0.23260822892189026, + "rewards/rejected": -1.1704282760620117, + "step": 390 + }, + { + "epoch": 0.09596928982725528, + "grad_norm": 4.993194981103021, + "learning_rate": 4.796163069544364e-07, + "logits/chosen": -0.9397958517074585, + "logits/rejected": -1.0308669805526733, + "logps/chosen": -327.28485107421875, + "logps/rejected": -373.20538330078125, + "loss": 0.6133, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6638423800468445, + "rewards/margins": 0.6587511897087097, + "rewards/rejected": -1.3225935697555542, + "step": 400 + }, + { + "epoch": 0.09836852207293666, + "grad_norm": 5.627853897652041, + "learning_rate": 4.916067146282974e-07, + "logits/chosen": -1.0697743892669678, + "logits/rejected": -1.02415931224823, + "logps/chosen": -302.128173828125, + "logps/rejected": -373.0263366699219, + "loss": 0.5737, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.543385922908783, + "rewards/margins": 0.5250757932662964, + "rewards/rejected": -1.0684617757797241, + "step": 410 + }, + { + "epoch": 0.10076775431861804, + "grad_norm": 7.180880498197233, + "learning_rate": 4.999992108529978e-07, + "logits/chosen": -0.9345219731330872, + "logits/rejected": -0.9572717547416687, + "logps/chosen": -444.15997314453125, + "logps/rejected": -470.5298767089844, + "loss": 0.6004, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.090321660041809, + "rewards/margins": 0.609116792678833, + "rewards/rejected": -1.6994386911392212, + "step": 420 + }, + { + "epoch": 0.10316698656429943, + "grad_norm": 11.482376711377093, + "learning_rate": 4.999851817115532e-07, + "logits/chosen": -1.1403158903121948, + "logits/rejected": -1.0577712059020996, + "logps/chosen": -349.0794982910156, + "logps/rejected": -424.08642578125, + "loss": 0.598, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9095686078071594, + "rewards/margins": 0.863958477973938, + "rewards/rejected": -1.773526906967163, + "step": 430 + }, + { + "epoch": 0.10556621880998081, + "grad_norm": 5.363358928784856, + "learning_rate": 4.999536171027889e-07, + "logits/chosen": -0.8907009363174438, + "logits/rejected": -0.991034984588623, + "logps/chosen": -319.98046875, + "logps/rejected": -347.8736877441406, + "loss": 0.5812, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6185662150382996, + "rewards/margins": 0.3348569869995117, + "rewards/rejected": -0.9534232020378113, + "step": 440 + }, + { + "epoch": 0.10796545105566219, + "grad_norm": 5.047279311794404, + "learning_rate": 4.999045192408369e-07, + "logits/chosen": -1.0067179203033447, + "logits/rejected": -1.0287652015686035, + "logps/chosen": -324.7417907714844, + "logps/rejected": -358.2411804199219, + "loss": 0.585, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.852948784828186, + "rewards/margins": 0.47164326906204224, + "rewards/rejected": -1.324592113494873, + "step": 450 + }, + { + "epoch": 0.11036468330134357, + "grad_norm": 11.251777016471365, + "learning_rate": 4.998378915697171e-07, + "logits/chosen": -0.9187090992927551, + "logits/rejected": -0.9824856519699097, + "logps/chosen": -357.00994873046875, + "logps/rejected": -421.2936096191406, + "loss": 0.5563, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7373157739639282, + "rewards/margins": 0.7148237824440002, + "rewards/rejected": -1.4521396160125732, + "step": 460 + }, + { + "epoch": 0.11276391554702495, + "grad_norm": 8.58525552938624, + "learning_rate": 4.997537387630958e-07, + "logits/chosen": -1.0368196964263916, + "logits/rejected": -1.0864078998565674, + "logps/chosen": -331.54705810546875, + "logps/rejected": -414.0245666503906, + "loss": 0.5324, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0620375871658325, + "rewards/margins": 0.7675203084945679, + "rewards/rejected": -1.82955801486969, + "step": 470 + }, + { + "epoch": 0.11516314779270634, + "grad_norm": 7.973906104493475, + "learning_rate": 4.996520667239582e-07, + "logits/chosen": -1.2711211442947388, + "logits/rejected": -1.1430588960647583, + "logps/chosen": -344.3868408203125, + "logps/rejected": -475.3819885253906, + "loss": 0.5449, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9513166546821594, + "rewards/margins": 0.8125057220458984, + "rewards/rejected": -1.7638225555419922, + "step": 480 + }, + { + "epoch": 0.11756238003838772, + "grad_norm": 7.29273708493132, + "learning_rate": 4.995328825841939e-07, + "logits/chosen": -0.945563793182373, + "logits/rejected": -0.9637011289596558, + "logps/chosen": -333.838134765625, + "logps/rejected": -502.4449157714844, + "loss": 0.5398, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9119874238967896, + "rewards/margins": 1.7599906921386719, + "rewards/rejected": -2.671978235244751, + "step": 490 + }, + { + "epoch": 0.1199616122840691, + "grad_norm": 8.408190304146231, + "learning_rate": 4.993961947040967e-07, + "logits/chosen": -0.9354039430618286, + "logits/rejected": -1.008681297302246, + "logps/chosen": -389.75677490234375, + "logps/rejected": -416.6995544433594, + "loss": 0.5543, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0735552310943604, + "rewards/margins": 0.6031023263931274, + "rewards/rejected": -1.6766574382781982, + "step": 500 + }, + { + "epoch": 0.12236084452975048, + "grad_norm": 7.547972255202871, + "learning_rate": 4.992420126717784e-07, + "logits/chosen": -1.0642093420028687, + "logits/rejected": -1.0110970735549927, + "logps/chosen": -349.2254333496094, + "logps/rejected": -501.6767578125, + "loss": 0.5416, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9027541875839233, + "rewards/margins": 1.5682001113891602, + "rewards/rejected": -2.470954418182373, + "step": 510 + }, + { + "epoch": 0.12476007677543186, + "grad_norm": 6.11542459949028, + "learning_rate": 4.990703473024958e-07, + "logits/chosen": -0.8702675104141235, + "logits/rejected": -1.03139066696167, + "logps/chosen": -410.02264404296875, + "logps/rejected": -504.4461364746094, + "loss": 0.5536, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.188328742980957, + "rewards/margins": 1.0218807458877563, + "rewards/rejected": -2.210209608078003, + "step": 520 + }, + { + "epoch": 0.12715930902111325, + "grad_norm": 9.452659503317662, + "learning_rate": 4.98881210637893e-07, + "logits/chosen": -1.1504271030426025, + "logits/rejected": -1.0904886722564697, + "logps/chosen": -299.77197265625, + "logps/rejected": -411.39520263671875, + "loss": 0.5477, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7923839092254639, + "rewards/margins": 0.7800144553184509, + "rewards/rejected": -1.5723984241485596, + "step": 530 + }, + { + "epoch": 0.1295585412667946, + "grad_norm": 15.54174844903523, + "learning_rate": 4.986746159451553e-07, + "logits/chosen": -1.0087683200836182, + "logits/rejected": -1.0048372745513916, + "logps/chosen": -348.1635437011719, + "logps/rejected": -465.615234375, + "loss": 0.5681, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.949160099029541, + "rewards/margins": 1.2487725019454956, + "rewards/rejected": -2.197932720184326, + "step": 540 + }, + { + "epoch": 0.131957773512476, + "grad_norm": 6.811166436392921, + "learning_rate": 4.984505777160795e-07, + "logits/chosen": -0.8339638710021973, + "logits/rejected": -0.8557635545730591, + "logps/chosen": -368.79998779296875, + "logps/rejected": -455.84423828125, + "loss": 0.5638, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7133311629295349, + "rewards/margins": 0.8222800493240356, + "rewards/rejected": -1.5356113910675049, + "step": 550 + }, + { + "epoch": 0.1343570057581574, + "grad_norm": 8.38746715084373, + "learning_rate": 4.982091116660574e-07, + "logits/chosen": -0.9729937314987183, + "logits/rejected": -1.1065596342086792, + "logps/chosen": -269.25994873046875, + "logps/rejected": -294.3720703125, + "loss": 0.5681, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6644600629806519, + "rewards/margins": 0.42904800176620483, + "rewards/rejected": -1.0935081243515015, + "step": 560 + }, + { + "epoch": 0.13675623800383876, + "grad_norm": 24.438576955287925, + "learning_rate": 4.979502347329732e-07, + "logits/chosen": -0.8234192132949829, + "logits/rejected": -0.8349748849868774, + "logps/chosen": -419.22637939453125, + "logps/rejected": -574.6170654296875, + "loss": 0.5309, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4598208665847778, + "rewards/margins": 1.224372148513794, + "rewards/rejected": -2.6841928958892822, + "step": 570 + }, + { + "epoch": 0.13915547024952016, + "grad_norm": 8.10566134085291, + "learning_rate": 4.976739650760151e-07, + "logits/chosen": -1.0753071308135986, + "logits/rejected": -1.1201988458633423, + "logps/chosen": -420.6094665527344, + "logps/rejected": -514.7296142578125, + "loss": 0.5764, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6122499704360962, + "rewards/margins": 1.0379631519317627, + "rewards/rejected": -2.6502132415771484, + "step": 580 + }, + { + "epoch": 0.14155470249520152, + "grad_norm": 6.442606806291236, + "learning_rate": 4.97380322074402e-07, + "logits/chosen": -0.7010489702224731, + "logits/rejected": -0.7692248225212097, + "logps/chosen": -344.9544982910156, + "logps/rejected": -453.3834533691406, + "loss": 0.5796, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.21074640750885, + "rewards/margins": 1.0821633338928223, + "rewards/rejected": -2.292909860610962, + "step": 590 + }, + { + "epoch": 0.14395393474088292, + "grad_norm": 8.67810882207825, + "learning_rate": 4.970693263260237e-07, + "logits/chosen": -0.9984515905380249, + "logits/rejected": -1.0792922973632812, + "logps/chosen": -385.07855224609375, + "logps/rejected": -431.91748046875, + "loss": 0.5352, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8377297520637512, + "rewards/margins": 0.9412263631820679, + "rewards/rejected": -1.7789561748504639, + "step": 600 + }, + { + "epoch": 0.1463531669865643, + "grad_norm": 19.243246370458866, + "learning_rate": 4.967409996459966e-07, + "logits/chosen": -0.9357202649116516, + "logits/rejected": -0.966041088104248, + "logps/chosen": -379.83709716796875, + "logps/rejected": -436.16021728515625, + "loss": 0.531, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2088489532470703, + "rewards/margins": 0.8029053807258606, + "rewards/rejected": -2.011754274368286, + "step": 610 + }, + { + "epoch": 0.14875239923224567, + "grad_norm": 15.946020238913784, + "learning_rate": 4.963953650651326e-07, + "logits/chosen": -0.836955726146698, + "logits/rejected": -0.9188618659973145, + "logps/chosen": -472.018798828125, + "logps/rejected": -462.2901306152344, + "loss": 0.5215, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2413372993469238, + "rewards/margins": 0.8713501691818237, + "rewards/rejected": -2.112687587738037, + "step": 620 + }, + { + "epoch": 0.15115163147792707, + "grad_norm": 7.979287445807055, + "learning_rate": 4.960324468283248e-07, + "logits/chosen": -1.00057053565979, + "logits/rejected": -1.0442100763320923, + "logps/chosen": -290.08868408203125, + "logps/rejected": -377.97027587890625, + "loss": 0.515, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6758447885513306, + "rewards/margins": 0.8780001401901245, + "rewards/rejected": -1.5538448095321655, + "step": 630 + }, + { + "epoch": 0.15355086372360843, + "grad_norm": 10.945616218615863, + "learning_rate": 4.956522703928451e-07, + "logits/chosen": -0.978852391242981, + "logits/rejected": -0.8796356916427612, + "logps/chosen": -318.9521484375, + "logps/rejected": -465.2935485839844, + "loss": 0.5063, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8167131543159485, + "rewards/margins": 1.364931344985962, + "rewards/rejected": -2.1816444396972656, + "step": 640 + }, + { + "epoch": 0.15595009596928983, + "grad_norm": 12.631776767430628, + "learning_rate": 4.952548624265606e-07, + "logits/chosen": -0.9039742350578308, + "logits/rejected": -0.9406811594963074, + "logps/chosen": -403.57562255859375, + "logps/rejected": -478.9420471191406, + "loss": 0.5638, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3750524520874023, + "rewards/margins": 0.8406556248664856, + "rewards/rejected": -2.215708017349243, + "step": 650 + }, + { + "epoch": 0.15834932821497122, + "grad_norm": 6.1260020762155145, + "learning_rate": 4.948402508060607e-07, + "logits/chosen": -1.0212910175323486, + "logits/rejected": -1.0476784706115723, + "logps/chosen": -298.73028564453125, + "logps/rejected": -382.78021240234375, + "loss": 0.542, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6404946446418762, + "rewards/margins": 0.9978824853897095, + "rewards/rejected": -1.6383771896362305, + "step": 660 + }, + { + "epoch": 0.16074856046065258, + "grad_norm": 9.288255831934693, + "learning_rate": 4.944084646147038e-07, + "logits/chosen": -0.869672417640686, + "logits/rejected": -0.9254360198974609, + "logps/chosen": -365.64019775390625, + "logps/rejected": -378.85113525390625, + "loss": 0.5793, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5646086931228638, + "rewards/margins": 0.3554513156414032, + "rewards/rejected": -0.9200600385665894, + "step": 670 + }, + { + "epoch": 0.16314779270633398, + "grad_norm": 12.016810333669639, + "learning_rate": 4.939595341405754e-07, + "logits/chosen": -0.8810294270515442, + "logits/rejected": -0.9110749363899231, + "logps/chosen": -320.2789001464844, + "logps/rejected": -362.8446960449219, + "loss": 0.5238, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.6605560183525085, + "rewards/margins": 0.6321157217025757, + "rewards/rejected": -1.2926716804504395, + "step": 680 + }, + { + "epoch": 0.16554702495201534, + "grad_norm": 9.358664871218739, + "learning_rate": 4.93493490874365e-07, + "logits/chosen": -0.9154292941093445, + "logits/rejected": -0.9222054481506348, + "logps/chosen": -362.84686279296875, + "logps/rejected": -441.8236389160156, + "loss": 0.5367, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.235999345779419, + "rewards/margins": 0.7342005968093872, + "rewards/rejected": -1.9701995849609375, + "step": 690 + }, + { + "epoch": 0.16794625719769674, + "grad_norm": 9.152240736767983, + "learning_rate": 4.93010367507156e-07, + "logits/chosen": -1.0514498949050903, + "logits/rejected": -1.0271055698394775, + "logps/chosen": -301.6610107421875, + "logps/rejected": -385.259521484375, + "loss": 0.5134, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8431800603866577, + "rewards/margins": 1.0639536380767822, + "rewards/rejected": -1.90713369846344, + "step": 700 + }, + { + "epoch": 0.17034548944337813, + "grad_norm": 12.567284708252304, + "learning_rate": 4.925101979281332e-07, + "logits/chosen": -0.9216286540031433, + "logits/rejected": -1.0931814908981323, + "logps/chosen": -433.27130126953125, + "logps/rejected": -560.4276123046875, + "loss": 0.5031, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2827928066253662, + "rewards/margins": 1.7925609350204468, + "rewards/rejected": -3.0753538608551025, + "step": 710 + }, + { + "epoch": 0.1727447216890595, + "grad_norm": 11.741747616486963, + "learning_rate": 4.919930172222054e-07, + "logits/chosen": -0.9665408134460449, + "logits/rejected": -1.0536162853240967, + "logps/chosen": -417.403076171875, + "logps/rejected": -570.5277099609375, + "loss": 0.5003, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7012180089950562, + "rewards/margins": 1.562795877456665, + "rewards/rejected": -3.2640137672424316, + "step": 720 + }, + { + "epoch": 0.1751439539347409, + "grad_norm": 11.733941979044587, + "learning_rate": 4.914588616675445e-07, + "logits/chosen": -1.0246481895446777, + "logits/rejected": -1.0158023834228516, + "logps/chosen": -348.80133056640625, + "logps/rejected": -439.29608154296875, + "loss": 0.5572, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0424585342407227, + "rewards/margins": 1.09738290309906, + "rewards/rejected": -2.1398415565490723, + "step": 730 + }, + { + "epoch": 0.17754318618042225, + "grad_norm": 8.84337454037855, + "learning_rate": 4.909077687330404e-07, + "logits/chosen": -0.8301714658737183, + "logits/rejected": -0.9135535359382629, + "logps/chosen": -330.27264404296875, + "logps/rejected": -350.2643127441406, + "loss": 0.5051, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6734127402305603, + "rewards/margins": 0.41529732942581177, + "rewards/rejected": -1.088710069656372, + "step": 740 + }, + { + "epoch": 0.17994241842610365, + "grad_norm": 8.880233466088285, + "learning_rate": 4.903397770756729e-07, + "logits/chosen": -1.0296955108642578, + "logits/rejected": -1.0950233936309814, + "logps/chosen": -355.98968505859375, + "logps/rejected": -452.5968322753906, + "loss": 0.5248, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9854777455329895, + "rewards/margins": 0.9587169885635376, + "rewards/rejected": -1.9441944360733032, + "step": 750 + }, + { + "epoch": 0.18234165067178504, + "grad_norm": 10.737134261298277, + "learning_rate": 4.897549265378004e-07, + "logits/chosen": -0.9651594161987305, + "logits/rejected": -1.0333675146102905, + "logps/chosen": -473.6534729003906, + "logps/rejected": -615.0003662109375, + "loss": 0.4966, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.580303430557251, + "rewards/margins": 1.4406054019927979, + "rewards/rejected": -3.020908832550049, + "step": 760 + }, + { + "epoch": 0.1847408829174664, + "grad_norm": 10.188381001048064, + "learning_rate": 4.891532581443643e-07, + "logits/chosen": -1.1801836490631104, + "logits/rejected": -1.2248878479003906, + "logps/chosen": -438.201171875, + "logps/rejected": -577.3571166992188, + "loss": 0.4856, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3580670356750488, + "rewards/margins": 1.4186818599700928, + "rewards/rejected": -2.7767486572265625, + "step": 770 + }, + { + "epoch": 0.1871401151631478, + "grad_norm": 13.185729895925714, + "learning_rate": 4.885348141000122e-07, + "logits/chosen": -1.06368887424469, + "logits/rejected": -1.0469920635223389, + "logps/chosen": -363.1687316894531, + "logps/rejected": -502.82684326171875, + "loss": 0.4839, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2150341272354126, + "rewards/margins": 1.3010752201080322, + "rewards/rejected": -2.5161094665527344, + "step": 780 + }, + { + "epoch": 0.18953934740882916, + "grad_norm": 13.52232678239933, + "learning_rate": 4.878996377861367e-07, + "logits/chosen": -1.0281708240509033, + "logits/rejected": -1.094001054763794, + "logps/chosen": -321.34814453125, + "logps/rejected": -453.59417724609375, + "loss": 0.5211, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0169535875320435, + "rewards/margins": 1.2799100875854492, + "rewards/rejected": -2.2968640327453613, + "step": 790 + }, + { + "epoch": 0.19193857965451055, + "grad_norm": 8.497366103850682, + "learning_rate": 4.872477737578327e-07, + "logits/chosen": -1.005568504333496, + "logits/rejected": -0.9401241540908813, + "logps/chosen": -398.41546630859375, + "logps/rejected": -612.4130249023438, + "loss": 0.4468, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.1582279205322266, + "rewards/margins": 2.2199604511260986, + "rewards/rejected": -3.378188371658325, + "step": 800 + }, + { + "epoch": 0.19433781190019195, + "grad_norm": 17.573875043998388, + "learning_rate": 4.865792677407718e-07, + "logits/chosen": -1.0809205770492554, + "logits/rejected": -1.1499931812286377, + "logps/chosen": -387.76202392578125, + "logps/rejected": -490.18243408203125, + "loss": 0.5666, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.46564781665802, + "rewards/margins": 1.2134864330291748, + "rewards/rejected": -2.6791341304779053, + "step": 810 + }, + { + "epoch": 0.1967370441458733, + "grad_norm": 8.204522849107846, + "learning_rate": 4.858941666279955e-07, + "logits/chosen": -0.8947283029556274, + "logits/rejected": -0.9740638732910156, + "logps/chosen": -393.2839660644531, + "logps/rejected": -441.0948791503906, + "loss": 0.5435, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2646175622940063, + "rewards/margins": 0.5960845947265625, + "rewards/rejected": -1.8607019186019897, + "step": 820 + }, + { + "epoch": 0.1991362763915547, + "grad_norm": 10.142292221516385, + "learning_rate": 4.851925184766247e-07, + "logits/chosen": -1.053379774093628, + "logits/rejected": -1.1101844310760498, + "logps/chosen": -351.99090576171875, + "logps/rejected": -449.01751708984375, + "loss": 0.4937, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9777849316596985, + "rewards/margins": 1.14426851272583, + "rewards/rejected": -2.122053623199463, + "step": 830 + }, + { + "epoch": 0.20153550863723607, + "grad_norm": 12.034416836065418, + "learning_rate": 4.844743725044897e-07, + "logits/chosen": -1.0946062803268433, + "logits/rejected": -1.2976312637329102, + "logps/chosen": -378.5492248535156, + "logps/rejected": -474.7021484375, + "loss": 0.5084, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2278741598129272, + "rewards/margins": 1.197396159172058, + "rewards/rejected": -2.4252700805664062, + "step": 840 + }, + { + "epoch": 0.20393474088291746, + "grad_norm": 8.290745524509466, + "learning_rate": 4.837397790866774e-07, + "logits/chosen": -1.2077043056488037, + "logits/rejected": -1.2064851522445679, + "logps/chosen": -398.6294250488281, + "logps/rejected": -532.10009765625, + "loss": 0.5617, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.0644519329071045, + "rewards/margins": 1.5777161121368408, + "rewards/rejected": -2.642167806625366, + "step": 850 + }, + { + "epoch": 0.20633397312859886, + "grad_norm": 9.121636380617016, + "learning_rate": 4.829887897519974e-07, + "logits/chosen": -1.2348374128341675, + "logits/rejected": -1.2065662145614624, + "logps/chosen": -323.52520751953125, + "logps/rejected": -456.983642578125, + "loss": 0.5042, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9337062835693359, + "rewards/margins": 1.160902500152588, + "rewards/rejected": -2.094609022140503, + "step": 860 + }, + { + "epoch": 0.20873320537428022, + "grad_norm": 10.124524555819926, + "learning_rate": 4.82221457179368e-07, + "logits/chosen": -1.2232173681259155, + "logits/rejected": -1.2059452533721924, + "logps/chosen": -376.4814147949219, + "logps/rejected": -523.2281494140625, + "loss": 0.4462, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.0080255270004272, + "rewards/margins": 1.6320230960845947, + "rewards/rejected": -2.6400485038757324, + "step": 870 + }, + { + "epoch": 0.21113243761996162, + "grad_norm": 15.383829882801075, + "learning_rate": 4.814378351941206e-07, + "logits/chosen": -1.1030246019363403, + "logits/rejected": -1.165531873703003, + "logps/chosen": -376.2377014160156, + "logps/rejected": -447.36871337890625, + "loss": 0.5179, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1712180376052856, + "rewards/margins": 0.8365498781204224, + "rewards/rejected": -2.007767915725708, + "step": 880 + }, + { + "epoch": 0.21353166986564298, + "grad_norm": 8.924221297687437, + "learning_rate": 4.806379787642241e-07, + "logits/chosen": -1.13001549243927, + "logits/rejected": -1.117497205734253, + "logps/chosen": -358.25286865234375, + "logps/rejected": -490.5638732910156, + "loss": 0.5141, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0755687952041626, + "rewards/margins": 1.3798635005950928, + "rewards/rejected": -2.4554319381713867, + "step": 890 + }, + { + "epoch": 0.21593090211132437, + "grad_norm": 9.091481860281236, + "learning_rate": 4.798219439964293e-07, + "logits/chosen": -1.1416652202606201, + "logits/rejected": -1.2265089750289917, + "logps/chosen": -366.56915283203125, + "logps/rejected": -414.20721435546875, + "loss": 0.476, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.119057059288025, + "rewards/margins": 0.3352181911468506, + "rewards/rejected": -1.454275131225586, + "step": 900 + }, + { + "epoch": 0.21833013435700577, + "grad_norm": 17.10074312802646, + "learning_rate": 4.78989788132333e-07, + "logits/chosen": -1.1152770519256592, + "logits/rejected": -1.1071698665618896, + "logps/chosen": -342.0806884765625, + "logps/rejected": -534.8381958007812, + "loss": 0.4536, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.19199538230896, + "rewards/margins": 1.8661339282989502, + "rewards/rejected": -3.0581297874450684, + "step": 910 + }, + { + "epoch": 0.22072936660268713, + "grad_norm": 9.85354436821844, + "learning_rate": 4.781415695443631e-07, + "logits/chosen": -1.1570137739181519, + "logits/rejected": -1.1956650018692017, + "logps/chosen": -509.591552734375, + "logps/rejected": -693.6530151367188, + "loss": 0.5105, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.156250476837158, + "rewards/margins": 1.8081929683685303, + "rewards/rejected": -3.9644439220428467, + "step": 920 + }, + { + "epoch": 0.22312859884836853, + "grad_norm": 10.93257575281564, + "learning_rate": 4.772773477316836e-07, + "logits/chosen": -1.0778967142105103, + "logits/rejected": -1.133569598197937, + "logps/chosen": -377.20037841796875, + "logps/rejected": -475.40997314453125, + "loss": 0.4849, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1448915004730225, + "rewards/margins": 0.9261860847473145, + "rewards/rejected": -2.071077823638916, + "step": 930 + }, + { + "epoch": 0.2255278310940499, + "grad_norm": 13.493444996052222, + "learning_rate": 4.7639718331602117e-07, + "logits/chosen": -1.0905169248580933, + "logits/rejected": -1.1160600185394287, + "logps/chosen": -434.937744140625, + "logps/rejected": -650.6595458984375, + "loss": 0.501, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5431007146835327, + "rewards/margins": 2.3286290168762207, + "rewards/rejected": -3.871730327606201, + "step": 940 + }, + { + "epoch": 0.22792706333973128, + "grad_norm": 17.479671128658037, + "learning_rate": 4.7550113803741275e-07, + "logits/chosen": -1.1507641077041626, + "logits/rejected": -1.2998677492141724, + "logps/chosen": -448.26251220703125, + "logps/rejected": -497.29779052734375, + "loss": 0.4886, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5616766214370728, + "rewards/margins": 1.1937475204467773, + "rewards/rejected": -2.7554240226745605, + "step": 950 + }, + { + "epoch": 0.23032629558541268, + "grad_norm": 17.248167606427355, + "learning_rate": 4.7458927474987454e-07, + "logits/chosen": -1.0716644525527954, + "logits/rejected": -1.1447921991348267, + "logps/chosen": -426.6358947753906, + "logps/rejected": -470.9082946777344, + "loss": 0.4858, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1936933994293213, + "rewards/margins": 1.0052495002746582, + "rewards/rejected": -2.1989428997039795, + "step": 960 + }, + { + "epoch": 0.23272552783109404, + "grad_norm": 14.729008337765277, + "learning_rate": 4.7366165741699347e-07, + "logits/chosen": -0.9885573387145996, + "logits/rejected": -1.0534632205963135, + "logps/chosen": -458.31378173828125, + "logps/rejected": -540.1591186523438, + "loss": 0.4747, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3210374116897583, + "rewards/margins": 1.1498852968215942, + "rewards/rejected": -2.4709229469299316, + "step": 970 + }, + { + "epoch": 0.23512476007677544, + "grad_norm": 15.150583413082405, + "learning_rate": 4.727183511074401e-07, + "logits/chosen": -1.309410810470581, + "logits/rejected": -1.3342196941375732, + "logps/chosen": -416.07293701171875, + "logps/rejected": -466.21923828125, + "loss": 0.4945, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2828443050384521, + "rewards/margins": 0.6490219235420227, + "rewards/rejected": -1.9318662881851196, + "step": 980 + }, + { + "epoch": 0.2375239923224568, + "grad_norm": 11.648714128278801, + "learning_rate": 4.717594219904043e-07, + "logits/chosen": -1.0633580684661865, + "logits/rejected": -1.179321527481079, + "logps/chosen": -393.45648193359375, + "logps/rejected": -493.12603759765625, + "loss": 0.5057, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2883937358856201, + "rewards/margins": 1.364997148513794, + "rewards/rejected": -2.653390884399414, + "step": 990 + }, + { + "epoch": 0.2399232245681382, + "grad_norm": 10.483921422479957, + "learning_rate": 4.7078493733095393e-07, + "logits/chosen": -1.0601375102996826, + "logits/rejected": -1.1300022602081299, + "logps/chosen": -432.314208984375, + "logps/rejected": -605.5133056640625, + "loss": 0.4784, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7640736103057861, + "rewards/margins": 1.7046064138412476, + "rewards/rejected": -3.4686806201934814, + "step": 1000 + }, + { + "epoch": 0.2423224568138196, + "grad_norm": 17.060662316036694, + "learning_rate": 4.6979496548531614e-07, + "logits/chosen": -1.243939757347107, + "logits/rejected": -1.223625898361206, + "logps/chosen": -446.4366760253906, + "logps/rejected": -638.4450073242188, + "loss": 0.5126, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8022911548614502, + "rewards/margins": 1.4770129919052124, + "rewards/rejected": -3.279303789138794, + "step": 1010 + }, + { + "epoch": 0.24472168905950095, + "grad_norm": 12.903823112622515, + "learning_rate": 4.6878957589608293e-07, + "logits/chosen": -1.1194379329681396, + "logits/rejected": -1.0698894262313843, + "logps/chosen": -409.68603515625, + "logps/rejected": -603.4207763671875, + "loss": 0.5214, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4294629096984863, + "rewards/margins": 1.5245181322097778, + "rewards/rejected": -2.9539809226989746, + "step": 1020 + }, + { + "epoch": 0.24712092130518235, + "grad_norm": 10.384415695069938, + "learning_rate": 4.6776883908733956e-07, + "logits/chosen": -1.1999337673187256, + "logits/rejected": -1.311231255531311, + "logps/chosen": -397.4037780761719, + "logps/rejected": -460.95269775390625, + "loss": 0.4774, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9981710314750671, + "rewards/margins": 1.341308832168579, + "rewards/rejected": -2.339479684829712, + "step": 1030 + }, + { + "epoch": 0.2495201535508637, + "grad_norm": 14.761052654024631, + "learning_rate": 4.667328266597178e-07, + "logits/chosen": -1.1137980222702026, + "logits/rejected": -1.1541672945022583, + "logps/chosen": -391.23504638671875, + "logps/rejected": -516.2595825195312, + "loss": 0.4633, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3899133205413818, + "rewards/margins": 1.2868678569793701, + "rewards/rejected": -2.676781177520752, + "step": 1040 + }, + { + "epoch": 0.2519193857965451, + "grad_norm": 10.417201545289524, + "learning_rate": 4.6568161128537354e-07, + "logits/chosen": -1.0899343490600586, + "logits/rejected": -1.247287631034851, + "logps/chosen": -420.63836669921875, + "logps/rejected": -514.6077270507812, + "loss": 0.4964, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5773645639419556, + "rewards/margins": 1.506489872932434, + "rewards/rejected": -3.0838541984558105, + "step": 1050 + }, + { + "epoch": 0.2543186180422265, + "grad_norm": 16.488420937432345, + "learning_rate": 4.6461526670288877e-07, + "logits/chosen": -1.1203404664993286, + "logits/rejected": -1.1449543237686157, + "logps/chosen": -407.5829162597656, + "logps/rejected": -497.4512634277344, + "loss": 0.493, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3892755508422852, + "rewards/margins": 1.1714465618133545, + "rewards/rejected": -2.5607221126556396, + "step": 1060 + }, + { + "epoch": 0.2567178502879079, + "grad_norm": 9.466776525081485, + "learning_rate": 4.635338677120994e-07, + "logits/chosen": -1.3948705196380615, + "logits/rejected": -1.3905651569366455, + "logps/chosen": -379.10009765625, + "logps/rejected": -558.11328125, + "loss": 0.4566, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.116693139076233, + "rewards/margins": 1.6451349258422852, + "rewards/rejected": -2.7618279457092285, + "step": 1070 + }, + { + "epoch": 0.2591170825335892, + "grad_norm": 14.2505130371337, + "learning_rate": 4.6243749016884835e-07, + "logits/chosen": -1.1959508657455444, + "logits/rejected": -1.2822418212890625, + "logps/chosen": -459.30999755859375, + "logps/rejected": -783.4131469726562, + "loss": 0.5047, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8917354345321655, + "rewards/margins": 2.781670331954956, + "rewards/rejected": -4.673405647277832, + "step": 1080 + }, + { + "epoch": 0.2615163147792706, + "grad_norm": 16.476931783493725, + "learning_rate": 4.613262109796645e-07, + "logits/chosen": -1.2336928844451904, + "logits/rejected": -1.1787619590759277, + "logps/chosen": -439.15838623046875, + "logps/rejected": -729.7471313476562, + "loss": 0.4727, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9186598062515259, + "rewards/margins": 2.601553440093994, + "rewards/rejected": -4.5202131271362305, + "step": 1090 + }, + { + "epoch": 0.263915547024952, + "grad_norm": 12.359629943883897, + "learning_rate": 4.602001080963678e-07, + "logits/chosen": -1.1227662563323975, + "logits/rejected": -1.1912363767623901, + "logps/chosen": -392.65057373046875, + "logps/rejected": -619.9832763671875, + "loss": 0.4556, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2552802562713623, + "rewards/margins": 2.460552215576172, + "rewards/rejected": -3.715832233428955, + "step": 1100 + }, + { + "epoch": 0.2663147792706334, + "grad_norm": 19.000478036791087, + "learning_rate": 4.590592605106017e-07, + "logits/chosen": -1.1630818843841553, + "logits/rejected": -1.2058923244476318, + "logps/chosen": -423.0645446777344, + "logps/rejected": -619.4017333984375, + "loss": 0.5008, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3737356662750244, + "rewards/margins": 2.270371198654175, + "rewards/rejected": -3.6441073417663574, + "step": 1110 + }, + { + "epoch": 0.2687140115163148, + "grad_norm": 11.988638837694287, + "learning_rate": 4.5790374824829165e-07, + "logits/chosen": -1.1366350650787354, + "logits/rejected": -1.2177644968032837, + "logps/chosen": -313.95233154296875, + "logps/rejected": -553.4268798828125, + "loss": 0.5165, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2247995138168335, + "rewards/margins": 2.2755985260009766, + "rewards/rejected": -3.5003979206085205, + "step": 1120 + }, + { + "epoch": 0.27111324376199614, + "grad_norm": 13.389751916846311, + "learning_rate": 4.5673365236403216e-07, + "logits/chosen": -1.1376798152923584, + "logits/rejected": -1.2225024700164795, + "logps/chosen": -418.84930419921875, + "logps/rejected": -644.3150024414062, + "loss": 0.4922, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1243631839752197, + "rewards/margins": 1.9992843866348267, + "rewards/rejected": -4.123647212982178, + "step": 1130 + }, + { + "epoch": 0.27351247600767753, + "grad_norm": 12.066478642007997, + "learning_rate": 4.5554905493540075e-07, + "logits/chosen": -1.393936038017273, + "logits/rejected": -1.4359791278839111, + "logps/chosen": -366.07171630859375, + "logps/rejected": -646.8983154296875, + "loss": 0.4248, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4785449504852295, + "rewards/margins": 2.75530743598938, + "rewards/rejected": -4.233852386474609, + "step": 1140 + }, + { + "epoch": 0.2759117082533589, + "grad_norm": 13.594931228595337, + "learning_rate": 4.5435003905720074e-07, + "logits/chosen": -1.3401994705200195, + "logits/rejected": -1.4189189672470093, + "logps/chosen": -555.1502685546875, + "logps/rejected": -733.55908203125, + "loss": 0.4941, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.782365560531616, + "rewards/margins": 2.0489964485168457, + "rewards/rejected": -4.831361770629883, + "step": 1150 + }, + { + "epoch": 0.2783109404990403, + "grad_norm": 10.959828441128586, + "learning_rate": 4.531366888356324e-07, + "logits/chosen": -1.3572887182235718, + "logits/rejected": -1.316543459892273, + "logps/chosen": -379.75048828125, + "logps/rejected": -709.0888061523438, + "loss": 0.421, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.861644983291626, + "rewards/margins": 2.909771680831909, + "rewards/rejected": -4.771416664123535, + "step": 1160 + }, + { + "epoch": 0.2807101727447217, + "grad_norm": 11.9806643935228, + "learning_rate": 4.519090893823931e-07, + "logits/chosen": -1.2535573244094849, + "logits/rejected": -1.2999234199523926, + "logps/chosen": -466.22979736328125, + "logps/rejected": -587.1083984375, + "loss": 0.4706, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.177558422088623, + "rewards/margins": 1.3151285648345947, + "rewards/rejected": -3.4926867485046387, + "step": 1170 + }, + { + "epoch": 0.28310940499040305, + "grad_norm": 15.64676518237564, + "learning_rate": 4.5066732680870734e-07, + "logits/chosen": -1.1491421461105347, + "logits/rejected": -1.2824984788894653, + "logps/chosen": -410.51129150390625, + "logps/rejected": -574.32861328125, + "loss": 0.4552, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5698862075805664, + "rewards/margins": 2.0116872787475586, + "rewards/rejected": -3.581573486328125, + "step": 1180 + }, + { + "epoch": 0.28550863723608444, + "grad_norm": 14.866463710844934, + "learning_rate": 4.494114882192862e-07, + "logits/chosen": -1.1980191469192505, + "logits/rejected": -1.2435463666915894, + "logps/chosen": -425.57733154296875, + "logps/rejected": -676.6328735351562, + "loss": 0.456, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.72724187374115, + "rewards/margins": 2.7374391555786133, + "rewards/rejected": -4.4646806716918945, + "step": 1190 + }, + { + "epoch": 0.28790786948176583, + "grad_norm": 13.193324388633975, + "learning_rate": 4.4814166170621735e-07, + "logits/chosen": -1.4249297380447388, + "logits/rejected": -1.497604489326477, + "logps/chosen": -480.2257385253906, + "logps/rejected": -622.1727294921875, + "loss": 0.4656, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2823410034179688, + "rewards/margins": 1.655609130859375, + "rewards/rejected": -3.9379496574401855, + "step": 1200 + }, + { + "epoch": 0.2903071017274472, + "grad_norm": 20.768443614327058, + "learning_rate": 4.468579363427858e-07, + "logits/chosen": -1.2781771421432495, + "logits/rejected": -1.3352419137954712, + "logps/chosen": -420.2730407714844, + "logps/rejected": -640.3319091796875, + "loss": 0.4615, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5896456241607666, + "rewards/margins": 2.4936366081237793, + "rewards/rejected": -4.083281517028809, + "step": 1210 + }, + { + "epoch": 0.2927063339731286, + "grad_norm": 9.971205156379035, + "learning_rate": 4.4556040217722555e-07, + "logits/chosen": -1.236127257347107, + "logits/rejected": -1.2078077793121338, + "logps/chosen": -356.2068786621094, + "logps/rejected": -533.8853759765625, + "loss": 0.4674, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1397576332092285, + "rewards/margins": 1.5811337232589722, + "rewards/rejected": -2.7208914756774902, + "step": 1220 + }, + { + "epoch": 0.29510556621880996, + "grad_norm": 12.163666083646278, + "learning_rate": 4.442491502264033e-07, + "logits/chosen": -1.1941020488739014, + "logits/rejected": -1.2168632745742798, + "logps/chosen": -363.7603454589844, + "logps/rejected": -455.57867431640625, + "loss": 0.4612, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3447378873825073, + "rewards/margins": 1.0612128973007202, + "rewards/rejected": -2.4059505462646484, + "step": 1230 + }, + { + "epoch": 0.29750479846449135, + "grad_norm": 9.273351878722064, + "learning_rate": 4.429242724694338e-07, + "logits/chosen": -1.3338253498077393, + "logits/rejected": -1.336753010749817, + "logps/chosen": -395.6485595703125, + "logps/rejected": -632.2241821289062, + "loss": 0.454, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4914841651916504, + "rewards/margins": 2.230437994003296, + "rewards/rejected": -3.7219223976135254, + "step": 1240 + }, + { + "epoch": 0.29990403071017274, + "grad_norm": 25.05777651266455, + "learning_rate": 4.4158586184122817e-07, + "logits/chosen": -1.1749566793441772, + "logits/rejected": -1.271209478378296, + "logps/chosen": -429.14306640625, + "logps/rejected": -604.9749755859375, + "loss": 0.4598, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3869397640228271, + "rewards/margins": 2.094210386276245, + "rewards/rejected": -3.4811503887176514, + "step": 1250 + }, + { + "epoch": 0.30230326295585414, + "grad_norm": 17.369219336254258, + "learning_rate": 4.4023401222597443e-07, + "logits/chosen": -1.027753233909607, + "logits/rejected": -1.1785060167312622, + "logps/chosen": -425.7173767089844, + "logps/rejected": -544.3839721679688, + "loss": 0.4667, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5711647272109985, + "rewards/margins": 1.3786556720733643, + "rewards/rejected": -2.9498205184936523, + "step": 1260 + }, + { + "epoch": 0.30470249520153553, + "grad_norm": 11.861504310327616, + "learning_rate": 4.3886881845055235e-07, + "logits/chosen": -1.1897116899490356, + "logits/rejected": -1.299993872642517, + "logps/chosen": -375.5953674316406, + "logps/rejected": -659.8858032226562, + "loss": 0.4618, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3118809461593628, + "rewards/margins": 2.890842914581299, + "rewards/rejected": -4.202723979949951, + "step": 1270 + }, + { + "epoch": 0.30710172744721687, + "grad_norm": 10.8218616423801, + "learning_rate": 4.374903762778814e-07, + "logits/chosen": -1.3571842908859253, + "logits/rejected": -1.385545253753662, + "logps/chosen": -494.83709716796875, + "logps/rejected": -658.3067016601562, + "loss": 0.4837, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.376838445663452, + "rewards/margins": 1.8850828409194946, + "rewards/rejected": -4.261920928955078, + "step": 1280 + }, + { + "epoch": 0.30950095969289826, + "grad_norm": 10.14042165637657, + "learning_rate": 4.3609878240020356e-07, + "logits/chosen": -1.2036101818084717, + "logits/rejected": -1.3167588710784912, + "logps/chosen": -486.25811767578125, + "logps/rejected": -651.5155639648438, + "loss": 0.4531, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9182531833648682, + "rewards/margins": 2.2668986320495605, + "rewards/rejected": -4.18515157699585, + "step": 1290 + }, + { + "epoch": 0.31190019193857965, + "grad_norm": 11.299969604518076, + "learning_rate": 4.346941344323005e-07, + "logits/chosen": -1.376947045326233, + "logits/rejected": -1.4665632247924805, + "logps/chosen": -451.6893615722656, + "logps/rejected": -499.0320739746094, + "loss": 0.4913, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.929490089416504, + "rewards/margins": 1.0548813343048096, + "rewards/rejected": -2.9843716621398926, + "step": 1300 + }, + { + "epoch": 0.31429942418426104, + "grad_norm": 11.980383531649544, + "learning_rate": 4.332765309046467e-07, + "logits/chosen": -1.332617998123169, + "logits/rejected": -1.3639962673187256, + "logps/chosen": -426.499755859375, + "logps/rejected": -626.6729736328125, + "loss": 0.474, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.678342580795288, + "rewards/margins": 2.378535747528076, + "rewards/rejected": -4.056878566741943, + "step": 1310 + }, + { + "epoch": 0.31669865642994244, + "grad_norm": 14.013248425816212, + "learning_rate": 4.3184607125649754e-07, + "logits/chosen": -1.2517986297607422, + "logits/rejected": -1.3113311529159546, + "logps/chosen": -414.6436462402344, + "logps/rejected": -657.0100708007812, + "loss": 0.4847, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3406541347503662, + "rewards/margins": 2.270404815673828, + "rewards/rejected": -3.6110591888427734, + "step": 1320 + }, + { + "epoch": 0.3190978886756238, + "grad_norm": 15.333659280580797, + "learning_rate": 4.304028558289141e-07, + "logits/chosen": -1.4721230268478394, + "logits/rejected": -1.506519079208374, + "logps/chosen": -451.8147888183594, + "logps/rejected": -667.3796997070312, + "loss": 0.448, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6051162481307983, + "rewards/margins": 2.3636975288391113, + "rewards/rejected": -3.96881365776062, + "step": 1330 + }, + { + "epoch": 0.32149712092130517, + "grad_norm": 12.587601683578118, + "learning_rate": 4.28946985857725e-07, + "logits/chosen": -1.5413029193878174, + "logits/rejected": -1.5675899982452393, + "logps/chosen": -508.3623046875, + "logps/rejected": -787.8271484375, + "loss": 0.4284, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.329979658126831, + "rewards/margins": 2.8664588928222656, + "rewards/rejected": -5.196438789367676, + "step": 1340 + }, + { + "epoch": 0.32389635316698656, + "grad_norm": 11.206457061422094, + "learning_rate": 4.2747856346642445e-07, + "logits/chosen": -1.1610701084136963, + "logits/rejected": -1.1690763235092163, + "logps/chosen": -401.5397033691406, + "logps/rejected": -605.7808227539062, + "loss": 0.4051, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7380218505859375, + "rewards/margins": 2.2142810821533203, + "rewards/rejected": -3.952302932739258, + "step": 1350 + }, + { + "epoch": 0.32629558541266795, + "grad_norm": 20.124180714207323, + "learning_rate": 4.2599769165900933e-07, + "logits/chosen": -1.1436104774475098, + "logits/rejected": -1.193645715713501, + "logps/chosen": -501.67706298828125, + "logps/rejected": -850.97314453125, + "loss": 0.4949, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.594907283782959, + "rewards/margins": 3.6968417167663574, + "rewards/rejected": -6.291749000549316, + "step": 1360 + }, + { + "epoch": 0.32869481765834935, + "grad_norm": 8.824763672957205, + "learning_rate": 4.245044743127535e-07, + "logits/chosen": -1.2460205554962158, + "logits/rejected": -1.1843246221542358, + "logps/chosen": -402.1271667480469, + "logps/rejected": -616.098388671875, + "loss": 0.4785, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.439296007156372, + "rewards/margins": 2.2133936882019043, + "rewards/rejected": -3.6526896953582764, + "step": 1370 + }, + { + "epoch": 0.3310940499040307, + "grad_norm": 14.598877417461923, + "learning_rate": 4.229990161709214e-07, + "logits/chosen": -1.2217421531677246, + "logits/rejected": -1.130197286605835, + "logps/chosen": -352.7733459472656, + "logps/rejected": -632.5406494140625, + "loss": 0.463, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2021641731262207, + "rewards/margins": 2.610610246658325, + "rewards/rejected": -3.8127739429473877, + "step": 1380 + }, + { + "epoch": 0.3334932821497121, + "grad_norm": 11.865002342507843, + "learning_rate": 4.214814228354204e-07, + "logits/chosen": -1.342158555984497, + "logits/rejected": -1.390978217124939, + "logps/chosen": -457.16705322265625, + "logps/rejected": -833.06787109375, + "loss": 0.4364, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.8238433599472046, + "rewards/margins": 3.916942596435547, + "rewards/rejected": -5.740786075592041, + "step": 1390 + }, + { + "epoch": 0.33589251439539347, + "grad_norm": 12.72331809654516, + "learning_rate": 4.1995180075939375e-07, + "logits/chosen": -1.4785504341125488, + "logits/rejected": -1.4822914600372314, + "logps/chosen": -445.6946716308594, + "logps/rejected": -639.58837890625, + "loss": 0.4643, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7562789916992188, + "rewards/margins": 2.1691954135894775, + "rewards/rejected": -3.925474166870117, + "step": 1400 + }, + { + "epoch": 0.33829174664107486, + "grad_norm": 11.703724239093889, + "learning_rate": 4.1841025723975297e-07, + "logits/chosen": -1.1503616571426392, + "logits/rejected": -1.1993227005004883, + "logps/chosen": -395.007080078125, + "logps/rejected": -638.5355224609375, + "loss": 0.4344, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1847431659698486, + "rewards/margins": 2.6262497901916504, + "rewards/rejected": -3.810993194580078, + "step": 1410 + }, + { + "epoch": 0.34069097888675626, + "grad_norm": 19.639469762609966, + "learning_rate": 4.168569004096516e-07, + "logits/chosen": -1.2111709117889404, + "logits/rejected": -1.1946176290512085, + "logps/chosen": -405.03558349609375, + "logps/rejected": -638.8396606445312, + "loss": 0.4405, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7199838161468506, + "rewards/margins": 2.166039228439331, + "rewards/rejected": -3.886023759841919, + "step": 1420 + }, + { + "epoch": 0.3430902111324376, + "grad_norm": 12.824816837365054, + "learning_rate": 4.152918392308997e-07, + "logits/chosen": -1.4239578247070312, + "logits/rejected": -1.4245679378509521, + "logps/chosen": -430.7586364746094, + "logps/rejected": -618.7017822265625, + "loss": 0.4354, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7669651508331299, + "rewards/margins": 1.9917770624160767, + "rewards/rejected": -3.758742094039917, + "step": 1430 + }, + { + "epoch": 0.345489443378119, + "grad_norm": 17.48149790816653, + "learning_rate": 4.137151834863213e-07, + "logits/chosen": -1.307308554649353, + "logits/rejected": -1.2468369007110596, + "logps/chosen": -443.11370849609375, + "logps/rejected": -816.6782836914062, + "loss": 0.5016, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0893607139587402, + "rewards/margins": 3.403632640838623, + "rewards/rejected": -5.492993354797363, + "step": 1440 + }, + { + "epoch": 0.3478886756238004, + "grad_norm": 13.277912286181408, + "learning_rate": 4.121270437720526e-07, + "logits/chosen": -1.192663550376892, + "logits/rejected": -1.184259295463562, + "logps/chosen": -402.0312194824219, + "logps/rejected": -540.152587890625, + "loss": 0.4536, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9467941522598267, + "rewards/margins": 0.9955675005912781, + "rewards/rejected": -2.942361354827881, + "step": 1450 + }, + { + "epoch": 0.3502879078694818, + "grad_norm": 12.024084039884944, + "learning_rate": 4.105275314897852e-07, + "logits/chosen": -1.3382481336593628, + "logits/rejected": -1.3211191892623901, + "logps/chosen": -403.6156005859375, + "logps/rejected": -820.3019409179688, + "loss": 0.4487, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7442741394042969, + "rewards/margins": 3.908841609954834, + "rewards/rejected": -5.653115272521973, + "step": 1460 + }, + { + "epoch": 0.35268714011516317, + "grad_norm": 10.938380648082374, + "learning_rate": 4.089167588389508e-07, + "logits/chosen": -1.0116260051727295, + "logits/rejected": -1.1302238702774048, + "logps/chosen": -524.224853515625, + "logps/rejected": -730.9097900390625, + "loss": 0.4594, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8979120254516602, + "rewards/margins": 2.4903345108032227, + "rewards/rejected": -4.388247489929199, + "step": 1470 + }, + { + "epoch": 0.3550863723608445, + "grad_norm": 17.953634346330745, + "learning_rate": 4.072948388088515e-07, + "logits/chosen": -1.1399272680282593, + "logits/rejected": -1.185240387916565, + "logps/chosen": -480.47869873046875, + "logps/rejected": -711.10107421875, + "loss": 0.4749, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0881600379943848, + "rewards/margins": 2.2733359336853027, + "rewards/rejected": -4.3614959716796875, + "step": 1480 + }, + { + "epoch": 0.3574856046065259, + "grad_norm": 15.479182257867892, + "learning_rate": 4.056618851707334e-07, + "logits/chosen": -1.2174913883209229, + "logits/rejected": -1.3078429698944092, + "logps/chosen": -448.73626708984375, + "logps/rejected": -778.0083618164062, + "loss": 0.4081, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7068407535552979, + "rewards/margins": 3.3156890869140625, + "rewards/rejected": -5.022529602050781, + "step": 1490 + }, + { + "epoch": 0.3598848368522073, + "grad_norm": 12.258518175047803, + "learning_rate": 4.0401801246980675e-07, + "logits/chosen": -1.3552016019821167, + "logits/rejected": -1.4195324182510376, + "logps/chosen": -483.03607177734375, + "logps/rejected": -829.8291015625, + "loss": 0.4493, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.638827323913574, + "rewards/margins": 3.691737651824951, + "rewards/rejected": -6.330564498901367, + "step": 1500 + }, + { + "epoch": 0.3622840690978887, + "grad_norm": 12.234674147688299, + "learning_rate": 4.0236333601721043e-07, + "logits/chosen": -1.267978310585022, + "logits/rejected": -1.2512781620025635, + "logps/chosen": -463.58636474609375, + "logps/rejected": -589.7911376953125, + "loss": 0.4873, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8561378717422485, + "rewards/margins": 1.1768932342529297, + "rewards/rejected": -3.0330309867858887, + "step": 1510 + }, + { + "epoch": 0.3646833013435701, + "grad_norm": 14.004595080556923, + "learning_rate": 4.0069797188192364e-07, + "logits/chosen": -1.1959376335144043, + "logits/rejected": -1.1977354288101196, + "logps/chosen": -557.3121337890625, + "logps/rejected": -894.349609375, + "loss": 0.4741, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.671919345855713, + "rewards/margins": 3.7750651836395264, + "rewards/rejected": -6.446984767913818, + "step": 1520 + }, + { + "epoch": 0.3670825335892514, + "grad_norm": 12.107627085595226, + "learning_rate": 3.9902203688262417e-07, + "logits/chosen": -1.2063888311386108, + "logits/rejected": -1.294390082359314, + "logps/chosen": -402.6402282714844, + "logps/rejected": -547.6096801757812, + "loss": 0.4266, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.433250904083252, + "rewards/margins": 1.5607125759124756, + "rewards/rejected": -2.9939634799957275, + "step": 1530 + }, + { + "epoch": 0.3694817658349328, + "grad_norm": 17.020506697194886, + "learning_rate": 3.9733564857949365e-07, + "logits/chosen": -1.2349357604980469, + "logits/rejected": -1.3519177436828613, + "logps/chosen": -479.85992431640625, + "logps/rejected": -652.7718505859375, + "loss": 0.4142, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8144111633300781, + "rewards/margins": 2.1435914039611816, + "rewards/rejected": -3.958002805709839, + "step": 1540 + }, + { + "epoch": 0.3718809980806142, + "grad_norm": 17.340247743555018, + "learning_rate": 3.9563892526597177e-07, + "logits/chosen": -1.342357873916626, + "logits/rejected": -1.2987167835235596, + "logps/chosen": -361.8048400878906, + "logps/rejected": -495.60577392578125, + "loss": 0.4381, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3809950351715088, + "rewards/margins": 0.9643322825431824, + "rewards/rejected": -2.345327138900757, + "step": 1550 + }, + { + "epoch": 0.3742802303262956, + "grad_norm": 12.57132648844664, + "learning_rate": 3.9393198596045795e-07, + "logits/chosen": -1.2646602392196655, + "logits/rejected": -1.227052927017212, + "logps/chosen": -390.56402587890625, + "logps/rejected": -566.3883666992188, + "loss": 0.4795, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.529387354850769, + "rewards/margins": 1.646651029586792, + "rewards/rejected": -3.176038980484009, + "step": 1560 + }, + { + "epoch": 0.376679462571977, + "grad_norm": 9.769186078821475, + "learning_rate": 3.922149503979628e-07, + "logits/chosen": -1.0911178588867188, + "logits/rejected": -1.1452914476394653, + "logps/chosen": -446.0682678222656, + "logps/rejected": -891.4172973632812, + "loss": 0.421, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7286949157714844, + "rewards/margins": 4.412930011749268, + "rewards/rejected": -6.141624927520752, + "step": 1570 + }, + { + "epoch": 0.3790786948176583, + "grad_norm": 15.13387973536505, + "learning_rate": 3.904879390217095e-07, + "logits/chosen": -1.2228879928588867, + "logits/rejected": -1.2863072156906128, + "logps/chosen": -414.58251953125, + "logps/rejected": -585.94189453125, + "loss": 0.4376, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5823562145233154, + "rewards/margins": 1.9467941522598267, + "rewards/rejected": -3.529151201248169, + "step": 1580 + }, + { + "epoch": 0.3814779270633397, + "grad_norm": 15.692773841294583, + "learning_rate": 3.8875107297468463e-07, + "logits/chosen": -1.1607332229614258, + "logits/rejected": -1.1135156154632568, + "logps/chosen": -394.6246032714844, + "logps/rejected": -685.8594970703125, + "loss": 0.4893, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5657716989517212, + "rewards/margins": 2.4296836853027344, + "rewards/rejected": -3.995455503463745, + "step": 1590 + }, + { + "epoch": 0.3838771593090211, + "grad_norm": 12.22013342174461, + "learning_rate": 3.87004474091141e-07, + "logits/chosen": -1.0785077810287476, + "logits/rejected": -1.145101547241211, + "logps/chosen": -388.1426696777344, + "logps/rejected": -562.6093139648438, + "loss": 0.45, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6640561819076538, + "rewards/margins": 1.593210220336914, + "rewards/rejected": -3.2572665214538574, + "step": 1600 + }, + { + "epoch": 0.3862763915547025, + "grad_norm": 12.125863891345588, + "learning_rate": 3.8524826488805114e-07, + "logits/chosen": -1.2912501096725464, + "logits/rejected": -1.281894326210022, + "logps/chosen": -448.44403076171875, + "logps/rejected": -577.4244384765625, + "loss": 0.4996, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6368385553359985, + "rewards/margins": 1.7040477991104126, + "rewards/rejected": -3.3408865928649902, + "step": 1610 + }, + { + "epoch": 0.3886756238003839, + "grad_norm": 14.772144560930354, + "learning_rate": 3.834825685565133e-07, + "logits/chosen": -1.2755136489868164, + "logits/rejected": -1.3778313398361206, + "logps/chosen": -365.7793273925781, + "logps/rejected": -445.6787109375, + "loss": 0.4148, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2803051471710205, + "rewards/margins": 1.1510274410247803, + "rewards/rejected": -2.43133282661438, + "step": 1620 + }, + { + "epoch": 0.39107485604606523, + "grad_norm": 19.441822036099662, + "learning_rate": 3.8170750895311007e-07, + "logits/chosen": -1.168717622756958, + "logits/rejected": -1.1627219915390015, + "logps/chosen": -418.5533752441406, + "logps/rejected": -577.6104736328125, + "loss": 0.4001, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3612974882125854, + "rewards/margins": 1.8237950801849365, + "rewards/rejected": -3.1850924491882324, + "step": 1630 + }, + { + "epoch": 0.3934740882917466, + "grad_norm": 15.432355872695538, + "learning_rate": 3.7992321059122045e-07, + "logits/chosen": -1.1575825214385986, + "logits/rejected": -1.2952228784561157, + "logps/chosen": -471.82476806640625, + "logps/rejected": -670.2379150390625, + "loss": 0.4553, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2020068168640137, + "rewards/margins": 2.1685726642608643, + "rewards/rejected": -4.370579242706299, + "step": 1640 + }, + { + "epoch": 0.395873320537428, + "grad_norm": 12.344277319747519, + "learning_rate": 3.7812979863228576e-07, + "logits/chosen": -1.3181376457214355, + "logits/rejected": -1.3618929386138916, + "logps/chosen": -485.92034912109375, + "logps/rejected": -651.4783935546875, + "loss": 0.4458, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.687350034713745, + "rewards/margins": 1.5618057250976562, + "rewards/rejected": -4.2491559982299805, + "step": 1650 + }, + { + "epoch": 0.3982725527831094, + "grad_norm": 15.718103189453073, + "learning_rate": 3.763273988770296e-07, + "logits/chosen": -1.1789578199386597, + "logits/rejected": -1.2662181854248047, + "logps/chosen": -411.53680419921875, + "logps/rejected": -600.5362548828125, + "loss": 0.4555, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7146021127700806, + "rewards/margins": 1.884450912475586, + "rewards/rejected": -3.599053144454956, + "step": 1660 + }, + { + "epoch": 0.4006717850287908, + "grad_norm": 12.906974103488265, + "learning_rate": 3.7451613775663405e-07, + "logits/chosen": -1.1591131687164307, + "logits/rejected": -1.1079394817352295, + "logps/chosen": -392.81610107421875, + "logps/rejected": -686.8541259765625, + "loss": 0.4617, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5098912715911865, + "rewards/margins": 2.9055066108703613, + "rewards/rejected": -4.415398120880127, + "step": 1670 + }, + { + "epoch": 0.40307101727447214, + "grad_norm": 17.744184430696986, + "learning_rate": 3.726961423238706e-07, + "logits/chosen": -1.2879854440689087, + "logits/rejected": -1.2753901481628418, + "logps/chosen": -382.4233703613281, + "logps/rejected": -630.2597045898438, + "loss": 0.4446, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5524839162826538, + "rewards/margins": 2.3134608268737793, + "rewards/rejected": -3.8659446239471436, + "step": 1680 + }, + { + "epoch": 0.40547024952015354, + "grad_norm": 15.938147948338413, + "learning_rate": 3.708675402441882e-07, + "logits/chosen": -1.146555781364441, + "logits/rejected": -1.3221074342727661, + "logps/chosen": -438.2669372558594, + "logps/rejected": -592.5673217773438, + "loss": 0.4849, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6606773138046265, + "rewards/margins": 1.8763577938079834, + "rewards/rejected": -3.5370349884033203, + "step": 1690 + }, + { + "epoch": 0.40786948176583493, + "grad_norm": 15.397723594978256, + "learning_rate": 3.6903045978675775e-07, + "logits/chosen": -1.1809333562850952, + "logits/rejected": -1.2159626483917236, + "logps/chosen": -393.14300537109375, + "logps/rejected": -634.4575805664062, + "loss": 0.4445, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5431654453277588, + "rewards/margins": 2.6182663440704346, + "rewards/rejected": -4.161431789398193, + "step": 1700 + }, + { + "epoch": 0.4102687140115163, + "grad_norm": 12.520789981032921, + "learning_rate": 3.6718502981547474e-07, + "logits/chosen": -1.2585941553115845, + "logits/rejected": -1.2499480247497559, + "logps/chosen": -395.173828125, + "logps/rejected": -598.7399291992188, + "loss": 0.4262, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.478262186050415, + "rewards/margins": 1.6322393417358398, + "rewards/rejected": -3.110501527786255, + "step": 1710 + }, + { + "epoch": 0.4126679462571977, + "grad_norm": 14.28113094156497, + "learning_rate": 3.6533137977991986e-07, + "logits/chosen": -1.1053855419158936, + "logits/rejected": -1.1341431140899658, + "logps/chosen": -424.7064514160156, + "logps/rejected": -587.2310791015625, + "loss": 0.5011, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4273463487625122, + "rewards/margins": 1.3417994976043701, + "rewards/rejected": -2.769145965576172, + "step": 1720 + }, + { + "epoch": 0.41506717850287905, + "grad_norm": 11.265817004608927, + "learning_rate": 3.6346963970627865e-07, + "logits/chosen": -1.1037083864212036, + "logits/rejected": -1.0224764347076416, + "logps/chosen": -393.5522766113281, + "logps/rejected": -614.2374877929688, + "loss": 0.4456, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5364679098129272, + "rewards/margins": 2.105459690093994, + "rewards/rejected": -3.641927719116211, + "step": 1730 + }, + { + "epoch": 0.41746641074856045, + "grad_norm": 11.977913591571605, + "learning_rate": 3.615999401882207e-07, + "logits/chosen": -1.3552181720733643, + "logits/rejected": -1.3371044397354126, + "logps/chosen": -418.01397705078125, + "logps/rejected": -775.3851318359375, + "loss": 0.453, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8135935068130493, + "rewards/margins": 3.5035622119903564, + "rewards/rejected": -5.317155361175537, + "step": 1740 + }, + { + "epoch": 0.41986564299424184, + "grad_norm": 11.503365691015834, + "learning_rate": 3.597224123777389e-07, + "logits/chosen": -1.2816386222839355, + "logits/rejected": -1.2878687381744385, + "logps/chosen": -511.79852294921875, + "logps/rejected": -887.9742431640625, + "loss": 0.447, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.595597743988037, + "rewards/margins": 3.664611339569092, + "rewards/rejected": -6.260209083557129, + "step": 1750 + }, + { + "epoch": 0.42226487523992323, + "grad_norm": 16.17377166072833, + "learning_rate": 3.5783718797595e-07, + "logits/chosen": -1.2984836101531982, + "logits/rejected": -1.399364709854126, + "logps/chosen": -505.53204345703125, + "logps/rejected": -702.7299194335938, + "loss": 0.4559, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.0332143306732178, + "rewards/margins": 2.416463851928711, + "rewards/rejected": -4.449678421020508, + "step": 1760 + }, + { + "epoch": 0.4246641074856046, + "grad_norm": 12.49839489285334, + "learning_rate": 3.559443992238558e-07, + "logits/chosen": -1.3506227731704712, + "logits/rejected": -1.409407615661621, + "logps/chosen": -397.6769714355469, + "logps/rejected": -840.98095703125, + "loss": 0.4406, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.526439905166626, + "rewards/margins": 4.196308135986328, + "rewards/rejected": -5.722747802734375, + "step": 1770 + }, + { + "epoch": 0.42706333973128596, + "grad_norm": 10.138858801029569, + "learning_rate": 3.540441788930673e-07, + "logits/chosen": -1.3410276174545288, + "logits/rejected": -1.414222002029419, + "logps/chosen": -491.10986328125, + "logps/rejected": -747.819091796875, + "loss": 0.4169, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9301306009292603, + "rewards/margins": 2.9560461044311523, + "rewards/rejected": -4.886176109313965, + "step": 1780 + }, + { + "epoch": 0.42946257197696736, + "grad_norm": 14.446911222851618, + "learning_rate": 3.5213666027649123e-07, + "logits/chosen": -1.3879473209381104, + "logits/rejected": -1.5012261867523193, + "logps/chosen": -495.0065002441406, + "logps/rejected": -607.099853515625, + "loss": 0.4593, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.185396194458008, + "rewards/margins": 1.5175437927246094, + "rewards/rejected": -3.702939510345459, + "step": 1790 + }, + { + "epoch": 0.43186180422264875, + "grad_norm": 15.239290825165414, + "learning_rate": 3.5022197717898017e-07, + "logits/chosen": -1.2657089233398438, + "logits/rejected": -1.4074336290359497, + "logps/chosen": -394.63385009765625, + "logps/rejected": -730.5569458007812, + "loss": 0.3917, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7406047582626343, + "rewards/margins": 3.7480709552764893, + "rewards/rejected": -5.488675594329834, + "step": 1800 + }, + { + "epoch": 0.43426103646833014, + "grad_norm": 18.539564378032264, + "learning_rate": 3.4830026390794633e-07, + "logits/chosen": -1.373583436012268, + "logits/rejected": -1.449741244316101, + "logps/chosen": -525.2714233398438, + "logps/rejected": -705.874755859375, + "loss": 0.4023, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3716514110565186, + "rewards/margins": 2.285885810852051, + "rewards/rejected": -4.657536506652832, + "step": 1810 + }, + { + "epoch": 0.43666026871401153, + "grad_norm": 15.392536337629252, + "learning_rate": 3.4637165526394104e-07, + "logits/chosen": -1.4993913173675537, + "logits/rejected": -1.5507056713104248, + "logps/chosen": -438.17315673828125, + "logps/rejected": -669.3710327148438, + "loss": 0.4354, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0398783683776855, + "rewards/margins": 2.2368016242980957, + "rewards/rejected": -4.276679515838623, + "step": 1820 + }, + { + "epoch": 0.43905950095969287, + "grad_norm": 10.083282846912516, + "learning_rate": 3.4443628653119814e-07, + "logits/chosen": -1.2605036497116089, + "logits/rejected": -1.272882103919983, + "logps/chosen": -459.35870361328125, + "logps/rejected": -793.3179931640625, + "loss": 0.4781, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9037319421768188, + "rewards/margins": 2.950524091720581, + "rewards/rejected": -4.854256629943848, + "step": 1830 + }, + { + "epoch": 0.44145873320537427, + "grad_norm": 17.777066285729536, + "learning_rate": 3.424942934681453e-07, + "logits/chosen": -1.2640819549560547, + "logits/rejected": -1.4133803844451904, + "logps/chosen": -365.7422790527344, + "logps/rejected": -581.9953002929688, + "loss": 0.4291, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3039973974227905, + "rewards/margins": 2.1755900382995605, + "rewards/rejected": -3.4795870780944824, + "step": 1840 + }, + { + "epoch": 0.44385796545105566, + "grad_norm": 21.190089730860404, + "learning_rate": 3.405458122978804e-07, + "logits/chosen": -1.2760121822357178, + "logits/rejected": -1.3037431240081787, + "logps/chosen": -427.647216796875, + "logps/rejected": -589.5700073242188, + "loss": 0.4056, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.554579496383667, + "rewards/margins": 1.9800221920013428, + "rewards/rejected": -3.5346016883850098, + "step": 1850 + }, + { + "epoch": 0.44625719769673705, + "grad_norm": 19.52879089362984, + "learning_rate": 3.3859097969861633e-07, + "logits/chosen": -1.2224397659301758, + "logits/rejected": -1.2577435970306396, + "logps/chosen": -464.634033203125, + "logps/rejected": -659.4881591796875, + "loss": 0.4462, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7432029247283936, + "rewards/margins": 2.287348508834839, + "rewards/rejected": -4.030551433563232, + "step": 1860 + }, + { + "epoch": 0.44865642994241844, + "grad_norm": 15.71172140020582, + "learning_rate": 3.366299327940936e-07, + "logits/chosen": -1.254504680633545, + "logits/rejected": -1.188957929611206, + "logps/chosen": -473.3033142089844, + "logps/rejected": -726.52392578125, + "loss": 0.4147, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8812453746795654, + "rewards/margins": 2.339358329772949, + "rewards/rejected": -4.2206034660339355, + "step": 1870 + }, + { + "epoch": 0.4510556621880998, + "grad_norm": 13.22641361891775, + "learning_rate": 3.3466280914396117e-07, + "logits/chosen": -1.2382128238677979, + "logits/rejected": -1.2535500526428223, + "logps/chosen": -443.01007080078125, + "logps/rejected": -681.6619262695312, + "loss": 0.4319, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.062727212905884, + "rewards/margins": 2.200326919555664, + "rewards/rejected": -4.2630534172058105, + "step": 1880 + }, + { + "epoch": 0.4534548944337812, + "grad_norm": 13.21370692192793, + "learning_rate": 3.326897467341281e-07, + "logits/chosen": -1.234937310218811, + "logits/rejected": -1.3355581760406494, + "logps/chosen": -445.6819763183594, + "logps/rejected": -728.594970703125, + "loss": 0.4254, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3737926483154297, + "rewards/margins": 2.7364139556884766, + "rewards/rejected": -5.110206127166748, + "step": 1890 + }, + { + "epoch": 0.45585412667946257, + "grad_norm": 13.78564096806611, + "learning_rate": 3.3071088396708335e-07, + "logits/chosen": -1.2770905494689941, + "logits/rejected": -1.240928292274475, + "logps/chosen": -391.79736328125, + "logps/rejected": -723.361083984375, + "loss": 0.4619, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.881150484085083, + "rewards/margins": 3.0526633262634277, + "rewards/rejected": -4.93381404876709, + "step": 1900 + }, + { + "epoch": 0.45825335892514396, + "grad_norm": 13.853814670591044, + "learning_rate": 3.2872635965218824e-07, + "logits/chosen": -1.2004590034484863, + "logits/rejected": -1.2370150089263916, + "logps/chosen": -512.8619995117188, + "logps/rejected": -702.7752685546875, + "loss": 0.4878, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5109477043151855, + "rewards/margins": 1.8365228176116943, + "rewards/rejected": -4.347470283508301, + "step": 1910 + }, + { + "epoch": 0.46065259117082535, + "grad_norm": 10.692863270526209, + "learning_rate": 3.2673631299593905e-07, + "logits/chosen": -1.173227310180664, + "logits/rejected": -1.3409078121185303, + "logps/chosen": -474.7144470214844, + "logps/rejected": -680.1087036132812, + "loss": 0.4399, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.028782606124878, + "rewards/margins": 2.1353182792663574, + "rewards/rejected": -4.164100646972656, + "step": 1920 + }, + { + "epoch": 0.4630518234165067, + "grad_norm": 14.233116791502368, + "learning_rate": 3.247408835922024e-07, + "logits/chosen": -1.320565104484558, + "logits/rejected": -1.2968575954437256, + "logps/chosen": -573.8089599609375, + "logps/rejected": -786.044189453125, + "loss": 0.4368, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6720101833343506, + "rewards/margins": 2.062215566635132, + "rewards/rejected": -4.734226226806641, + "step": 1930 + }, + { + "epoch": 0.4654510556621881, + "grad_norm": 15.685451696438584, + "learning_rate": 3.2274021141242306e-07, + "logits/chosen": -1.255118489265442, + "logits/rejected": -1.320111870765686, + "logps/chosen": -484.41583251953125, + "logps/rejected": -714.25244140625, + "loss": 0.4476, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.30792236328125, + "rewards/margins": 2.1702935695648193, + "rewards/rejected": -4.47821569442749, + "step": 1940 + }, + { + "epoch": 0.4678502879078695, + "grad_norm": 16.009709587203158, + "learning_rate": 3.2073443679580613e-07, + "logits/chosen": -1.112594723701477, + "logits/rejected": -1.2054228782653809, + "logps/chosen": -423.2367248535156, + "logps/rejected": -543.6708984375, + "loss": 0.4427, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6221225261688232, + "rewards/margins": 1.2181508541107178, + "rewards/rejected": -2.840273380279541, + "step": 1950 + }, + { + "epoch": 0.47024952015355087, + "grad_norm": 11.690186448497471, + "learning_rate": 3.1872370043947194e-07, + "logits/chosen": -1.2797791957855225, + "logits/rejected": -1.3246345520019531, + "logps/chosen": -414.15234375, + "logps/rejected": -695.158203125, + "loss": 0.4033, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4832648038864136, + "rewards/margins": 2.9206230640411377, + "rewards/rejected": -4.40388822555542, + "step": 1960 + }, + { + "epoch": 0.47264875239923226, + "grad_norm": 20.104770187290267, + "learning_rate": 3.167081433885874e-07, + "logits/chosen": -1.030788779258728, + "logits/rejected": -1.1015684604644775, + "logps/chosen": -513.7273559570312, + "logps/rejected": -811.976318359375, + "loss": 0.3855, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.102285623550415, + "rewards/margins": 2.593528985977173, + "rewards/rejected": -4.695814609527588, + "step": 1970 + }, + { + "epoch": 0.4750479846449136, + "grad_norm": 22.35702689198335, + "learning_rate": 3.14687907026472e-07, + "logits/chosen": -1.1791191101074219, + "logits/rejected": -1.2959524393081665, + "logps/chosen": -441.61846923828125, + "logps/rejected": -704.8765869140625, + "loss": 0.409, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.2501838207244873, + "rewards/margins": 2.3681230545043945, + "rewards/rejected": -4.618307113647461, + "step": 1980 + }, + { + "epoch": 0.477447216890595, + "grad_norm": 16.66080453274856, + "learning_rate": 3.126631330646801e-07, + "logits/chosen": -1.2385426759719849, + "logits/rejected": -1.3393757343292236, + "logps/chosen": -572.3016357421875, + "logps/rejected": -789.0521850585938, + "loss": 0.4564, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.771270275115967, + "rewards/margins": 2.124401807785034, + "rewards/rejected": -4.895671844482422, + "step": 1990 + }, + { + "epoch": 0.4798464491362764, + "grad_norm": 13.605651174190195, + "learning_rate": 3.1063396353306097e-07, + "logits/chosen": -1.2395048141479492, + "logits/rejected": -1.3828445672988892, + "logps/chosen": -438.51312255859375, + "logps/rejected": -621.2369384765625, + "loss": 0.4486, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7979936599731445, + "rewards/margins": 2.1893625259399414, + "rewards/rejected": -3.987356185913086, + "step": 2000 + }, + { + "epoch": 0.4822456813819578, + "grad_norm": 13.924076058423626, + "learning_rate": 3.0860054076979535e-07, + "logits/chosen": -1.2428590059280396, + "logits/rejected": -1.236566185951233, + "logps/chosen": -467.281005859375, + "logps/rejected": -621.5033569335938, + "loss": 0.4229, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.916424036026001, + "rewards/margins": 1.8854621648788452, + "rewards/rejected": -3.8018863201141357, + "step": 2010 + }, + { + "epoch": 0.4846449136276392, + "grad_norm": 10.659549298550333, + "learning_rate": 3.065630074114115e-07, + "logits/chosen": -1.2107280492782593, + "logits/rejected": -1.2906858921051025, + "logps/chosen": -437.7901916503906, + "logps/rejected": -730.5772705078125, + "loss": 0.4458, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5543664693832397, + "rewards/margins": 3.3551418781280518, + "rewards/rejected": -4.90950870513916, + "step": 2020 + }, + { + "epoch": 0.4870441458733205, + "grad_norm": 11.89640175081092, + "learning_rate": 3.0452150638277947e-07, + "logits/chosen": -1.0864282846450806, + "logits/rejected": -1.0442813634872437, + "logps/chosen": -426.6461486816406, + "logps/rejected": -630.3248291015625, + "loss": 0.4587, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0740158557891846, + "rewards/margins": 1.9162708520889282, + "rewards/rejected": -3.9902865886688232, + "step": 2030 + }, + { + "epoch": 0.4894433781190019, + "grad_norm": 11.776367189768084, + "learning_rate": 3.024761808870856e-07, + "logits/chosen": -1.307191014289856, + "logits/rejected": -1.312293291091919, + "logps/chosen": -370.92791748046875, + "logps/rejected": -650.8178100585938, + "loss": 0.3818, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.276861310005188, + "rewards/margins": 2.857835531234741, + "rewards/rejected": -4.1346964836120605, + "step": 2040 + }, + { + "epoch": 0.4918426103646833, + "grad_norm": 25.099913631834486, + "learning_rate": 3.004271743957875e-07, + "logits/chosen": -1.1278274059295654, + "logits/rejected": -1.1356306076049805, + "logps/chosen": -495.9942321777344, + "logps/rejected": -648.2416381835938, + "loss": 0.4424, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.38657808303833, + "rewards/margins": 1.207850694656372, + "rewards/rejected": -3.594428539276123, + "step": 2050 + }, + { + "epoch": 0.4942418426103647, + "grad_norm": 12.705340632883466, + "learning_rate": 2.983746306385499e-07, + "logits/chosen": -1.3075058460235596, + "logits/rejected": -1.2933059930801392, + "logps/chosen": -434.21820068359375, + "logps/rejected": -625.4400024414062, + "loss": 0.4253, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8787765502929688, + "rewards/margins": 1.7934147119522095, + "rewards/rejected": -3.6721911430358887, + "step": 2060 + }, + { + "epoch": 0.4966410748560461, + "grad_norm": 13.755685476664564, + "learning_rate": 2.963186935931628e-07, + "logits/chosen": -1.215529203414917, + "logits/rejected": -1.2133440971374512, + "logps/chosen": -403.08465576171875, + "logps/rejected": -577.1477661132812, + "loss": 0.3959, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.499050259590149, + "rewards/margins": 1.7851158380508423, + "rewards/rejected": -3.284165859222412, + "step": 2070 + }, + { + "epoch": 0.4990403071017274, + "grad_norm": 13.292089170044676, + "learning_rate": 2.9425950747544176e-07, + "logits/chosen": -1.1674937009811401, + "logits/rejected": -1.3074105978012085, + "logps/chosen": -525.9224243164062, + "logps/rejected": -763.0833740234375, + "loss": 0.423, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.349907398223877, + "rewards/margins": 2.648500919342041, + "rewards/rejected": -4.998408317565918, + "step": 2080 + }, + { + "epoch": 0.5014395393474088, + "grad_norm": 18.455895560701023, + "learning_rate": 2.921972167291119e-07, + "logits/chosen": -1.1788341999053955, + "logits/rejected": -1.2539576292037964, + "logps/chosen": -472.1956481933594, + "logps/rejected": -670.5008544921875, + "loss": 0.4338, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9172461032867432, + "rewards/margins": 2.0367343425750732, + "rewards/rejected": -3.9539802074432373, + "step": 2090 + }, + { + "epoch": 0.5038387715930902, + "grad_norm": 13.471771310006998, + "learning_rate": 2.9013196601567567e-07, + "logits/chosen": -1.1498690843582153, + "logits/rejected": -1.1755589246749878, + "logps/chosen": -407.24163818359375, + "logps/rejected": -575.2977905273438, + "loss": 0.5056, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5899887084960938, + "rewards/margins": 1.5084218978881836, + "rewards/rejected": -3.0984106063842773, + "step": 2100 + }, + { + "epoch": 0.5062380038387716, + "grad_norm": 15.761320703617217, + "learning_rate": 2.8806390020426555e-07, + "logits/chosen": -1.139108419418335, + "logits/rejected": -1.1348917484283447, + "logps/chosen": -422.84893798828125, + "logps/rejected": -555.7877807617188, + "loss": 0.4274, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4848568439483643, + "rewards/margins": 1.3276598453521729, + "rewards/rejected": -2.812516689300537, + "step": 2110 + }, + { + "epoch": 0.508637236084453, + "grad_norm": 17.997023645014245, + "learning_rate": 2.8599316436148187e-07, + "logits/chosen": -1.2534973621368408, + "logits/rejected": -1.2950940132141113, + "logps/chosen": -413.71490478515625, + "logps/rejected": -538.5707397460938, + "loss": 0.446, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.67266047000885, + "rewards/margins": 1.2678474187850952, + "rewards/rejected": -2.9405078887939453, + "step": 2120 + }, + { + "epoch": 0.5110364683301344, + "grad_norm": 12.638370779776375, + "learning_rate": 2.8391990374121723e-07, + "logits/chosen": -1.288747787475586, + "logits/rejected": -1.3166449069976807, + "logps/chosen": -454.18359375, + "logps/rejected": -724.0892333984375, + "loss": 0.4079, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.0871493816375732, + "rewards/margins": 2.4171881675720215, + "rewards/rejected": -4.504337310791016, + "step": 2130 + }, + { + "epoch": 0.5134357005758158, + "grad_norm": 19.25266010844361, + "learning_rate": 2.818442637744669e-07, + "logits/chosen": -1.332960844039917, + "logits/rejected": -1.3756061792373657, + "logps/chosen": -447.2140197753906, + "logps/rejected": -657.7930297851562, + "loss": 0.4256, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.023087978363037, + "rewards/margins": 2.0285849571228027, + "rewards/rejected": -4.05167293548584, + "step": 2140 + }, + { + "epoch": 0.5158349328214972, + "grad_norm": 17.434973094318284, + "learning_rate": 2.797663900591284e-07, + "logits/chosen": -1.320966124534607, + "logits/rejected": -1.4106026887893677, + "logps/chosen": -476.49267578125, + "logps/rejected": -632.2120361328125, + "loss": 0.3841, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1607255935668945, + "rewards/margins": 1.7891597747802734, + "rewards/rejected": -3.949885129928589, + "step": 2150 + }, + { + "epoch": 0.5182341650671785, + "grad_norm": 17.474868196021465, + "learning_rate": 2.776864283497874e-07, + "logits/chosen": -1.2660505771636963, + "logits/rejected": -1.3750221729278564, + "logps/chosen": -442.905029296875, + "logps/rejected": -755.1177978515625, + "loss": 0.4058, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.1417970657348633, + "rewards/margins": 3.1892342567443848, + "rewards/rejected": -5.331031322479248, + "step": 2160 + }, + { + "epoch": 0.5206333973128598, + "grad_norm": 14.837534970558897, + "learning_rate": 2.756045245474943e-07, + "logits/chosen": -1.1648900508880615, + "logits/rejected": -1.1385093927383423, + "logps/chosen": -472.541015625, + "logps/rejected": -685.985107421875, + "loss": 0.4347, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.109835147857666, + "rewards/margins": 2.0320799350738525, + "rewards/rejected": -4.1419148445129395, + "step": 2170 + }, + { + "epoch": 0.5230326295585412, + "grad_norm": 12.732468692827648, + "learning_rate": 2.7352082468952977e-07, + "logits/chosen": -1.1894464492797852, + "logits/rejected": -1.2552679777145386, + "logps/chosen": -484.76409912109375, + "logps/rejected": -814.096435546875, + "loss": 0.4487, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.386613130569458, + "rewards/margins": 3.1459081172943115, + "rewards/rejected": -5.532520771026611, + "step": 2180 + }, + { + "epoch": 0.5254318618042226, + "grad_norm": 15.991158601320864, + "learning_rate": 2.7143547493916e-07, + "logits/chosen": -1.2773798704147339, + "logits/rejected": -1.2588646411895752, + "logps/chosen": -438.006591796875, + "logps/rejected": -779.8575439453125, + "loss": 0.4461, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8076789379119873, + "rewards/margins": 3.383274793624878, + "rewards/rejected": -5.190953254699707, + "step": 2190 + }, + { + "epoch": 0.527831094049904, + "grad_norm": 15.0413189840848, + "learning_rate": 2.693486215753853e-07, + "logits/chosen": -1.2713805437088013, + "logits/rejected": -1.2931923866271973, + "logps/chosen": -474.3173828125, + "logps/rejected": -771.8619384765625, + "loss": 0.4418, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2440552711486816, + "rewards/margins": 3.1775963306427, + "rewards/rejected": -5.421651840209961, + "step": 2200 + }, + { + "epoch": 0.5302303262955854, + "grad_norm": 12.016086648025393, + "learning_rate": 2.6726041098267805e-07, + "logits/chosen": -1.1932638883590698, + "logits/rejected": -1.2385377883911133, + "logps/chosen": -468.5335998535156, + "logps/rejected": -564.7030029296875, + "loss": 0.4914, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7891786098480225, + "rewards/margins": 1.3260656595230103, + "rewards/rejected": -3.1152443885803223, + "step": 2210 + }, + { + "epoch": 0.5326295585412668, + "grad_norm": 15.752188949724715, + "learning_rate": 2.6517098964071507e-07, + "logits/chosen": -1.2558810710906982, + "logits/rejected": -1.3106260299682617, + "logps/chosen": -388.71038818359375, + "logps/rejected": -497.370361328125, + "loss": 0.4766, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3813974857330322, + "rewards/margins": 0.8968814015388489, + "rewards/rejected": -2.2782788276672363, + "step": 2220 + }, + { + "epoch": 0.5350287907869482, + "grad_norm": 17.371358487239537, + "learning_rate": 2.630805041141023e-07, + "logits/chosen": -1.3759686946868896, + "logits/rejected": -1.3909227848052979, + "logps/chosen": -353.22833251953125, + "logps/rejected": -674.2232666015625, + "loss": 0.4387, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.275880217552185, + "rewards/margins": 2.9993698596954346, + "rewards/rejected": -4.275249481201172, + "step": 2230 + }, + { + "epoch": 0.5374280230326296, + "grad_norm": 15.68257354415993, + "learning_rate": 2.609891010420941e-07, + "logits/chosen": -1.3854516744613647, + "logits/rejected": -1.3968112468719482, + "logps/chosen": -449.6177673339844, + "logps/rejected": -670.1103515625, + "loss": 0.4195, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7667739391326904, + "rewards/margins": 2.2145369052886963, + "rewards/rejected": -3.981311082839966, + "step": 2240 + }, + { + "epoch": 0.539827255278311, + "grad_norm": 17.255086501798605, + "learning_rate": 2.5889692712830674e-07, + "logits/chosen": -1.211531400680542, + "logits/rejected": -1.283849835395813, + "logps/chosen": -389.5179138183594, + "logps/rejected": -602.1033325195312, + "loss": 0.3883, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6021864414215088, + "rewards/margins": 2.1839447021484375, + "rewards/rejected": -3.786130905151367, + "step": 2250 + }, + { + "epoch": 0.5422264875239923, + "grad_norm": 23.697806480913663, + "learning_rate": 2.5680412913042843e-07, + "logits/chosen": -1.4293580055236816, + "logits/rejected": -1.4167674779891968, + "logps/chosen": -482.10028076171875, + "logps/rejected": -786.3612060546875, + "loss": 0.4341, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2943575382232666, + "rewards/margins": 3.070589065551758, + "rewards/rejected": -5.3649468421936035, + "step": 2260 + }, + { + "epoch": 0.5446257197696737, + "grad_norm": 20.5376691734358, + "learning_rate": 2.5471085384992404e-07, + "logits/chosen": -1.3646225929260254, + "logits/rejected": -1.3295977115631104, + "logps/chosen": -490.397705078125, + "logps/rejected": -880.2039184570312, + "loss": 0.4117, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3713932037353516, + "rewards/margins": 3.905015468597412, + "rewards/rejected": -6.2764081954956055, + "step": 2270 + }, + { + "epoch": 0.5470249520153551, + "grad_norm": 33.70464819485353, + "learning_rate": 2.526172481217381e-07, + "logits/chosen": -1.345577597618103, + "logits/rejected": -1.318164587020874, + "logps/chosen": -443.73492431640625, + "logps/rejected": -621.4215698242188, + "loss": 0.4513, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3964602947235107, + "rewards/margins": 1.5979655981063843, + "rewards/rejected": -3.9944260120391846, + "step": 2280 + }, + { + "epoch": 0.5494241842610365, + "grad_norm": 14.933194013231358, + "learning_rate": 2.5052345880399456e-07, + "logits/chosen": -1.3429863452911377, + "logits/rejected": -1.419154405593872, + "logps/chosen": -432.6609802246094, + "logps/rejected": -600.9601440429688, + "loss": 0.423, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0265350341796875, + "rewards/margins": 1.6890513896942139, + "rewards/rejected": -3.7155869007110596, + "step": 2290 + }, + { + "epoch": 0.5518234165067178, + "grad_norm": 16.16343931231014, + "learning_rate": 2.4842963276769555e-07, + "logits/chosen": -1.3177053928375244, + "logits/rejected": -1.2920299768447876, + "logps/chosen": -412.7960510253906, + "logps/rejected": -659.1737060546875, + "loss": 0.4355, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.99508535861969, + "rewards/margins": 2.033306360244751, + "rewards/rejected": -4.0283918380737305, + "step": 2300 + }, + { + "epoch": 0.5542226487523992, + "grad_norm": 15.4620970327947, + "learning_rate": 2.463359168864189e-07, + "logits/chosen": -1.1979784965515137, + "logits/rejected": -1.3762614727020264, + "logps/chosen": -500.4812927246094, + "logps/rejected": -641.75537109375, + "loss": 0.4751, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1022884845733643, + "rewards/margins": 1.735335111618042, + "rewards/rejected": -3.8376235961914062, + "step": 2310 + }, + { + "epoch": 0.5566218809980806, + "grad_norm": 17.931731909002746, + "learning_rate": 2.4424245802601555e-07, + "logits/chosen": -1.2692848443984985, + "logits/rejected": -1.2660033702850342, + "logps/chosen": -370.6279296875, + "logps/rejected": -573.5665283203125, + "loss": 0.4162, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5542004108428955, + "rewards/margins": 1.4760851860046387, + "rewards/rejected": -3.0302860736846924, + "step": 2320 + }, + { + "epoch": 0.559021113243762, + "grad_norm": 17.293463950337944, + "learning_rate": 2.421494030343072e-07, + "logits/chosen": -1.2170095443725586, + "logits/rejected": -1.3737263679504395, + "logps/chosen": -454.30224609375, + "logps/rejected": -541.3650512695312, + "loss": 0.5052, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.897562026977539, + "rewards/margins": 1.3505643606185913, + "rewards/rejected": -3.248126268386841, + "step": 2330 + }, + { + "epoch": 0.5614203454894434, + "grad_norm": 14.543185750157452, + "learning_rate": 2.400568987307861e-07, + "logits/chosen": -1.2438604831695557, + "logits/rejected": -1.3291311264038086, + "logps/chosen": -413.92144775390625, + "logps/rejected": -483.96240234375, + "loss": 0.3981, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.790085792541504, + "rewards/margins": 0.866573691368103, + "rewards/rejected": -2.6566593647003174, + "step": 2340 + }, + { + "epoch": 0.5638195777351248, + "grad_norm": 12.815153330284657, + "learning_rate": 2.379650918963156e-07, + "logits/chosen": -1.3083336353302002, + "logits/rejected": -1.3296959400177002, + "logps/chosen": -408.14068603515625, + "logps/rejected": -637.8969116210938, + "loss": 0.4107, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.137500524520874, + "rewards/margins": 2.198113203048706, + "rewards/rejected": -4.33561372756958, + "step": 2350 + }, + { + "epoch": 0.5662188099808061, + "grad_norm": 19.62558947066615, + "learning_rate": 2.3587412926283438e-07, + "logits/chosen": -1.3259559869766235, + "logits/rejected": -1.3641198873519897, + "logps/chosen": -511.7887268066406, + "logps/rejected": -747.3419189453125, + "loss": 0.4316, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.02677845954895, + "rewards/margins": 2.806626796722412, + "rewards/rejected": -4.833405017852783, + "step": 2360 + }, + { + "epoch": 0.5686180422264875, + "grad_norm": 21.00689689996569, + "learning_rate": 2.337841575030642e-07, + "logits/chosen": -1.1748692989349365, + "logits/rejected": -1.220413327217102, + "logps/chosen": -463.1639099121094, + "logps/rejected": -683.6637573242188, + "loss": 0.3981, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8594157695770264, + "rewards/margins": 2.1025776863098145, + "rewards/rejected": -3.96199369430542, + "step": 2370 + }, + { + "epoch": 0.5710172744721689, + "grad_norm": 16.379604246595516, + "learning_rate": 2.316953232202206e-07, + "logits/chosen": -1.2951033115386963, + "logits/rejected": -1.4975831508636475, + "logps/chosen": -450.65155029296875, + "logps/rejected": -539.8964233398438, + "loss": 0.4165, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0984275341033936, + "rewards/margins": 1.536707878112793, + "rewards/rejected": -3.6351349353790283, + "step": 2380 + }, + { + "epoch": 0.5734165067178503, + "grad_norm": 13.015590564989903, + "learning_rate": 2.2960777293772958e-07, + "logits/chosen": -1.2712581157684326, + "logits/rejected": -1.3997230529785156, + "logps/chosen": -406.917236328125, + "logps/rejected": -671.5166625976562, + "loss": 0.4366, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.939073920249939, + "rewards/margins": 2.8481147289276123, + "rewards/rejected": -4.787188529968262, + "step": 2390 + }, + { + "epoch": 0.5758157389635317, + "grad_norm": 13.201413526903568, + "learning_rate": 2.2752165308894974e-07, + "logits/chosen": -1.2536894083023071, + "logits/rejected": -1.2781312465667725, + "logps/chosen": -368.1941833496094, + "logps/rejected": -593.6776123046875, + "loss": 0.4295, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6818746328353882, + "rewards/margins": 2.3582541942596436, + "rewards/rejected": -4.040129661560059, + "step": 2400 + }, + { + "epoch": 0.5782149712092131, + "grad_norm": 12.990669417950757, + "learning_rate": 2.254371100069005e-07, + "logits/chosen": -1.1515527963638306, + "logits/rejected": -1.1078431606292725, + "logps/chosen": -375.81732177734375, + "logps/rejected": -583.7494506835938, + "loss": 0.3986, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.465872049331665, + "rewards/margins": 1.888287901878357, + "rewards/rejected": -3.3541598320007324, + "step": 2410 + }, + { + "epoch": 0.5806142034548945, + "grad_norm": 17.818623986164003, + "learning_rate": 2.2335428991399725e-07, + "logits/chosen": -1.242234468460083, + "logits/rejected": -1.2769407033920288, + "logps/chosen": -519.7703857421875, + "logps/rejected": -948.0362548828125, + "loss": 0.403, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.072312116622925, + "rewards/margins": 4.26052188873291, + "rewards/rejected": -7.332834720611572, + "step": 2420 + }, + { + "epoch": 0.5830134357005758, + "grad_norm": 14.48204764987916, + "learning_rate": 2.2127333891179458e-07, + "logits/chosen": -1.3385140895843506, + "logits/rejected": -1.3878307342529297, + "logps/chosen": -430.9832458496094, + "logps/rejected": -745.4675903320312, + "loss": 0.4398, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.236307144165039, + "rewards/margins": 2.929969310760498, + "rewards/rejected": -5.166275978088379, + "step": 2430 + }, + { + "epoch": 0.5854126679462572, + "grad_norm": 27.82308149849405, + "learning_rate": 2.1919440297073782e-07, + "logits/chosen": -1.271857500076294, + "logits/rejected": -1.3383657932281494, + "logps/chosen": -462.377685546875, + "logps/rejected": -775.3133544921875, + "loss": 0.4672, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.4519903659820557, + "rewards/margins": 3.0875110626220703, + "rewards/rejected": -5.539501190185547, + "step": 2440 + }, + { + "epoch": 0.5878119001919386, + "grad_norm": 13.320769149199382, + "learning_rate": 2.1711762791992368e-07, + "logits/chosen": -1.2566578388214111, + "logits/rejected": -1.29204261302948, + "logps/chosen": -504.62371826171875, + "logps/rejected": -647.8597412109375, + "loss": 0.4486, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.186826705932617, + "rewards/margins": 1.7466872930526733, + "rewards/rejected": -3.9335131645202637, + "step": 2450 + }, + { + "epoch": 0.5902111324376199, + "grad_norm": 15.89239004487952, + "learning_rate": 2.1504315943687114e-07, + "logits/chosen": -1.135602355003357, + "logits/rejected": -1.110710859298706, + "logps/chosen": -410.08441162109375, + "logps/rejected": -693.5352783203125, + "loss": 0.4104, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8059812784194946, + "rewards/margins": 2.415012836456299, + "rewards/rejected": -4.220993995666504, + "step": 2460 + }, + { + "epoch": 0.5926103646833013, + "grad_norm": 19.370614206098328, + "learning_rate": 2.1297114303730248e-07, + "logits/chosen": -1.1299896240234375, + "logits/rejected": -1.0449109077453613, + "logps/chosen": -411.85040283203125, + "logps/rejected": -708.7200927734375, + "loss": 0.4912, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.996572494506836, + "rewards/margins": 2.44885516166687, + "rewards/rejected": -4.445427894592285, + "step": 2470 + }, + { + "epoch": 0.5950095969289827, + "grad_norm": 16.656657262376168, + "learning_rate": 2.1090172406493616e-07, + "logits/chosen": -1.0786526203155518, + "logits/rejected": -1.0406323671340942, + "logps/chosen": -351.46417236328125, + "logps/rejected": -575.1397705078125, + "loss": 0.3825, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2345200777053833, + "rewards/margins": 2.170752763748169, + "rewards/rejected": -3.4052727222442627, + "step": 2480 + }, + { + "epoch": 0.5974088291746641, + "grad_norm": 19.934123908947278, + "learning_rate": 2.0883504768129146e-07, + "logits/chosen": -1.242959976196289, + "logits/rejected": -1.2493782043457031, + "logps/chosen": -472.5721130371094, + "logps/rejected": -716.5457763671875, + "loss": 0.431, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.007760763168335, + "rewards/margins": 2.496676206588745, + "rewards/rejected": -4.504437446594238, + "step": 2490 + }, + { + "epoch": 0.5998080614203455, + "grad_norm": 15.746281719328787, + "learning_rate": 2.0677125885550571e-07, + "logits/chosen": -1.1213797330856323, + "logits/rejected": -1.3027660846710205, + "logps/chosen": -431.63641357421875, + "logps/rejected": -593.3057861328125, + "loss": 0.4323, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9971039295196533, + "rewards/margins": 1.9194910526275635, + "rewards/rejected": -3.9165947437286377, + "step": 2500 + }, + { + "epoch": 0.6022072936660269, + "grad_norm": 23.759422926525374, + "learning_rate": 2.0471050235416587e-07, + "logits/chosen": -1.0919904708862305, + "logits/rejected": -1.2857704162597656, + "logps/chosen": -511.79132080078125, + "logps/rejected": -708.4951171875, + "loss": 0.3853, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.4641222953796387, + "rewards/margins": 2.413705348968506, + "rewards/rejected": -4.8778276443481445, + "step": 2510 + }, + { + "epoch": 0.6046065259117083, + "grad_norm": 25.281497673378116, + "learning_rate": 2.026529227311532e-07, + "logits/chosen": -1.229898452758789, + "logits/rejected": -1.2374264001846313, + "logps/chosen": -431.0499572753906, + "logps/rejected": -689.706298828125, + "loss": 0.468, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1411490440368652, + "rewards/margins": 2.5051817893981934, + "rewards/rejected": -4.6463303565979, + "step": 2520 + }, + { + "epoch": 0.6070057581573897, + "grad_norm": 14.693269652927464, + "learning_rate": 2.005986643175036e-07, + "logits/chosen": -1.187809944152832, + "logits/rejected": -1.1687113046646118, + "logps/chosen": -414.11407470703125, + "logps/rejected": -739.9503173828125, + "loss": 0.3685, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.609575867652893, + "rewards/margins": 3.377927780151367, + "rewards/rejected": -4.9875030517578125, + "step": 2530 + }, + { + "epoch": 0.6094049904030711, + "grad_norm": 18.59133617851427, + "learning_rate": 1.9854787121128328e-07, + "logits/chosen": -1.203471302986145, + "logits/rejected": -1.3611973524093628, + "logps/chosen": -412.1192321777344, + "logps/rejected": -546.4465942382812, + "loss": 0.4775, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.861853837966919, + "rewards/margins": 1.7769527435302734, + "rewards/rejected": -3.6388065814971924, + "step": 2540 + }, + { + "epoch": 0.6118042226487524, + "grad_norm": 13.920438386133542, + "learning_rate": 1.9650068726748106e-07, + "logits/chosen": -1.1817299127578735, + "logits/rejected": -1.3077366352081299, + "logps/chosen": -465.7212829589844, + "logps/rejected": -663.9733276367188, + "loss": 0.4559, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.955004334449768, + "rewards/margins": 2.0206496715545654, + "rewards/rejected": -3.975654125213623, + "step": 2550 + }, + { + "epoch": 0.6142034548944337, + "grad_norm": 13.161703470683147, + "learning_rate": 1.9445725608791718e-07, + "logits/chosen": -1.1682353019714355, + "logits/rejected": -1.2265560626983643, + "logps/chosen": -466.87713623046875, + "logps/rejected": -877.6561279296875, + "loss": 0.4242, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.123385190963745, + "rewards/margins": 4.002751350402832, + "rewards/rejected": -6.126136779785156, + "step": 2560 + }, + { + "epoch": 0.6166026871401151, + "grad_norm": 15.36104260148359, + "learning_rate": 1.924177210111705e-07, + "logits/chosen": -1.2761640548706055, + "logits/rejected": -1.3670276403427124, + "logps/chosen": -424.89581298828125, + "logps/rejected": -750.1470947265625, + "loss": 0.4409, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8831450939178467, + "rewards/margins": 3.1273887157440186, + "rewards/rejected": -5.010534286499023, + "step": 2570 + }, + { + "epoch": 0.6190019193857965, + "grad_norm": 10.821617135187617, + "learning_rate": 1.9038222510252364e-07, + "logits/chosen": -1.2255038022994995, + "logits/rejected": -1.249638319015503, + "logps/chosen": -406.5605773925781, + "logps/rejected": -569.8077392578125, + "loss": 0.4144, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5970432758331299, + "rewards/margins": 1.7124197483062744, + "rewards/rejected": -3.3094630241394043, + "step": 2580 + }, + { + "epoch": 0.6214011516314779, + "grad_norm": 20.79834881298733, + "learning_rate": 1.883509111439277e-07, + "logits/chosen": -1.2082470655441284, + "logits/rejected": -1.2324997186660767, + "logps/chosen": -405.4540710449219, + "logps/rejected": -772.509765625, + "loss": 0.4118, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.863959550857544, + "rewards/margins": 3.0209383964538574, + "rewards/rejected": -4.8848981857299805, + "step": 2590 + }, + { + "epoch": 0.6238003838771593, + "grad_norm": 14.594348842540208, + "learning_rate": 1.8632392162398665e-07, + "logits/chosen": -1.1460466384887695, + "logits/rejected": -1.148008108139038, + "logps/chosen": -481.5874938964844, + "logps/rejected": -762.729736328125, + "loss": 0.3784, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.904550313949585, + "rewards/margins": 2.8737847805023193, + "rewards/rejected": -4.778334617614746, + "step": 2600 + }, + { + "epoch": 0.6261996161228407, + "grad_norm": 18.405220413400123, + "learning_rate": 1.84301398727962e-07, + "logits/chosen": -1.2919515371322632, + "logits/rejected": -1.226064682006836, + "logps/chosen": -384.24078369140625, + "logps/rejected": -750.8485107421875, + "loss": 0.4252, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.0034337043762207, + "rewards/margins": 3.35615611076355, + "rewards/rejected": -5.359589576721191, + "step": 2610 + }, + { + "epoch": 0.6285988483685221, + "grad_norm": 20.54137342064033, + "learning_rate": 1.8228348432779966e-07, + "logits/chosen": -1.2917633056640625, + "logits/rejected": -1.313946008682251, + "logps/chosen": -463.4065856933594, + "logps/rejected": -708.0303955078125, + "loss": 0.4268, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.310145854949951, + "rewards/margins": 2.4632420539855957, + "rewards/rejected": -4.773387432098389, + "step": 2620 + }, + { + "epoch": 0.6309980806142035, + "grad_norm": 12.212262130926057, + "learning_rate": 1.8027031997217773e-07, + "logits/chosen": -1.3770760297775269, + "logits/rejected": -1.3869761228561401, + "logps/chosen": -496.34136962890625, + "logps/rejected": -1004.2086791992188, + "loss": 0.3796, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.659231662750244, + "rewards/margins": 4.979175090789795, + "rewards/rejected": -7.638407230377197, + "step": 2630 + }, + { + "epoch": 0.6333973128598849, + "grad_norm": 16.20411120653905, + "learning_rate": 1.7826204687657758e-07, + "logits/chosen": -1.1182453632354736, + "logits/rejected": -1.1282155513763428, + "logps/chosen": -475.62335205078125, + "logps/rejected": -599.8681030273438, + "loss": 0.4083, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9853700399398804, + "rewards/margins": 1.6344798803329468, + "rewards/rejected": -3.619849681854248, + "step": 2640 + }, + { + "epoch": 0.6357965451055663, + "grad_norm": 22.936528003077683, + "learning_rate": 1.762588059133781e-07, + "logits/chosen": -1.1725223064422607, + "logits/rejected": -1.3193824291229248, + "logps/chosen": -507.3565368652344, + "logps/rejected": -717.995849609375, + "loss": 0.4443, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1068882942199707, + "rewards/margins": 2.464982509613037, + "rewards/rejected": -4.57187032699585, + "step": 2650 + }, + { + "epoch": 0.6381957773512476, + "grad_norm": 18.6077043898828, + "learning_rate": 1.7426073760197406e-07, + "logits/chosen": -1.1054435968399048, + "logits/rejected": -1.0801939964294434, + "logps/chosen": -478.20074462890625, + "logps/rejected": -859.6195068359375, + "loss": 0.4259, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2413601875305176, + "rewards/margins": 3.528954029083252, + "rewards/rejected": -5.7703142166137695, + "step": 2660 + }, + { + "epoch": 0.6405950095969289, + "grad_norm": 14.701850577255247, + "learning_rate": 1.7226798209891935e-07, + "logits/chosen": -1.1424671411514282, + "logits/rejected": -1.3594285249710083, + "logps/chosen": -486.80755615234375, + "logps/rejected": -663.4140625, + "loss": 0.3759, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.2926859855651855, + "rewards/margins": 2.290337085723877, + "rewards/rejected": -4.583022594451904, + "step": 2670 + }, + { + "epoch": 0.6429942418426103, + "grad_norm": 23.82001052972649, + "learning_rate": 1.7028067918809535e-07, + "logits/chosen": -1.220568299293518, + "logits/rejected": -1.2531940937042236, + "logps/chosen": -426.9730529785156, + "logps/rejected": -829.8860473632812, + "loss": 0.4311, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0204787254333496, + "rewards/margins": 3.661693572998047, + "rewards/rejected": -5.6821722984313965, + "step": 2680 + }, + { + "epoch": 0.6453934740882917, + "grad_norm": 20.92197517678509, + "learning_rate": 1.6829896827090584e-07, + "logits/chosen": -1.3699915409088135, + "logits/rejected": -1.415359377861023, + "logps/chosen": -504.123291015625, + "logps/rejected": -585.438232421875, + "loss": 0.4529, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.390109062194824, + "rewards/margins": 1.1103547811508179, + "rewards/rejected": -3.5004639625549316, + "step": 2690 + }, + { + "epoch": 0.6477927063339731, + "grad_norm": 11.90165561763133, + "learning_rate": 1.6632298835649844e-07, + "logits/chosen": -1.2418677806854248, + "logits/rejected": -1.2179887294769287, + "logps/chosen": -498.97607421875, + "logps/rejected": -764.77587890625, + "loss": 0.3979, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3297247886657715, + "rewards/margins": 2.4172799587249756, + "rewards/rejected": -4.747004508972168, + "step": 2700 + }, + { + "epoch": 0.6501919385796545, + "grad_norm": 13.166436756002133, + "learning_rate": 1.6435287805201364e-07, + "logits/chosen": -1.3559385538101196, + "logits/rejected": -1.3355977535247803, + "logps/chosen": -489.180419921875, + "logps/rejected": -652.1635131835938, + "loss": 0.4155, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2472357749938965, + "rewards/margins": 1.647270917892456, + "rewards/rejected": -3.8945069313049316, + "step": 2710 + }, + { + "epoch": 0.6525911708253359, + "grad_norm": 19.18769450791739, + "learning_rate": 1.6238877555286207e-07, + "logits/chosen": -1.3151136636734009, + "logits/rejected": -1.3525760173797607, + "logps/chosen": -442.0126953125, + "logps/rejected": -712.2786865234375, + "loss": 0.3659, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7712196111679077, + "rewards/margins": 2.6433427333831787, + "rewards/rejected": -4.414562702178955, + "step": 2720 + }, + { + "epoch": 0.6549904030710173, + "grad_norm": 16.006816349741847, + "learning_rate": 1.60430818633031e-07, + "logits/chosen": -1.1541482210159302, + "logits/rejected": -1.1810917854309082, + "logps/chosen": -428.27117919921875, + "logps/rejected": -661.9808959960938, + "loss": 0.3728, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8128198385238647, + "rewards/margins": 2.3960225582122803, + "rewards/rejected": -4.2088422775268555, + "step": 2730 + }, + { + "epoch": 0.6573896353166987, + "grad_norm": 15.843743360837514, + "learning_rate": 1.5847914463541939e-07, + "logits/chosen": -1.2659448385238647, + "logits/rejected": -1.3333221673965454, + "logps/chosen": -380.4425354003906, + "logps/rejected": -661.7493896484375, + "loss": 0.3829, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8144347667694092, + "rewards/margins": 2.581631898880005, + "rewards/rejected": -4.396066665649414, + "step": 2740 + }, + { + "epoch": 0.6597888675623801, + "grad_norm": 12.902781532875105, + "learning_rate": 1.5653389046220427e-07, + "logits/chosen": -1.202580451965332, + "logits/rejected": -1.2428548336029053, + "logps/chosen": -393.21392822265625, + "logps/rejected": -585.6798095703125, + "loss": 0.4472, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5873596668243408, + "rewards/margins": 1.7750890254974365, + "rewards/rejected": -3.3624484539031982, + "step": 2750 + }, + { + "epoch": 0.6621880998080614, + "grad_norm": 15.497438468614629, + "learning_rate": 1.545951925652375e-07, + "logits/chosen": -1.1852418184280396, + "logits/rejected": -1.3214651346206665, + "logps/chosen": -513.821044921875, + "logps/rejected": -740.9768676757812, + "loss": 0.4159, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.0691330432891846, + "rewards/margins": 2.799996852874756, + "rewards/rejected": -4.8691301345825195, + "step": 2760 + }, + { + "epoch": 0.6645873320537428, + "grad_norm": 25.023616546810636, + "learning_rate": 1.5266318693647423e-07, + "logits/chosen": -1.2369322776794434, + "logits/rejected": -1.2678784132003784, + "logps/chosen": -495.04425048828125, + "logps/rejected": -637.2513427734375, + "loss": 0.4159, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1835405826568604, + "rewards/margins": 1.4997859001159668, + "rewards/rejected": -3.6833267211914062, + "step": 2770 + }, + { + "epoch": 0.6669865642994242, + "grad_norm": 13.945033201288798, + "learning_rate": 1.5073800909843353e-07, + "logits/chosen": -1.1865065097808838, + "logits/rejected": -1.3370563983917236, + "logps/chosen": -465.9248046875, + "logps/rejected": -663.2384033203125, + "loss": 0.4064, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0080044269561768, + "rewards/margins": 2.541308879852295, + "rewards/rejected": -4.549313545227051, + "step": 2780 + }, + { + "epoch": 0.6693857965451055, + "grad_norm": 16.335549570588938, + "learning_rate": 1.488197940946922e-07, + "logits/chosen": -1.090932846069336, + "logits/rejected": -1.1426491737365723, + "logps/chosen": -454.6915588378906, + "logps/rejected": -645.7820434570312, + "loss": 0.3909, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7976535558700562, + "rewards/margins": 2.447206735610962, + "rewards/rejected": -4.244860649108887, + "step": 2790 + }, + { + "epoch": 0.6717850287907869, + "grad_norm": 19.689368835083336, + "learning_rate": 1.4690867648041167e-07, + "logits/chosen": -1.0205479860305786, + "logits/rejected": -1.2015823125839233, + "logps/chosen": -444.68115234375, + "logps/rejected": -664.7821044921875, + "loss": 0.4133, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8803367614746094, + "rewards/margins": 2.4233222007751465, + "rewards/rejected": -4.303658485412598, + "step": 2800 + }, + { + "epoch": 0.6741842610364683, + "grad_norm": 15.922653221989023, + "learning_rate": 1.4500479031289987e-07, + "logits/chosen": -1.121628999710083, + "logits/rejected": -1.2732969522476196, + "logps/chosen": -426.40631103515625, + "logps/rejected": -628.9437255859375, + "loss": 0.4774, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6420685052871704, + "rewards/margins": 2.0467495918273926, + "rewards/rejected": -3.6888179779052734, + "step": 2810 + }, + { + "epoch": 0.6765834932821497, + "grad_norm": 10.688957500109868, + "learning_rate": 1.4310826914220747e-07, + "logits/chosen": -1.1144278049468994, + "logits/rejected": -1.2004985809326172, + "logps/chosen": -465.1630859375, + "logps/rejected": -635.3046264648438, + "loss": 0.4528, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7992807626724243, + "rewards/margins": 1.7720779180526733, + "rewards/rejected": -3.5713589191436768, + "step": 2820 + }, + { + "epoch": 0.6789827255278311, + "grad_norm": 11.408686793770782, + "learning_rate": 1.412192460017597e-07, + "logits/chosen": -1.2038311958312988, + "logits/rejected": -1.162626028060913, + "logps/chosen": -476.5159606933594, + "logps/rejected": -705.89990234375, + "loss": 0.4191, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.387838840484619, + "rewards/margins": 2.2411282062530518, + "rewards/rejected": -4.628966331481934, + "step": 2830 + }, + { + "epoch": 0.6813819577735125, + "grad_norm": 12.118691511541517, + "learning_rate": 1.3933785339902504e-07, + "logits/chosen": -1.2565038204193115, + "logits/rejected": -1.167474389076233, + "logps/chosen": -400.8456115722656, + "logps/rejected": -641.7380981445312, + "loss": 0.4456, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9326709508895874, + "rewards/margins": 1.984291672706604, + "rewards/rejected": -3.9169623851776123, + "step": 2840 + }, + { + "epoch": 0.6837811900191939, + "grad_norm": 13.932235400427288, + "learning_rate": 1.374642233062197e-07, + "logits/chosen": -1.1590187549591064, + "logits/rejected": -1.3027136325836182, + "logps/chosen": -474.519287109375, + "logps/rejected": -681.8683471679688, + "loss": 0.4318, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.970078468322754, + "rewards/margins": 2.387824535369873, + "rewards/rejected": -4.357902526855469, + "step": 2850 + }, + { + "epoch": 0.6861804222648752, + "grad_norm": 16.723737190217008, + "learning_rate": 1.355984871510511e-07, + "logits/chosen": -1.1410120725631714, + "logits/rejected": -1.1267549991607666, + "logps/chosen": -505.823974609375, + "logps/rejected": -715.5396118164062, + "loss": 0.3923, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1921565532684326, + "rewards/margins": 2.031604290008545, + "rewards/rejected": -4.223761081695557, + "step": 2860 + }, + { + "epoch": 0.6885796545105566, + "grad_norm": 21.50806215898013, + "learning_rate": 1.3374077580749783e-07, + "logits/chosen": -1.3120447397232056, + "logits/rejected": -1.281110405921936, + "logps/chosen": -368.84185791015625, + "logps/rejected": -601.4373168945312, + "loss": 0.4163, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7995599508285522, + "rewards/margins": 2.158607006072998, + "rewards/rejected": -3.958167314529419, + "step": 2870 + }, + { + "epoch": 0.690978886756238, + "grad_norm": 27.638375107613143, + "learning_rate": 1.3189121958663024e-07, + "logits/chosen": -1.1347416639328003, + "logits/rejected": -1.3342390060424805, + "logps/chosen": -551.9705810546875, + "logps/rejected": -637.0390625, + "loss": 0.4529, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7998149394989014, + "rewards/margins": 1.0880420207977295, + "rewards/rejected": -3.8878567218780518, + "step": 2880 + }, + { + "epoch": 0.6933781190019194, + "grad_norm": 14.692498350185678, + "learning_rate": 1.3004994822746895e-07, + "logits/chosen": -1.2893893718719482, + "logits/rejected": -1.3418468236923218, + "logps/chosen": -442.1363220214844, + "logps/rejected": -626.1585083007812, + "loss": 0.4302, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9635156393051147, + "rewards/margins": 1.7742655277252197, + "rewards/rejected": -3.737781047821045, + "step": 2890 + }, + { + "epoch": 0.6957773512476008, + "grad_norm": 13.677081786503933, + "learning_rate": 1.2821709088788434e-07, + "logits/chosen": -1.0876823663711548, + "logits/rejected": -1.1584521532058716, + "logps/chosen": -408.7326965332031, + "logps/rejected": -630.5333862304688, + "loss": 0.4069, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.062255620956421, + "rewards/margins": 2.1946072578430176, + "rewards/rejected": -4.256862163543701, + "step": 2900 + }, + { + "epoch": 0.6981765834932822, + "grad_norm": 15.872787924761283, + "learning_rate": 1.2639277613553736e-07, + "logits/chosen": -1.3518760204315186, + "logits/rejected": -1.3280441761016846, + "logps/chosen": -398.5044860839844, + "logps/rejected": -600.9732666015625, + "loss": 0.4195, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9100825786590576, + "rewards/margins": 2.0270023345947266, + "rewards/rejected": -3.9370853900909424, + "step": 2910 + }, + { + "epoch": 0.7005758157389635, + "grad_norm": 13.346767559623201, + "learning_rate": 1.2457713193885975e-07, + "logits/chosen": -1.1873770952224731, + "logits/rejected": -1.1855942010879517, + "logps/chosen": -413.6412048339844, + "logps/rejected": -724.1486206054688, + "loss": 0.3688, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3553316593170166, + "rewards/margins": 2.758777379989624, + "rewards/rejected": -5.114109516143799, + "step": 2920 + }, + { + "epoch": 0.7029750479846449, + "grad_norm": 22.448762027485316, + "learning_rate": 1.2277028565807838e-07, + "logits/chosen": -1.2834995985031128, + "logits/rejected": -1.3559261560440063, + "logps/chosen": -453.78985595703125, + "logps/rejected": -671.7575073242188, + "loss": 0.4267, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.049879550933838, + "rewards/margins": 2.2892394065856934, + "rewards/rejected": -4.339118957519531, + "step": 2930 + }, + { + "epoch": 0.7053742802303263, + "grad_norm": 16.129648043941792, + "learning_rate": 1.209723640362815e-07, + "logits/chosen": -1.1959068775177002, + "logits/rejected": -1.2405879497528076, + "logps/chosen": -505.00726318359375, + "logps/rejected": -860.0382690429688, + "loss": 0.4607, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3982629776000977, + "rewards/margins": 3.6283717155456543, + "rewards/rejected": -6.026634693145752, + "step": 2940 + }, + { + "epoch": 0.7077735124760077, + "grad_norm": 14.433857430526624, + "learning_rate": 1.191834931905277e-07, + "logits/chosen": -1.1443761587142944, + "logits/rejected": -1.1771003007888794, + "logps/chosen": -516.5400390625, + "logps/rejected": -709.7727661132812, + "loss": 0.4379, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.209808826446533, + "rewards/margins": 1.9394118785858154, + "rewards/rejected": -4.1492204666137695, + "step": 2950 + }, + { + "epoch": 0.710172744721689, + "grad_norm": 14.063698311506704, + "learning_rate": 1.1740379860299988e-07, + "logits/chosen": -1.1812469959259033, + "logits/rejected": -1.1932779550552368, + "logps/chosen": -486.75848388671875, + "logps/rejected": -688.697509765625, + "loss": 0.4358, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0185728073120117, + "rewards/margins": 1.8188155889511108, + "rewards/rejected": -3.837387800216675, + "step": 2960 + }, + { + "epoch": 0.7125719769673704, + "grad_norm": 12.25394441206106, + "learning_rate": 1.1563340511220254e-07, + "logits/chosen": -1.1238670349121094, + "logits/rejected": -1.2288951873779297, + "logps/chosen": -479.03082275390625, + "logps/rejected": -721.0484008789062, + "loss": 0.4231, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9672508239746094, + "rewards/margins": 2.5986568927764893, + "rewards/rejected": -4.5659074783325195, + "step": 2970 + }, + { + "epoch": 0.7149712092130518, + "grad_norm": 14.233686065176078, + "learning_rate": 1.1387243690420556e-07, + "logits/chosen": -1.1306841373443604, + "logits/rejected": -1.2058961391448975, + "logps/chosen": -530.5966796875, + "logps/rejected": -791.8387451171875, + "loss": 0.4661, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.097524881362915, + "rewards/margins": 2.7319130897521973, + "rewards/rejected": -4.829438209533691, + "step": 2980 + }, + { + "epoch": 0.7173704414587332, + "grad_norm": 19.693131800311516, + "learning_rate": 1.1212101750393235e-07, + "logits/chosen": -1.255438208580017, + "logits/rejected": -1.3434498310089111, + "logps/chosen": -474.38916015625, + "logps/rejected": -754.1934814453125, + "loss": 0.4109, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2069497108459473, + "rewards/margins": 2.954502582550049, + "rewards/rejected": -5.161452293395996, + "step": 2990 + }, + { + "epoch": 0.7197696737044146, + "grad_norm": 16.97782338199475, + "learning_rate": 1.1037926976649562e-07, + "logits/chosen": -1.1937129497528076, + "logits/rejected": -1.2495759725570679, + "logps/chosen": -488.2049865722656, + "logps/rejected": -799.8907470703125, + "loss": 0.4527, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2300727367401123, + "rewards/margins": 2.9182958602905273, + "rewards/rejected": -5.148368835449219, + "step": 3000 + }, + { + "epoch": 0.722168905950096, + "grad_norm": 18.393491636386905, + "learning_rate": 1.0864731586857936e-07, + "logits/chosen": -1.1326544284820557, + "logits/rejected": -1.2836530208587646, + "logps/chosen": -472.63739013671875, + "logps/rejected": -724.0669555664062, + "loss": 0.3849, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8869655132293701, + "rewards/margins": 2.8039302825927734, + "rewards/rejected": -4.6908955574035645, + "step": 3010 + }, + { + "epoch": 0.7245681381957774, + "grad_norm": 20.305303573951132, + "learning_rate": 1.0692527729986839e-07, + "logits/chosen": -1.117078423500061, + "logits/rejected": -1.2159783840179443, + "logps/chosen": -495.0431213378906, + "logps/rejected": -729.7164306640625, + "loss": 0.3795, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3929858207702637, + "rewards/margins": 2.559861660003662, + "rewards/rejected": -4.952847003936768, + "step": 3020 + }, + { + "epoch": 0.7269673704414588, + "grad_norm": 18.149863459319363, + "learning_rate": 1.0521327485452692e-07, + "logits/chosen": -1.2258936166763306, + "logits/rejected": -1.2951385974884033, + "logps/chosen": -507.068359375, + "logps/rejected": -806.07421875, + "loss": 0.4145, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.737776279449463, + "rewards/margins": 3.1096084117889404, + "rewards/rejected": -5.847384452819824, + "step": 3030 + }, + { + "epoch": 0.7293666026871402, + "grad_norm": 22.723618390263812, + "learning_rate": 1.0351142862272468e-07, + "logits/chosen": -1.1216206550598145, + "logits/rejected": -1.2867326736450195, + "logps/chosen": -473.12408447265625, + "logps/rejected": -896.2477416992188, + "loss": 0.4208, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6412878036499023, + "rewards/margins": 4.341336727142334, + "rewards/rejected": -6.982624053955078, + "step": 3040 + }, + { + "epoch": 0.7317658349328215, + "grad_norm": 19.527492956725595, + "learning_rate": 1.0181985798221343e-07, + "logits/chosen": -1.1790878772735596, + "logits/rejected": -1.1905752420425415, + "logps/chosen": -470.6045837402344, + "logps/rejected": -782.1622924804688, + "loss": 0.4267, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.268146514892578, + "rewards/margins": 3.100985527038574, + "rewards/rejected": -5.369132041931152, + "step": 3050 + }, + { + "epoch": 0.7341650671785028, + "grad_norm": 17.486447949626083, + "learning_rate": 1.0013868158995329e-07, + "logits/chosen": -1.1854205131530762, + "logits/rejected": -1.268317461013794, + "logps/chosen": -508.628173828125, + "logps/rejected": -689.4181518554688, + "loss": 0.4241, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.384272575378418, + "rewards/margins": 2.0019237995147705, + "rewards/rejected": -4.386197090148926, + "step": 3060 + }, + { + "epoch": 0.7365642994241842, + "grad_norm": 11.876302215183998, + "learning_rate": 9.84680173737887e-08, + "logits/chosen": -1.2807575464248657, + "logits/rejected": -1.3796226978302002, + "logps/chosen": -456.07244873046875, + "logps/rejected": -617.9744873046875, + "loss": 0.4315, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.917088508605957, + "rewards/margins": 2.016084909439087, + "rewards/rejected": -3.933173418045044, + "step": 3070 + }, + { + "epoch": 0.7389635316698656, + "grad_norm": 13.157616210751735, + "learning_rate": 9.680798252417713e-08, + "logits/chosen": -1.3769404888153076, + "logits/rejected": -1.4445879459381104, + "logps/chosen": -420.9883728027344, + "logps/rejected": -629.974853515625, + "loss": 0.4059, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0882909297943115, + "rewards/margins": 1.72428297996521, + "rewards/rejected": -3.8125743865966797, + "step": 3080 + }, + { + "epoch": 0.741362763915547, + "grad_norm": 15.730225571388548, + "learning_rate": 9.515869348596808e-08, + "logits/chosen": -1.1365947723388672, + "logits/rejected": -1.275075912475586, + "logps/chosen": -472.18408203125, + "logps/rejected": -672.9304809570312, + "loss": 0.4284, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8900762796401978, + "rewards/margins": 2.209829807281494, + "rewards/rejected": -4.099905967712402, + "step": 3090 + }, + { + "epoch": 0.7437619961612284, + "grad_norm": 35.145335130920884, + "learning_rate": 9.352026595023493e-08, + "logits/chosen": -1.1994316577911377, + "logits/rejected": -1.2499196529388428, + "logps/chosen": -472.410888671875, + "logps/rejected": -594.9708251953125, + "loss": 0.4289, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9588143825531006, + "rewards/margins": 1.4960181713104248, + "rewards/rejected": -3.4548325538635254, + "step": 3100 + }, + { + "epoch": 0.7461612284069098, + "grad_norm": 15.811033639076967, + "learning_rate": 9.189281484616004e-08, + "logits/chosen": -1.2232351303100586, + "logits/rejected": -1.2345741987228394, + "logps/chosen": -410.38848876953125, + "logps/rejected": -693.65673828125, + "loss": 0.4377, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.075925588607788, + "rewards/margins": 2.4454538822174072, + "rewards/rejected": -4.521379470825195, + "step": 3110 + }, + { + "epoch": 0.7485604606525912, + "grad_norm": 22.73378264594113, + "learning_rate": 9.027645433297249e-08, + "logits/chosen": -1.103428602218628, + "logits/rejected": -1.1875020265579224, + "logps/chosen": -571.9773559570312, + "logps/rejected": -778.9340209960938, + "loss": 0.4763, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.662141799926758, + "rewards/margins": 2.4210867881774902, + "rewards/rejected": -5.083228588104248, + "step": 3120 + }, + { + "epoch": 0.7509596928982726, + "grad_norm": 25.696268435535774, + "learning_rate": 8.867129779194066e-08, + "logits/chosen": -1.244616150856018, + "logits/rejected": -1.353212594985962, + "logps/chosen": -394.8204040527344, + "logps/rejected": -692.91357421875, + "loss": 0.4442, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8012546300888062, + "rewards/margins": 2.963744640350342, + "rewards/rejected": -4.764999866485596, + "step": 3130 + }, + { + "epoch": 0.753358925143954, + "grad_norm": 18.050945946975368, + "learning_rate": 8.707745781841866e-08, + "logits/chosen": -1.1310231685638428, + "logits/rejected": -1.2564256191253662, + "logps/chosen": -449.65362548828125, + "logps/rejected": -771.5709838867188, + "loss": 0.4261, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.2246077060699463, + "rewards/margins": 3.2700328826904297, + "rewards/rejected": -5.494640350341797, + "step": 3140 + }, + { + "epoch": 0.7557581573896354, + "grad_norm": 9.799407812333103, + "learning_rate": 8.549504621394831e-08, + "logits/chosen": -1.2643756866455078, + "logits/rejected": -1.2907123565673828, + "logps/chosen": -408.44683837890625, + "logps/rejected": -748.5787353515625, + "loss": 0.3526, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.775541067123413, + "rewards/margins": 3.4335663318634033, + "rewards/rejected": -5.209107398986816, + "step": 3150 + }, + { + "epoch": 0.7581573896353166, + "grad_norm": 19.977109925350412, + "learning_rate": 8.392417397841703e-08, + "logits/chosen": -1.216590166091919, + "logits/rejected": -1.3006147146224976, + "logps/chosen": -467.8597717285156, + "logps/rejected": -650.8781127929688, + "loss": 0.4394, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0549581050872803, + "rewards/margins": 1.685307502746582, + "rewards/rejected": -3.7402656078338623, + "step": 3160 + }, + { + "epoch": 0.760556621880998, + "grad_norm": 17.465911639708498, + "learning_rate": 8.236495130227083e-08, + "logits/chosen": -1.2152674198150635, + "logits/rejected": -1.3853117227554321, + "logps/chosen": -531.36865234375, + "logps/rejected": -838.9884033203125, + "loss": 0.4623, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.4217429161071777, + "rewards/margins": 3.4808337688446045, + "rewards/rejected": -5.9025774002075195, + "step": 3170 + }, + { + "epoch": 0.7629558541266794, + "grad_norm": 18.484754479900506, + "learning_rate": 8.081748755878612e-08, + "logits/chosen": -1.2535191774368286, + "logits/rejected": -1.3943572044372559, + "logps/chosen": -503.76898193359375, + "logps/rejected": -699.9238891601562, + "loss": 0.4074, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2457404136657715, + "rewards/margins": 2.4922969341278076, + "rewards/rejected": -4.738037109375, + "step": 3180 + }, + { + "epoch": 0.7653550863723608, + "grad_norm": 13.480118265934307, + "learning_rate": 7.928189129639632e-08, + "logits/chosen": -1.1623866558074951, + "logits/rejected": -1.1990493535995483, + "logps/chosen": -440.8330993652344, + "logps/rejected": -671.6934814453125, + "loss": 0.4079, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2158052921295166, + "rewards/margins": 2.1327455043792725, + "rewards/rejected": -4.348550796508789, + "step": 3190 + }, + { + "epoch": 0.7677543186180422, + "grad_norm": 18.218781531194935, + "learning_rate": 7.775827023107834e-08, + "logits/chosen": -1.2353599071502686, + "logits/rejected": -1.2978723049163818, + "logps/chosen": -485.01837158203125, + "logps/rejected": -695.3224487304688, + "loss": 0.4193, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.558899164199829, + "rewards/margins": 1.8993895053863525, + "rewards/rejected": -4.458288669586182, + "step": 3200 + }, + { + "epoch": 0.7701535508637236, + "grad_norm": 23.87409903985215, + "learning_rate": 7.624673123879682e-08, + "logits/chosen": -1.1395564079284668, + "logits/rejected": -1.2785004377365112, + "logps/chosen": -447.79144287109375, + "logps/rejected": -616.0286254882812, + "loss": 0.435, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.080073356628418, + "rewards/margins": 1.8376333713531494, + "rewards/rejected": -3.917706251144409, + "step": 3210 + }, + { + "epoch": 0.772552783109405, + "grad_norm": 20.86151843851289, + "learning_rate": 7.474738034800663e-08, + "logits/chosen": -1.2566897869110107, + "logits/rejected": -1.2571423053741455, + "logps/chosen": -419.4219665527344, + "logps/rejected": -851.1385498046875, + "loss": 0.4659, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.008873701095581, + "rewards/margins": 4.522359371185303, + "rewards/rejected": -6.5312323570251465, + "step": 3220 + }, + { + "epoch": 0.7749520153550864, + "grad_norm": 13.169040672352676, + "learning_rate": 7.326032273221606e-08, + "logits/chosen": -1.3603243827819824, + "logits/rejected": -1.3747715950012207, + "logps/chosen": -500.2078552246094, + "logps/rejected": -785.7582397460938, + "loss": 0.4021, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.25557279586792, + "rewards/margins": 3.053633213043213, + "rewards/rejected": -5.309205532073975, + "step": 3230 + }, + { + "epoch": 0.7773512476007678, + "grad_norm": 16.05314488934455, + "learning_rate": 7.178566270260872e-08, + "logits/chosen": -1.318904995918274, + "logits/rejected": -1.3817777633666992, + "logps/chosen": -521.3937377929688, + "logps/rejected": -800.31103515625, + "loss": 0.4506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.6105079650878906, + "rewards/margins": 2.589186668395996, + "rewards/rejected": -5.199694633483887, + "step": 3240 + }, + { + "epoch": 0.7797504798464492, + "grad_norm": 14.261695457548557, + "learning_rate": 7.032350370072709e-08, + "logits/chosen": -1.2088521718978882, + "logits/rejected": -1.2967567443847656, + "logps/chosen": -463.45794677734375, + "logps/rejected": -668.6795654296875, + "loss": 0.4028, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.936281442642212, + "rewards/margins": 2.147418975830078, + "rewards/rejected": -4.083700180053711, + "step": 3250 + }, + { + "epoch": 0.7821497120921305, + "grad_norm": 12.79782870680645, + "learning_rate": 6.887394829121596e-08, + "logits/chosen": -1.2623844146728516, + "logits/rejected": -1.3965818881988525, + "logps/chosen": -520.1638793945312, + "logps/rejected": -901.8109130859375, + "loss": 0.3906, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.5400707721710205, + "rewards/margins": 4.0481061935424805, + "rewards/rejected": -6.588177680969238, + "step": 3260 + }, + { + "epoch": 0.7845489443378119, + "grad_norm": 16.59225411377178, + "learning_rate": 6.743709815462833e-08, + "logits/chosen": -1.2642148733139038, + "logits/rejected": -1.3561595678329468, + "logps/chosen": -521.087890625, + "logps/rejected": -813.130615234375, + "loss": 0.4099, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.508223056793213, + "rewards/margins": 3.2700836658477783, + "rewards/rejected": -5.7783074378967285, + "step": 3270 + }, + { + "epoch": 0.7869481765834933, + "grad_norm": 14.993047424273575, + "learning_rate": 6.601305408029287e-08, + "logits/chosen": -1.2957048416137695, + "logits/rejected": -1.418428659439087, + "logps/chosen": -458.10076904296875, + "logps/rejected": -786.9931640625, + "loss": 0.3831, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.213374137878418, + "rewards/margins": 3.280181407928467, + "rewards/rejected": -5.493556022644043, + "step": 3280 + }, + { + "epoch": 0.7893474088291746, + "grad_norm": 15.085926001703353, + "learning_rate": 6.460191595924366e-08, + "logits/chosen": -1.2087162733078003, + "logits/rejected": -1.2654608488082886, + "logps/chosen": -449.6219177246094, + "logps/rejected": -705.5459594726562, + "loss": 0.385, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1014938354492188, + "rewards/margins": 2.5265586376190186, + "rewards/rejected": -4.628052711486816, + "step": 3290 + }, + { + "epoch": 0.791746641074856, + "grad_norm": 13.169922978355546, + "learning_rate": 6.320378277721342e-08, + "logits/chosen": -1.3274714946746826, + "logits/rejected": -1.3772881031036377, + "logps/chosen": -465.1910095214844, + "logps/rejected": -624.9302368164062, + "loss": 0.4013, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.1250336170196533, + "rewards/margins": 1.715985894203186, + "rewards/rejected": -3.841019868850708, + "step": 3300 + }, + { + "epoch": 0.7941458733205374, + "grad_norm": 27.82691688189915, + "learning_rate": 6.181875260769032e-08, + "logits/chosen": -1.2468100786209106, + "logits/rejected": -1.4290482997894287, + "logps/chosen": -521.2555541992188, + "logps/rejected": -781.6026000976562, + "loss": 0.416, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.3911871910095215, + "rewards/margins": 3.2505409717559814, + "rewards/rejected": -5.641728401184082, + "step": 3310 + }, + { + "epoch": 0.7965451055662188, + "grad_norm": 15.737223970644676, + "learning_rate": 6.044692260503797e-08, + "logits/chosen": -1.1637942790985107, + "logits/rejected": -1.2922910451889038, + "logps/chosen": -529.7451171875, + "logps/rejected": -853.4421997070312, + "loss": 0.3675, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.4741718769073486, + "rewards/margins": 3.4323413372039795, + "rewards/rejected": -5.906513214111328, + "step": 3320 + }, + { + "epoch": 0.7989443378119002, + "grad_norm": 13.061674320090848, + "learning_rate": 5.9088388997680984e-08, + "logits/chosen": -1.1912027597427368, + "logits/rejected": -1.3324553966522217, + "logps/chosen": -545.3333740234375, + "logps/rejected": -860.1539306640625, + "loss": 0.394, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.197711944580078, + "rewards/margins": 3.962009906768799, + "rewards/rejected": -6.159722328186035, + "step": 3330 + }, + { + "epoch": 0.8013435700575816, + "grad_norm": 19.85934874401157, + "learning_rate": 5.774324708135439e-08, + "logits/chosen": -1.3464608192443848, + "logits/rejected": -1.4455270767211914, + "logps/chosen": -404.1613464355469, + "logps/rejected": -649.46044921875, + "loss": 0.4296, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9079539775848389, + "rewards/margins": 2.655972480773926, + "rewards/rejected": -4.563926696777344, + "step": 3340 + }, + { + "epoch": 0.803742802303263, + "grad_norm": 10.622344250001166, + "learning_rate": 5.641159121241953e-08, + "logits/chosen": -1.340012788772583, + "logits/rejected": -1.3097373247146606, + "logps/chosen": -495.234375, + "logps/rejected": -826.9255981445312, + "loss": 0.4075, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.605710744857788, + "rewards/margins": 3.028745174407959, + "rewards/rejected": -5.634456157684326, + "step": 3350 + }, + { + "epoch": 0.8061420345489443, + "grad_norm": 15.132572988644833, + "learning_rate": 5.5093514801245106e-08, + "logits/chosen": -1.2543226480484009, + "logits/rejected": -1.3029847145080566, + "logps/chosen": -481.427734375, + "logps/rejected": -770.0972900390625, + "loss": 0.4025, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3580222129821777, + "rewards/margins": 2.690171241760254, + "rewards/rejected": -5.04819393157959, + "step": 3360 + }, + { + "epoch": 0.8085412667946257, + "grad_norm": 14.038257828539548, + "learning_rate": 5.378911030565453e-08, + "logits/chosen": -1.1603384017944336, + "logits/rejected": -1.213220238685608, + "logps/chosen": -554.8784790039062, + "logps/rejected": -800.203369140625, + "loss": 0.4264, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7545363903045654, + "rewards/margins": 2.177790403366089, + "rewards/rejected": -4.932326793670654, + "step": 3370 + }, + { + "epoch": 0.8109404990403071, + "grad_norm": 14.786725967482855, + "learning_rate": 5.249846922444101e-08, + "logits/chosen": -1.3400354385375977, + "logits/rejected": -1.4116084575653076, + "logps/chosen": -469.63165283203125, + "logps/rejected": -946.8123779296875, + "loss": 0.3953, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.487299680709839, + "rewards/margins": 4.986563682556152, + "rewards/rejected": -7.473863124847412, + "step": 3380 + }, + { + "epoch": 0.8133397312859885, + "grad_norm": 21.495135238551697, + "learning_rate": 5.122168209094865e-08, + "logits/chosen": -1.2585008144378662, + "logits/rejected": -1.3433778285980225, + "logps/chosen": -421.2335510253906, + "logps/rejected": -548.8929443359375, + "loss": 0.4028, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.097749948501587, + "rewards/margins": 1.2572873830795288, + "rewards/rejected": -3.355037212371826, + "step": 3390 + }, + { + "epoch": 0.8157389635316699, + "grad_norm": 15.584784008274491, + "learning_rate": 4.995883846672222e-08, + "logits/chosen": -1.1034057140350342, + "logits/rejected": -1.2848924398422241, + "logps/chosen": -590.9141845703125, + "logps/rejected": -737.7780151367188, + "loss": 0.4146, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.305182933807373, + "rewards/margins": 2.196429491043091, + "rewards/rejected": -4.501612663269043, + "step": 3400 + }, + { + "epoch": 0.8181381957773513, + "grad_norm": 12.577138913457466, + "learning_rate": 4.871002693522486e-08, + "logits/chosen": -1.2216746807098389, + "logits/rejected": -1.2335078716278076, + "logps/chosen": -490.0227966308594, + "logps/rejected": -685.375732421875, + "loss": 0.4083, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2040867805480957, + "rewards/margins": 2.2901692390441895, + "rewards/rejected": -4.494256496429443, + "step": 3410 + }, + { + "epoch": 0.8205374280230326, + "grad_norm": 12.780423712977628, + "learning_rate": 4.7475335095623956e-08, + "logits/chosen": -1.2985970973968506, + "logits/rejected": -1.2781016826629639, + "logps/chosen": -527.1070556640625, + "logps/rejected": -797.3029174804688, + "loss": 0.4308, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7410807609558105, + "rewards/margins": 2.8199234008789062, + "rewards/rejected": -5.561004638671875, + "step": 3420 + }, + { + "epoch": 0.822936660268714, + "grad_norm": 25.453128411840094, + "learning_rate": 4.6254849556646714e-08, + "logits/chosen": -1.1118319034576416, + "logits/rejected": -1.207648515701294, + "logps/chosen": -549.860595703125, + "logps/rejected": -970.7677612304688, + "loss": 0.4555, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.535156488418579, + "rewards/margins": 4.580714702606201, + "rewards/rejected": -7.115871429443359, + "step": 3430 + }, + { + "epoch": 0.8253358925143954, + "grad_norm": 14.697372336337441, + "learning_rate": 4.504865593050483e-08, + "logits/chosen": -1.2074692249298096, + "logits/rejected": -1.22970449924469, + "logps/chosen": -493.60443115234375, + "logps/rejected": -733.1605224609375, + "loss": 0.4486, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2459022998809814, + "rewards/margins": 2.387206792831421, + "rewards/rejected": -4.633109092712402, + "step": 3440 + }, + { + "epoch": 0.8277351247600768, + "grad_norm": 19.145579102229355, + "learning_rate": 4.385683882688895e-08, + "logits/chosen": -1.0796090364456177, + "logits/rejected": -1.2019519805908203, + "logps/chosen": -480.3756408691406, + "logps/rejected": -558.4412841796875, + "loss": 0.4614, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0972506999969482, + "rewards/margins": 1.18562912940979, + "rewards/rejected": -3.282879590988159, + "step": 3450 + }, + { + "epoch": 0.8301343570057581, + "grad_norm": 17.204779447658726, + "learning_rate": 4.2679481847033985e-08, + "logits/chosen": -1.2189116477966309, + "logits/rejected": -1.2864820957183838, + "logps/chosen": -446.82867431640625, + "logps/rejected": -702.0382080078125, + "loss": 0.4466, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8683035373687744, + "rewards/margins": 2.547211170196533, + "rewards/rejected": -4.415513515472412, + "step": 3460 + }, + { + "epoch": 0.8325335892514395, + "grad_norm": 14.516749938972369, + "learning_rate": 4.151666757785435e-08, + "logits/chosen": -1.1446878910064697, + "logits/rejected": -1.2078421115875244, + "logps/chosen": -391.42791748046875, + "logps/rejected": -725.2740478515625, + "loss": 0.397, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.532698631286621, + "rewards/margins": 3.396210193634033, + "rewards/rejected": -4.928908348083496, + "step": 3470 + }, + { + "epoch": 0.8349328214971209, + "grad_norm": 13.196683844151892, + "learning_rate": 4.036847758615136e-08, + "logits/chosen": -1.091759443283081, + "logits/rejected": -1.2269001007080078, + "logps/chosen": -532.867431640625, + "logps/rejected": -730.4796142578125, + "loss": 0.4464, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.9313502311706543, + "rewards/margins": 1.9276530742645264, + "rewards/rejected": -4.859004020690918, + "step": 3480 + }, + { + "epoch": 0.8373320537428023, + "grad_norm": 12.89862131476654, + "learning_rate": 3.923499241289113e-08, + "logits/chosen": -1.1648555994033813, + "logits/rejected": -1.3147923946380615, + "logps/chosen": -521.7337646484375, + "logps/rejected": -674.8948364257812, + "loss": 0.4302, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.260921001434326, + "rewards/margins": 2.081796646118164, + "rewards/rejected": -4.34271764755249, + "step": 3490 + }, + { + "epoch": 0.8397312859884837, + "grad_norm": 14.173962004603878, + "learning_rate": 3.811629156755541e-08, + "logits/chosen": -1.160355567932129, + "logits/rejected": -1.185987949371338, + "logps/chosen": -477.21728515625, + "logps/rejected": -679.4395751953125, + "loss": 0.4274, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9270728826522827, + "rewards/margins": 2.1360950469970703, + "rewards/rejected": -4.063167572021484, + "step": 3500 + }, + { + "epoch": 0.8421305182341651, + "grad_norm": 11.942462848386326, + "learning_rate": 3.701245352256391e-08, + "logits/chosen": -1.2002038955688477, + "logits/rejected": -1.324733018875122, + "logps/chosen": -478.535400390625, + "logps/rejected": -557.5496826171875, + "loss": 0.4283, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8901561498641968, + "rewards/margins": 1.0154350996017456, + "rewards/rejected": -2.9055914878845215, + "step": 3510 + }, + { + "epoch": 0.8445297504798465, + "grad_norm": 20.5827327935098, + "learning_rate": 3.592355570776984e-08, + "logits/chosen": -1.173332929611206, + "logits/rejected": -1.2609224319458008, + "logps/chosen": -360.0851745605469, + "logps/rejected": -583.3133544921875, + "loss": 0.4195, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4041774272918701, + "rewards/margins": 2.1920089721679688, + "rewards/rejected": -3.5961861610412598, + "step": 3520 + }, + { + "epoch": 0.8469289827255279, + "grad_norm": 10.471012375510664, + "learning_rate": 3.484967450502904e-08, + "logits/chosen": -1.1107840538024902, + "logits/rejected": -1.2250497341156006, + "logps/chosen": -374.8684997558594, + "logps/rejected": -664.1340942382812, + "loss": 0.4038, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7912622690200806, + "rewards/margins": 2.5201168060302734, + "rewards/rejected": -4.3113789558410645, + "step": 3530 + }, + { + "epoch": 0.8493282149712092, + "grad_norm": 20.111970761124727, + "learning_rate": 3.3790885242841296e-08, + "logits/chosen": -1.0951917171478271, + "logits/rejected": -1.1834380626678467, + "logps/chosen": -451.64190673828125, + "logps/rejected": -770.8523559570312, + "loss": 0.3789, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.0603652000427246, + "rewards/margins": 3.2534384727478027, + "rewards/rejected": -5.313803672790527, + "step": 3540 + }, + { + "epoch": 0.8517274472168906, + "grad_norm": 15.121338349842414, + "learning_rate": 3.274726219106677e-08, + "logits/chosen": -1.051578402519226, + "logits/rejected": -1.135506272315979, + "logps/chosen": -485.54840087890625, + "logps/rejected": -721.75439453125, + "loss": 0.4442, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1253767013549805, + "rewards/margins": 2.3960883617401123, + "rewards/rejected": -4.521464824676514, + "step": 3550 + }, + { + "epoch": 0.8541266794625719, + "grad_norm": 15.152890802383466, + "learning_rate": 3.171887855571642e-08, + "logits/chosen": -1.2348651885986328, + "logits/rejected": -1.2165499925613403, + "logps/chosen": -395.697509765625, + "logps/rejected": -543.8568115234375, + "loss": 0.3826, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7078378200531006, + "rewards/margins": 1.5240461826324463, + "rewards/rejected": -3.231884002685547, + "step": 3560 + }, + { + "epoch": 0.8565259117082533, + "grad_norm": 24.44597593565566, + "learning_rate": 3.070580647381643e-08, + "logits/chosen": -1.1522376537322998, + "logits/rejected": -1.2483961582183838, + "logps/chosen": -406.069091796875, + "logps/rejected": -749.8062133789062, + "loss": 0.4548, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.753334403038025, + "rewards/margins": 3.3889122009277344, + "rewards/rejected": -5.142246246337891, + "step": 3570 + }, + { + "epoch": 0.8589251439539347, + "grad_norm": 15.075419291770242, + "learning_rate": 2.9708117008348576e-08, + "logits/chosen": -1.2388461828231812, + "logits/rejected": -1.3630428314208984, + "logps/chosen": -477.43585205078125, + "logps/rejected": -610.2297973632812, + "loss": 0.3969, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.715201735496521, + "rewards/margins": 1.8291780948638916, + "rewards/rejected": -3.544379711151123, + "step": 3580 + }, + { + "epoch": 0.8613243761996161, + "grad_norm": 11.602973232421764, + "learning_rate": 2.8725880143264992e-08, + "logits/chosen": -1.19898521900177, + "logits/rejected": -1.2185986042022705, + "logps/chosen": -449.32598876953125, + "logps/rejected": -633.5596923828125, + "loss": 0.465, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1087489128112793, + "rewards/margins": 1.4358699321746826, + "rewards/rejected": -3.544618606567383, + "step": 3590 + }, + { + "epoch": 0.8637236084452975, + "grad_norm": 21.3005453219283, + "learning_rate": 2.775916477857948e-08, + "logits/chosen": -1.1370158195495605, + "logits/rejected": -1.1749649047851562, + "logps/chosen": -402.0398254394531, + "logps/rejected": -587.6260375976562, + "loss": 0.413, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9864225387573242, + "rewards/margins": 1.8402111530303955, + "rewards/rejected": -3.8266334533691406, + "step": 3600 + }, + { + "epoch": 0.8661228406909789, + "grad_norm": 15.387678499543219, + "learning_rate": 2.680803872553408e-08, + "logits/chosen": -1.2096471786499023, + "logits/rejected": -1.3197107315063477, + "logps/chosen": -445.0570373535156, + "logps/rejected": -827.9861450195312, + "loss": 0.4225, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9052565097808838, + "rewards/margins": 4.059884071350098, + "rewards/rejected": -5.965140342712402, + "step": 3610 + }, + { + "epoch": 0.8685220729366603, + "grad_norm": 24.644670995274364, + "learning_rate": 2.5872568701842706e-08, + "logits/chosen": -1.2481223344802856, + "logits/rejected": -1.3282666206359863, + "logps/chosen": -388.14715576171875, + "logps/rejected": -630.1165161132812, + "loss": 0.4617, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7502208948135376, + "rewards/margins": 2.1968894004821777, + "rewards/rejected": -3.9471099376678467, + "step": 3620 + }, + { + "epoch": 0.8709213051823417, + "grad_norm": 20.103907535930773, + "learning_rate": 2.495282032701096e-08, + "logits/chosen": -1.1463677883148193, + "logits/rejected": -1.3338615894317627, + "logps/chosen": -351.9326477050781, + "logps/rejected": -538.684814453125, + "loss": 0.4117, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.522869348526001, + "rewards/margins": 2.161909580230713, + "rewards/rejected": -3.684778928756714, + "step": 3630 + }, + { + "epoch": 0.8733205374280231, + "grad_norm": 14.949192367966893, + "learning_rate": 2.4048858117733133e-08, + "logits/chosen": -1.2300177812576294, + "logits/rejected": -1.345139741897583, + "logps/chosen": -454.37628173828125, + "logps/rejected": -743.2384033203125, + "loss": 0.3729, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8625742197036743, + "rewards/margins": 3.337106227874756, + "rewards/rejected": -5.199681282043457, + "step": 3640 + }, + { + "epoch": 0.8757197696737045, + "grad_norm": 17.259398265317675, + "learning_rate": 2.3160745483366938e-08, + "logits/chosen": -1.2201354503631592, + "logits/rejected": -1.259948492050171, + "logps/chosen": -431.505126953125, + "logps/rejected": -647.8377685546875, + "loss": 0.4208, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0006277561187744, + "rewards/margins": 1.8061168193817139, + "rewards/rejected": -3.806744337081909, + "step": 3650 + }, + { + "epoch": 0.8781190019193857, + "grad_norm": 26.593038878856742, + "learning_rate": 2.2288544721485197e-08, + "logits/chosen": -1.1473274230957031, + "logits/rejected": -1.1531964540481567, + "logps/chosen": -367.77691650390625, + "logps/rejected": -670.8995361328125, + "loss": 0.4029, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.509234070777893, + "rewards/margins": 2.8308663368225098, + "rewards/rejected": -4.340100288391113, + "step": 3660 + }, + { + "epoch": 0.8805182341650671, + "grad_norm": 17.09126226618081, + "learning_rate": 2.1432317013506117e-08, + "logits/chosen": -1.2680007219314575, + "logits/rejected": -1.3855565786361694, + "logps/chosen": -459.4864807128906, + "logps/rejected": -617.2786254882812, + "loss": 0.4376, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9778245687484741, + "rewards/margins": 1.9757041931152344, + "rewards/rejected": -3.953528881072998, + "step": 3670 + }, + { + "epoch": 0.8829174664107485, + "grad_norm": 22.964518655362124, + "learning_rate": 2.0592122420401704e-08, + "logits/chosen": -1.0826637744903564, + "logits/rejected": -1.218787431716919, + "logps/chosen": -442.7049865722656, + "logps/rejected": -622.1304931640625, + "loss": 0.4426, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.082751750946045, + "rewards/margins": 1.7740720510482788, + "rewards/rejected": -3.8568243980407715, + "step": 3680 + }, + { + "epoch": 0.8853166986564299, + "grad_norm": 16.601339844348978, + "learning_rate": 1.976801987848459e-08, + "logits/chosen": -1.2238892316818237, + "logits/rejected": -1.2646934986114502, + "logps/chosen": -454.4891052246094, + "logps/rejected": -779.9896850585938, + "loss": 0.4293, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8976598978042603, + "rewards/margins": 3.114412784576416, + "rewards/rejected": -5.012072563171387, + "step": 3690 + }, + { + "epoch": 0.8877159309021113, + "grad_norm": 17.11872715651324, + "learning_rate": 1.8960067195273987e-08, + "logits/chosen": -1.2767010927200317, + "logits/rejected": -1.3807860612869263, + "logps/chosen": -399.5985412597656, + "logps/rejected": -693.5151977539062, + "loss": 0.3976, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8493973016738892, + "rewards/margins": 2.9295060634613037, + "rewards/rejected": -4.778903484344482, + "step": 3700 + }, + { + "epoch": 0.8901151631477927, + "grad_norm": 16.6413432588174, + "learning_rate": 1.816832104544072e-08, + "logits/chosen": -1.098815679550171, + "logits/rejected": -1.1592333316802979, + "logps/chosen": -470.98553466796875, + "logps/rejected": -625.4569091796875, + "loss": 0.3886, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.982060432434082, + "rewards/margins": 1.7682058811187744, + "rewards/rejected": -3.7502663135528564, + "step": 3710 + }, + { + "epoch": 0.8925143953934741, + "grad_norm": 11.891797748013655, + "learning_rate": 1.7392836966831553e-08, + "logits/chosen": -1.0697181224822998, + "logits/rejected": -1.194319486618042, + "logps/chosen": -475.51898193359375, + "logps/rejected": -717.6954345703125, + "loss": 0.3918, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.0813956260681152, + "rewards/margins": 2.7703046798706055, + "rewards/rejected": -4.851700782775879, + "step": 3720 + }, + { + "epoch": 0.8949136276391555, + "grad_norm": 17.887300709912445, + "learning_rate": 1.663366935657373e-08, + "logits/chosen": -1.2668213844299316, + "logits/rejected": -1.4020304679870605, + "logps/chosen": -410.6656188964844, + "logps/rejected": -631.67041015625, + "loss": 0.4465, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.830963373184204, + "rewards/margins": 2.1420297622680664, + "rewards/rejected": -3.9729931354522705, + "step": 3730 + }, + { + "epoch": 0.8973128598848369, + "grad_norm": 21.650200131935442, + "learning_rate": 1.5890871467258898e-08, + "logits/chosen": -1.0380961894989014, + "logits/rejected": -1.1259255409240723, + "logps/chosen": -538.32568359375, + "logps/rejected": -709.1238403320312, + "loss": 0.4203, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.206559896469116, + "rewards/margins": 2.0453429222106934, + "rewards/rejected": -4.2519025802612305, + "step": 3740 + }, + { + "epoch": 0.8997120921305183, + "grad_norm": 12.405877408803793, + "learning_rate": 1.5164495403207967e-08, + "logits/chosen": -1.2166404724121094, + "logits/rejected": -1.2211335897445679, + "logps/chosen": -480.22528076171875, + "logps/rejected": -792.5941162109375, + "loss": 0.3954, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.314863443374634, + "rewards/margins": 2.8861892223358154, + "rewards/rejected": -5.201052665710449, + "step": 3750 + }, + { + "epoch": 0.9021113243761996, + "grad_norm": 12.728466226110546, + "learning_rate": 1.4454592116815962e-08, + "logits/chosen": -1.1239063739776611, + "logits/rejected": -1.1524550914764404, + "logps/chosen": -412.11602783203125, + "logps/rejected": -646.6449584960938, + "loss": 0.3604, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.66254460811615, + "rewards/margins": 2.2538654804229736, + "rewards/rejected": -3.916410446166992, + "step": 3760 + }, + { + "epoch": 0.904510556621881, + "grad_norm": 11.308755733612493, + "learning_rate": 1.3761211404977934e-08, + "logits/chosen": -1.2462382316589355, + "logits/rejected": -1.2617241144180298, + "logps/chosen": -481.52642822265625, + "logps/rejected": -763.2169189453125, + "loss": 0.3466, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.5226385593414307, + "rewards/margins": 2.925380229949951, + "rewards/rejected": -5.448019504547119, + "step": 3770 + }, + { + "epoch": 0.9069097888675623, + "grad_norm": 20.409261229736146, + "learning_rate": 1.3084401905596177e-08, + "logits/chosen": -1.1374547481536865, + "logits/rejected": -1.3039876222610474, + "logps/chosen": -499.5015563964844, + "logps/rejected": -709.859375, + "loss": 0.4434, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0284790992736816, + "rewards/margins": 2.706104040145874, + "rewards/rejected": -4.734583377838135, + "step": 3780 + }, + { + "epoch": 0.9093090211132437, + "grad_norm": 17.1782781112593, + "learning_rate": 1.2424211094168053e-08, + "logits/chosen": -1.1101362705230713, + "logits/rejected": -1.2460038661956787, + "logps/chosen": -517.1344604492188, + "logps/rejected": -742.2302856445312, + "loss": 0.4041, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.067178726196289, + "rewards/margins": 2.329007625579834, + "rewards/rejected": -4.396185874938965, + "step": 3790 + }, + { + "epoch": 0.9117082533589251, + "grad_norm": 30.065326876665342, + "learning_rate": 1.1780685280456143e-08, + "logits/chosen": -1.237917184829712, + "logits/rejected": -1.3036408424377441, + "logps/chosen": -539.2681274414062, + "logps/rejected": -925.9050903320312, + "loss": 0.4439, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.540894031524658, + "rewards/margins": 3.8281638622283936, + "rewards/rejected": -6.369057655334473, + "step": 3800 + }, + { + "epoch": 0.9141074856046065, + "grad_norm": 20.319800037171195, + "learning_rate": 1.1153869605239564e-08, + "logits/chosen": -1.231994390487671, + "logits/rejected": -1.3546103239059448, + "logps/chosen": -441.52520751953125, + "logps/rejected": -568.5699462890625, + "loss": 0.4091, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8595590591430664, + "rewards/margins": 1.6289126873016357, + "rewards/rejected": -3.488471508026123, + "step": 3810 + }, + { + "epoch": 0.9165067178502879, + "grad_norm": 17.420902765437226, + "learning_rate": 1.0543808037147606e-08, + "logits/chosen": -1.2463701963424683, + "logits/rejected": -1.2932254076004028, + "logps/chosen": -475.19195556640625, + "logps/rejected": -830.6253051757812, + "loss": 0.394, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.176619052886963, + "rewards/margins": 3.5470027923583984, + "rewards/rejected": -5.7236223220825195, + "step": 3820 + }, + { + "epoch": 0.9189059500959693, + "grad_norm": 13.877437771957279, + "learning_rate": 9.95054336957557e-09, + "logits/chosen": -1.245715618133545, + "logits/rejected": -1.245986819267273, + "logps/chosen": -431.06024169921875, + "logps/rejected": -607.4310302734375, + "loss": 0.4043, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.873225450515747, + "rewards/margins": 1.7056442499160767, + "rewards/rejected": -3.578869581222534, + "step": 3830 + }, + { + "epoch": 0.9213051823416507, + "grad_norm": 22.780099057725195, + "learning_rate": 9.37411721768286e-09, + "logits/chosen": -1.3027960062026978, + "logits/rejected": -1.3543469905853271, + "logps/chosen": -500.34527587890625, + "logps/rejected": -797.4141845703125, + "loss": 0.3995, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.248213291168213, + "rewards/margins": 2.695244312286377, + "rewards/rejected": -4.943457126617432, + "step": 3840 + }, + { + "epoch": 0.9237044145873321, + "grad_norm": 18.879115732312695, + "learning_rate": 8.81457001547392e-09, + "logits/chosen": -1.179421067237854, + "logits/rejected": -1.2008545398712158, + "logps/chosen": -445.45611572265625, + "logps/rejected": -636.3636474609375, + "loss": 0.3706, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.106031894683838, + "rewards/margins": 1.8034794330596924, + "rewards/rejected": -3.909511089324951, + "step": 3850 + }, + { + "epoch": 0.9261036468330134, + "grad_norm": 13.946836764722176, + "learning_rate": 8.271941012961942e-09, + "logits/chosen": -1.1962236166000366, + "logits/rejected": -1.1757996082305908, + "logps/chosen": -434.50238037109375, + "logps/rejected": -851.7151489257812, + "loss": 0.4029, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2081894874572754, + "rewards/margins": 3.6091766357421875, + "rewards/rejected": -5.817366600036621, + "step": 3860 + }, + { + "epoch": 0.9285028790786948, + "grad_norm": 21.809897538914896, + "learning_rate": 7.746268273415568e-09, + "logits/chosen": -1.3298779726028442, + "logits/rejected": -1.2550171613693237, + "logps/chosen": -448.08612060546875, + "logps/rejected": -611.9649658203125, + "loss": 0.3966, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8825676441192627, + "rewards/margins": 1.1555382013320923, + "rewards/rejected": -3.0381054878234863, + "step": 3870 + }, + { + "epoch": 0.9309021113243762, + "grad_norm": 13.467439618498904, + "learning_rate": 7.237588670689076e-09, + "logits/chosen": -1.2014961242675781, + "logits/rejected": -1.348229169845581, + "logps/chosen": -491.67352294921875, + "logps/rejected": -760.8187866210938, + "loss": 0.3867, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.231884479522705, + "rewards/margins": 3.0948691368103027, + "rewards/rejected": -5.32675313949585, + "step": 3880 + }, + { + "epoch": 0.9333013435700576, + "grad_norm": 17.106585504517614, + "learning_rate": 6.745937886635606e-09, + "logits/chosen": -1.239768624305725, + "logits/rejected": -1.2870354652404785, + "logps/chosen": -483.9998474121094, + "logps/rejected": -888.3824462890625, + "loss": 0.4068, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.029411554336548, + "rewards/margins": 4.014785289764404, + "rewards/rejected": -6.044196128845215, + "step": 3890 + }, + { + "epoch": 0.935700575815739, + "grad_norm": 17.65443575752541, + "learning_rate": 6.271350408604409e-09, + "logits/chosen": -1.2722991704940796, + "logits/rejected": -1.2935806512832642, + "logps/chosen": -389.75628662109375, + "logps/rejected": -633.9022216796875, + "loss": 0.3956, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7770601511001587, + "rewards/margins": 2.264246702194214, + "rewards/rejected": -4.041306495666504, + "step": 3900 + }, + { + "epoch": 0.9380998080614203, + "grad_norm": 12.612133126164304, + "learning_rate": 5.813859527021487e-09, + "logits/chosen": -1.22406804561615, + "logits/rejected": -1.323305606842041, + "logps/chosen": -508.7425842285156, + "logps/rejected": -782.00830078125, + "loss": 0.338, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.409405469894409, + "rewards/margins": 3.0094313621520996, + "rewards/rejected": -5.418837070465088, + "step": 3910 + }, + { + "epoch": 0.9404990403071017, + "grad_norm": 13.294898316790011, + "learning_rate": 5.373497333054616e-09, + "logits/chosen": -1.2985012531280518, + "logits/rejected": -1.3380292654037476, + "logps/chosen": -493.2757873535156, + "logps/rejected": -623.1602783203125, + "loss": 0.4384, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2983577251434326, + "rewards/margins": 1.3695790767669678, + "rewards/rejected": -3.6679370403289795, + "step": 3920 + }, + { + "epoch": 0.9428982725527831, + "grad_norm": 15.284137135135785, + "learning_rate": 4.950294716362213e-09, + "logits/chosen": -1.2158329486846924, + "logits/rejected": -1.3253077268600464, + "logps/chosen": -508.41424560546875, + "logps/rejected": -638.3739624023438, + "loss": 0.4209, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2684988975524902, + "rewards/margins": 1.3235927820205688, + "rewards/rejected": -3.5920920372009277, + "step": 3930 + }, + { + "epoch": 0.9452975047984645, + "grad_norm": 15.219945748745712, + "learning_rate": 4.544281362926422e-09, + "logits/chosen": -1.1814640760421753, + "logits/rejected": -1.213180422782898, + "logps/chosen": -480.6612854003906, + "logps/rejected": -714.3478393554688, + "loss": 0.443, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8659789562225342, + "rewards/margins": 2.4482996463775635, + "rewards/rejected": -4.314279079437256, + "step": 3940 + }, + { + "epoch": 0.9476967370441459, + "grad_norm": 12.199705253251745, + "learning_rate": 4.15548575297095e-09, + "logits/chosen": -1.1539068222045898, + "logits/rejected": -1.2610228061676025, + "logps/chosen": -470.3968811035156, + "logps/rejected": -807.7662353515625, + "loss": 0.3508, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2147834300994873, + "rewards/margins": 3.456162214279175, + "rewards/rejected": -5.6709465980529785, + "step": 3950 + }, + { + "epoch": 0.9500959692898272, + "grad_norm": 9.529613600157132, + "learning_rate": 3.7839351589631366e-09, + "logits/chosen": -1.2287517786026, + "logits/rejected": -1.1175611019134521, + "logps/chosen": -470.9747009277344, + "logps/rejected": -738.1583251953125, + "loss": 0.4044, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.675825595855713, + "rewards/margins": 2.222598075866699, + "rewards/rejected": -4.89842414855957, + "step": 3960 + }, + { + "epoch": 0.9524952015355086, + "grad_norm": 18.605428038684874, + "learning_rate": 3.4296556437010405e-09, + "logits/chosen": -1.2765130996704102, + "logits/rejected": -1.2980254888534546, + "logps/chosen": -460.11541748046875, + "logps/rejected": -656.4251098632812, + "loss": 0.4182, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.518393039703369, + "rewards/margins": 2.0247464179992676, + "rewards/rejected": -4.543139457702637, + "step": 3970 + }, + { + "epoch": 0.95489443378119, + "grad_norm": 14.363883415576543, + "learning_rate": 3.092672058485124e-09, + "logits/chosen": -1.3632985353469849, + "logits/rejected": -1.3597389459609985, + "logps/chosen": -513.4880981445312, + "logps/rejected": -859.6558837890625, + "loss": 0.4086, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.856234550476074, + "rewards/margins": 3.326939821243286, + "rewards/rejected": -6.183174133300781, + "step": 3980 + }, + { + "epoch": 0.9572936660268714, + "grad_norm": 24.170474346453393, + "learning_rate": 2.7730080413750356e-09, + "logits/chosen": -1.1727163791656494, + "logits/rejected": -1.296337604522705, + "logps/chosen": -467.4671325683594, + "logps/rejected": -634.9149169921875, + "loss": 0.4019, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.021527051925659, + "rewards/margins": 1.6675231456756592, + "rewards/rejected": -3.6890506744384766, + "step": 3990 + }, + { + "epoch": 0.9596928982725528, + "grad_norm": 15.145186840475311, + "learning_rate": 2.4706860155316033e-09, + "logits/chosen": -1.1688404083251953, + "logits/rejected": -1.2660518884658813, + "logps/chosen": -569.9762573242188, + "logps/rejected": -844.1318359375, + "loss": 0.3981, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3475515842437744, + "rewards/margins": 2.774146556854248, + "rewards/rejected": -5.121697902679443, + "step": 4000 + }, + { + "epoch": 0.9620921305182342, + "grad_norm": 20.980446798559257, + "learning_rate": 2.185727187643843e-09, + "logits/chosen": -1.2156826257705688, + "logits/rejected": -1.2687807083129883, + "logps/chosen": -427.87774658203125, + "logps/rejected": -791.0867309570312, + "loss": 0.4573, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1100106239318848, + "rewards/margins": 3.6505370140075684, + "rewards/rejected": -5.7605485916137695, + "step": 4010 + }, + { + "epoch": 0.9644913627639156, + "grad_norm": 26.09657808054691, + "learning_rate": 1.9181515464413434e-09, + "logits/chosen": -1.0822070837020874, + "logits/rejected": -1.1274657249450684, + "logps/chosen": -565.4129028320312, + "logps/rejected": -877.5753784179688, + "loss": 0.3914, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3460755348205566, + "rewards/margins": 3.1658713817596436, + "rewards/rejected": -5.511946678161621, + "step": 4020 + }, + { + "epoch": 0.966890595009597, + "grad_norm": 20.358263015517405, + "learning_rate": 1.6679778612923302e-09, + "logits/chosen": -1.2023911476135254, + "logits/rejected": -1.3535006046295166, + "logps/chosen": -511.30792236328125, + "logps/rejected": -691.3760375976562, + "loss": 0.3992, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.373955011367798, + "rewards/margins": 1.693084716796875, + "rewards/rejected": -4.0670390129089355, + "step": 4030 + }, + { + "epoch": 0.9692898272552783, + "grad_norm": 17.09466339113468, + "learning_rate": 1.43522368088686e-09, + "logits/chosen": -1.2254010438919067, + "logits/rejected": -1.3339544534683228, + "logps/chosen": -497.1849670410156, + "logps/rejected": -889.703125, + "loss": 0.48, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4439101219177246, + "rewards/margins": 3.9405925273895264, + "rewards/rejected": -6.384502410888672, + "step": 4040 + }, + { + "epoch": 0.9716890595009597, + "grad_norm": 17.680515818931642, + "learning_rate": 1.2199053320059993e-09, + "logits/chosen": -1.2316696643829346, + "logits/rejected": -1.2435214519500732, + "logps/chosen": -472.69000244140625, + "logps/rejected": -706.5465087890625, + "loss": 0.3977, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.054659605026245, + "rewards/margins": 2.194859504699707, + "rewards/rejected": -4.249519348144531, + "step": 4050 + }, + { + "epoch": 0.974088291746641, + "grad_norm": 19.737454721261717, + "learning_rate": 1.0220379183764338e-09, + "logits/chosen": -1.2599509954452515, + "logits/rejected": -1.2703198194503784, + "logps/chosen": -364.81658935546875, + "logps/rejected": -650.0493774414062, + "loss": 0.3815, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5390068292617798, + "rewards/margins": 2.7725987434387207, + "rewards/rejected": -4.311606407165527, + "step": 4060 + }, + { + "epoch": 0.9764875239923224, + "grad_norm": 20.629402096960852, + "learning_rate": 8.416353196111503e-10, + "logits/chosen": -1.2555500268936157, + "logits/rejected": -1.2429850101470947, + "logps/chosen": -496.2998046875, + "logps/rejected": -772.2160034179688, + "loss": 0.4317, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.5762507915496826, + "rewards/margins": 2.843451499938965, + "rewards/rejected": -5.419702053070068, + "step": 4070 + }, + { + "epoch": 0.9788867562380038, + "grad_norm": 21.54194458199129, + "learning_rate": 6.787101902356873e-10, + "logits/chosen": -1.3214588165283203, + "logits/rejected": -1.3182449340820312, + "logps/chosen": -515.1784057617188, + "logps/rejected": -786.1739501953125, + "loss": 0.4275, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.5369973182678223, + "rewards/margins": 2.521944999694824, + "rewards/rejected": -5.058941841125488, + "step": 4080 + }, + { + "epoch": 0.9812859884836852, + "grad_norm": 22.64402950848799, + "learning_rate": 5.332739588005953e-10, + "logits/chosen": -1.2550867795944214, + "logits/rejected": -1.322644591331482, + "logps/chosen": -376.3187561035156, + "logps/rejected": -727.2735595703125, + "loss": 0.4059, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7436511516571045, + "rewards/margins": 3.295630693435669, + "rewards/rejected": -5.039282321929932, + "step": 4090 + }, + { + "epoch": 0.9836852207293666, + "grad_norm": 22.391181670910232, + "learning_rate": 4.053368270797164e-10, + "logits/chosen": -1.2337408065795898, + "logits/rejected": -1.2769973278045654, + "logps/chosen": -468.5732421875, + "logps/rejected": -767.9414672851562, + "loss": 0.4017, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.362514019012451, + "rewards/margins": 3.0355606079101562, + "rewards/rejected": -5.398074150085449, + "step": 4100 + }, + { + "epoch": 0.986084452975048, + "grad_norm": 14.32266939448474, + "learning_rate": 2.949077693545354e-10, + "logits/chosen": -1.1863398551940918, + "logits/rejected": -1.2951616048812866, + "logps/chosen": -504.52801513671875, + "logps/rejected": -719.0045166015625, + "loss": 0.4678, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.333911895751953, + "rewards/margins": 1.8691895008087158, + "rewards/rejected": -4.203102111816406, + "step": 4110 + }, + { + "epoch": 0.9884836852207294, + "grad_norm": 23.976635069300592, + "learning_rate": 2.0199453178471047e-10, + "logits/chosen": -1.126481294631958, + "logits/rejected": -1.2799094915390015, + "logps/chosen": -538.2355346679688, + "logps/rejected": -603.18994140625, + "loss": 0.4332, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.318129062652588, + "rewards/margins": 1.0544296503067017, + "rewards/rejected": -3.372559070587158, + "step": 4120 + }, + { + "epoch": 0.9908829174664108, + "grad_norm": 40.95175275330808, + "learning_rate": 1.266036318647301e-10, + "logits/chosen": -1.2519207000732422, + "logits/rejected": -1.3095006942749023, + "logps/chosen": -540.7717895507812, + "logps/rejected": -778.435791015625, + "loss": 0.4474, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4916446208953857, + "rewards/margins": 2.5904107093811035, + "rewards/rejected": -5.08205509185791, + "step": 4130 + }, + { + "epoch": 0.9932821497120922, + "grad_norm": 17.066173818194596, + "learning_rate": 6.874035796672339e-11, + "logits/chosen": -1.2024726867675781, + "logits/rejected": -1.2957074642181396, + "logps/chosen": -512.9348754882812, + "logps/rejected": -842.3238525390625, + "loss": 0.4125, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.164402961730957, + "rewards/margins": 3.843069076538086, + "rewards/rejected": -6.007472515106201, + "step": 4140 + }, + { + "epoch": 0.9956813819577736, + "grad_norm": 20.618667625472924, + "learning_rate": 2.8408768969423458e-11, + "logits/chosen": -1.2141971588134766, + "logits/rejected": -1.2368704080581665, + "logps/chosen": -452.35345458984375, + "logps/rejected": -660.5663452148438, + "loss": 0.3909, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.867018699645996, + "rewards/margins": 2.0285303592681885, + "rewards/rejected": -3.8955492973327637, + "step": 4150 + }, + { + "epoch": 0.9980806142034548, + "grad_norm": 21.52118343490649, + "learning_rate": 5.611693973617271e-12, + "logits/chosen": -1.2614226341247559, + "logits/rejected": -1.27662193775177, + "logps/chosen": -454.05596923828125, + "logps/rejected": -670.5745239257812, + "loss": 0.4453, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.251619338989258, + "rewards/margins": 2.0149483680725098, + "rewards/rejected": -4.266567707061768, + "step": 4160 + }, + { + "epoch": 1.0, + "step": 4168, + "total_flos": 0.0, + "train_loss": 0.4679888197991303, + "train_runtime": 14179.2142, + "train_samples_per_second": 9.406, + "train_steps_per_second": 0.294 + } + ], + "logging_steps": 10, + "max_steps": 4168, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}