{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 2000, "global_step": 5733, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005232862375719519, "grad_norm": 84.98858388556421, "learning_rate": 8.710801393728223e-10, "logits/chosen": -3.3605234622955322, "logits/rejected": -3.29974365234375, "logps/chosen": -511.38861083984375, "logps/rejected": -608.7561645507812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0052328623757195184, "grad_norm": 79.43213747615805, "learning_rate": 8.710801393728223e-09, "logits/chosen": -2.7466022968292236, "logits/rejected": -2.7475805282592773, "logps/chosen": -345.8673095703125, "logps/rejected": -288.7480163574219, "loss": 0.69, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.007846314460039139, "rewards/margins": 0.002511692699044943, "rewards/rejected": 0.005334621295332909, "step": 10 }, { "epoch": 0.010465724751439037, "grad_norm": 78.17853332408012, "learning_rate": 1.7421602787456446e-08, "logits/chosen": -2.7542061805725098, "logits/rejected": -2.746408224105835, "logps/chosen": -234.43270874023438, "logps/rejected": -222.5880584716797, "loss": 0.6938, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.005010186228901148, "rewards/margins": -0.007448701653629541, "rewards/rejected": 0.0024385168217122555, "step": 20 }, { "epoch": 0.015698587127158554, "grad_norm": 76.48297301996568, "learning_rate": 2.6132404181184667e-08, "logits/chosen": -2.611788749694824, "logits/rejected": -2.5875256061553955, "logps/chosen": -311.51861572265625, "logps/rejected": -283.8876953125, "loss": 0.6939, "rewards/accuracies": 0.625, "rewards/chosen": 0.0030460101552307606, "rewards/margins": 0.00527621153742075, "rewards/rejected": -0.00223020208068192, "step": 30 }, { "epoch": 0.020931449502878074, "grad_norm": 79.23012990922979, "learning_rate": 3.484320557491289e-08, "logits/chosen": -2.8673622608184814, "logits/rejected": -2.744708299636841, "logps/chosen": -327.95245361328125, "logps/rejected": -314.35400390625, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.007563448045402765, "rewards/margins": 0.004399295896291733, "rewards/rejected": 0.003164151683449745, "step": 40 }, { "epoch": 0.026164311878597593, "grad_norm": 83.70050104654835, "learning_rate": 4.355400696864111e-08, "logits/chosen": -2.904484987258911, "logits/rejected": -2.7473223209381104, "logps/chosen": -304.32318115234375, "logps/rejected": -276.08258056640625, "loss": 0.6934, "rewards/accuracies": 0.375, "rewards/chosen": -0.012552691623568535, "rewards/margins": -0.014694769866764545, "rewards/rejected": 0.0021420782431960106, "step": 50 }, { "epoch": 0.03139717425431711, "grad_norm": 81.16686963609796, "learning_rate": 5.2264808362369334e-08, "logits/chosen": -2.760773181915283, "logits/rejected": -2.7229771614074707, "logps/chosen": -281.700927734375, "logps/rejected": -275.43499755859375, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": -0.0024287248961627483, "rewards/margins": 0.00016310028149746358, "rewards/rejected": -0.002591826021671295, "step": 60 }, { "epoch": 0.03663003663003663, "grad_norm": 87.29945601840559, "learning_rate": 6.097560975609756e-08, "logits/chosen": -2.875711679458618, "logits/rejected": -2.777132511138916, "logps/chosen": -340.478515625, "logps/rejected": -273.4681701660156, "loss": 0.6923, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0026040722150355577, "rewards/margins": -0.0013601541286334395, "rewards/rejected": 0.003964226692914963, "step": 70 }, { "epoch": 0.04186289900575615, "grad_norm": 80.29181059309379, "learning_rate": 6.968641114982578e-08, "logits/chosen": -2.816094398498535, "logits/rejected": -2.787306308746338, "logps/chosen": -315.94891357421875, "logps/rejected": -313.4256896972656, "loss": 0.6941, "rewards/accuracies": 0.5, "rewards/chosen": -0.003847701009362936, "rewards/margins": -0.0019189619924873114, "rewards/rejected": -0.0019287395989522338, "step": 80 }, { "epoch": 0.04709576138147567, "grad_norm": 72.93777220164954, "learning_rate": 7.8397212543554e-08, "logits/chosen": -2.819770574569702, "logits/rejected": -2.770904302597046, "logps/chosen": -254.18325805664062, "logps/rejected": -221.6609649658203, "loss": 0.6959, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.003520088968798518, "rewards/margins": 0.007436218671500683, "rewards/rejected": -0.003916130401194096, "step": 90 }, { "epoch": 0.052328623757195186, "grad_norm": 71.94653775183346, "learning_rate": 8.710801393728223e-08, "logits/chosen": -2.8072497844696045, "logits/rejected": -2.679840564727783, "logps/chosen": -229.0762939453125, "logps/rejected": -204.06356811523438, "loss": 0.6902, "rewards/accuracies": 0.625, "rewards/chosen": 0.005685704294592142, "rewards/margins": 0.012103366665542126, "rewards/rejected": -0.006417661905288696, "step": 100 }, { "epoch": 0.0575614861329147, "grad_norm": 78.88850026339597, "learning_rate": 9.581881533101045e-08, "logits/chosen": -2.7377512454986572, "logits/rejected": -2.602003574371338, "logps/chosen": -271.032470703125, "logps/rejected": -203.76358032226562, "loss": 0.6899, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.007942494004964828, "rewards/margins": -0.012209742330014706, "rewards/rejected": 0.004267249722033739, "step": 110 }, { "epoch": 0.06279434850863422, "grad_norm": 76.99435072846764, "learning_rate": 1.0452961672473867e-07, "logits/chosen": -2.879337787628174, "logits/rejected": -2.874720811843872, "logps/chosen": -381.11151123046875, "logps/rejected": -352.8544921875, "loss": 0.6886, "rewards/accuracies": 0.5, "rewards/chosen": -0.0011743694776669145, "rewards/margins": -0.005505544599145651, "rewards/rejected": 0.004331176169216633, "step": 120 }, { "epoch": 0.06802721088435375, "grad_norm": 77.8746093168026, "learning_rate": 1.132404181184669e-07, "logits/chosen": -2.7185428142547607, "logits/rejected": -2.693519115447998, "logps/chosen": -235.90042114257812, "logps/rejected": -245.9110870361328, "loss": 0.691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00215378450229764, "rewards/margins": 0.008324772119522095, "rewards/rejected": -0.010478556156158447, "step": 130 }, { "epoch": 0.07326007326007326, "grad_norm": 77.75033765594083, "learning_rate": 1.219512195121951e-07, "logits/chosen": -2.6036200523376465, "logits/rejected": -2.63554048538208, "logps/chosen": -307.88336181640625, "logps/rejected": -318.00921630859375, "loss": 0.6876, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.010539834387600422, "rewards/margins": 0.01989991031587124, "rewards/rejected": -0.009360076859593391, "step": 140 }, { "epoch": 0.07849293563579278, "grad_norm": 83.66004050529598, "learning_rate": 1.3066202090592334e-07, "logits/chosen": -2.842623233795166, "logits/rejected": -2.7181859016418457, "logps/chosen": -272.5040588378906, "logps/rejected": -254.56149291992188, "loss": 0.6853, "rewards/accuracies": 0.5, "rewards/chosen": 0.0004568263830151409, "rewards/margins": -0.01074077095836401, "rewards/rejected": 0.011197598651051521, "step": 150 }, { "epoch": 0.0837257980115123, "grad_norm": 80.22034194722806, "learning_rate": 1.3937282229965157e-07, "logits/chosen": -2.8325233459472656, "logits/rejected": -2.6719284057617188, "logps/chosen": -273.96197509765625, "logps/rejected": -241.1158447265625, "loss": 0.6822, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.022296609356999397, "rewards/margins": 0.03128524124622345, "rewards/rejected": -0.008988635614514351, "step": 160 }, { "epoch": 0.08895866038723181, "grad_norm": 90.62330477252651, "learning_rate": 1.480836236933798e-07, "logits/chosen": -2.7778377532958984, "logits/rejected": -2.786107301712036, "logps/chosen": -246.99569702148438, "logps/rejected": -278.2984619140625, "loss": 0.6836, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.000674112350679934, "rewards/margins": 0.008748064748942852, "rewards/rejected": -0.008073952980339527, "step": 170 }, { "epoch": 0.09419152276295134, "grad_norm": 65.34284837182982, "learning_rate": 1.56794425087108e-07, "logits/chosen": -2.7959346771240234, "logits/rejected": -2.655682325363159, "logps/chosen": -301.09344482421875, "logps/rejected": -274.50665283203125, "loss": 0.6801, "rewards/accuracies": 0.625, "rewards/chosen": 0.0035839497577399015, "rewards/margins": 0.035895925015211105, "rewards/rejected": -0.032311975955963135, "step": 180 }, { "epoch": 0.09942438513867086, "grad_norm": 85.2461007419555, "learning_rate": 1.6550522648083622e-07, "logits/chosen": -2.8159756660461426, "logits/rejected": -2.6639835834503174, "logps/chosen": -314.5155029296875, "logps/rejected": -264.17413330078125, "loss": 0.6679, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0449281670153141, "rewards/margins": 0.06909157335758209, "rewards/rejected": -0.02416340447962284, "step": 190 }, { "epoch": 0.10465724751439037, "grad_norm": 70.83663131764116, "learning_rate": 1.7421602787456445e-07, "logits/chosen": -2.7046008110046387, "logits/rejected": -2.6014657020568848, "logps/chosen": -280.58026123046875, "logps/rejected": -305.3539733886719, "loss": 0.6675, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0308912992477417, "rewards/margins": 0.053800441324710846, "rewards/rejected": -0.022909147664904594, "step": 200 }, { "epoch": 0.10989010989010989, "grad_norm": 77.22019352282102, "learning_rate": 1.8292682926829268e-07, "logits/chosen": -2.7690446376800537, "logits/rejected": -2.667555332183838, "logps/chosen": -219.1191864013672, "logps/rejected": -202.34437561035156, "loss": 0.6804, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.03057698905467987, "rewards/margins": 0.01921568438410759, "rewards/rejected": 0.011361300945281982, "step": 210 }, { "epoch": 0.1151229722658294, "grad_norm": 71.09147529011994, "learning_rate": 1.916376306620209e-07, "logits/chosen": -2.8125951290130615, "logits/rejected": -2.7477786540985107, "logps/chosen": -289.70404052734375, "logps/rejected": -251.4872283935547, "loss": 0.6673, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.08494623005390167, "rewards/margins": 0.10478035360574722, "rewards/rejected": -0.01983409747481346, "step": 220 }, { "epoch": 0.12035583464154893, "grad_norm": 70.50795570252929, "learning_rate": 2.003484320557491e-07, "logits/chosen": -2.8188929557800293, "logits/rejected": -2.805239200592041, "logps/chosen": -253.07046508789062, "logps/rejected": -333.8083190917969, "loss": 0.6506, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.052991289645433426, "rewards/margins": 0.08006981760263443, "rewards/rejected": -0.027078529819846153, "step": 230 }, { "epoch": 0.12558869701726844, "grad_norm": 76.4931381931025, "learning_rate": 2.0905923344947734e-07, "logits/chosen": -2.7703301906585693, "logits/rejected": -2.6502628326416016, "logps/chosen": -238.9730682373047, "logps/rejected": -223.41329956054688, "loss": 0.6559, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.011081100441515446, "rewards/margins": 0.06371158361434937, "rewards/rejected": -0.05263049155473709, "step": 240 }, { "epoch": 0.13082155939298795, "grad_norm": 73.95230061923809, "learning_rate": 2.1777003484320556e-07, "logits/chosen": -2.6704938411712646, "logits/rejected": -2.6541714668273926, "logps/chosen": -298.54742431640625, "logps/rejected": -254.73251342773438, "loss": 0.6453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.07010379433631897, "rewards/margins": 0.1818067580461502, "rewards/rejected": -0.11170294135808945, "step": 250 }, { "epoch": 0.1360544217687075, "grad_norm": 83.13952056039882, "learning_rate": 2.264808362369338e-07, "logits/chosen": -2.8034861087799072, "logits/rejected": -2.700900077819824, "logps/chosen": -300.6609802246094, "logps/rejected": -277.47882080078125, "loss": 0.648, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.08231760561466217, "rewards/margins": 0.169277623295784, "rewards/rejected": -0.08696001768112183, "step": 260 }, { "epoch": 0.141287284144427, "grad_norm": 61.076305133734294, "learning_rate": 2.3519163763066202e-07, "logits/chosen": -2.745058536529541, "logits/rejected": -2.6726226806640625, "logps/chosen": -280.98431396484375, "logps/rejected": -280.2472839355469, "loss": 0.6259, "rewards/accuracies": 0.625, "rewards/chosen": -0.012805774807929993, "rewards/margins": 0.050133805721998215, "rewards/rejected": -0.06293957680463791, "step": 270 }, { "epoch": 0.14652014652014653, "grad_norm": 79.19192659206922, "learning_rate": 2.439024390243902e-07, "logits/chosen": -2.6895556449890137, "logits/rejected": -2.7086892127990723, "logps/chosen": -311.8687744140625, "logps/rejected": -293.9984130859375, "loss": 0.6612, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05314619094133377, "rewards/margins": 0.11733772605657578, "rewards/rejected": -0.064191535115242, "step": 280 }, { "epoch": 0.15175300889586604, "grad_norm": 67.77150674233079, "learning_rate": 2.526132404181184e-07, "logits/chosen": -2.8477418422698975, "logits/rejected": -2.6748757362365723, "logps/chosen": -333.6998291015625, "logps/rejected": -245.5762176513672, "loss": 0.6423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04657986760139465, "rewards/margins": 0.14008301496505737, "rewards/rejected": -0.09350315481424332, "step": 290 }, { "epoch": 0.15698587127158556, "grad_norm": 73.86139382185733, "learning_rate": 2.613240418118467e-07, "logits/chosen": -2.8373353481292725, "logits/rejected": -2.7571821212768555, "logps/chosen": -321.17181396484375, "logps/rejected": -259.47735595703125, "loss": 0.6403, "rewards/accuracies": 0.625, "rewards/chosen": -0.055416040122509, "rewards/margins": 0.08808239549398422, "rewards/rejected": -0.14349845051765442, "step": 300 }, { "epoch": 0.16221873364730507, "grad_norm": 65.10105933983293, "learning_rate": 2.700348432055749e-07, "logits/chosen": -2.8734145164489746, "logits/rejected": -2.725494384765625, "logps/chosen": -328.14971923828125, "logps/rejected": -306.2589416503906, "loss": 0.6475, "rewards/accuracies": 0.625, "rewards/chosen": 0.12547266483306885, "rewards/margins": 0.19133004546165466, "rewards/rejected": -0.06585738807916641, "step": 310 }, { "epoch": 0.1674515960230246, "grad_norm": 78.08733800783429, "learning_rate": 2.7874564459930313e-07, "logits/chosen": -2.8610997200012207, "logits/rejected": -2.632071018218994, "logps/chosen": -340.14752197265625, "logps/rejected": -229.4875946044922, "loss": 0.6443, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007843856699764729, "rewards/margins": 0.18785050511360168, "rewards/rejected": -0.19569435715675354, "step": 320 }, { "epoch": 0.1726844583987441, "grad_norm": 76.52379245557374, "learning_rate": 2.874564459930314e-07, "logits/chosen": -2.680572032928467, "logits/rejected": -2.4787521362304688, "logps/chosen": -258.25518798828125, "logps/rejected": -200.94171142578125, "loss": 0.6432, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.007691374514251947, "rewards/margins": 0.15264154970645905, "rewards/rejected": -0.14495018124580383, "step": 330 }, { "epoch": 0.17791732077446362, "grad_norm": 81.16170652139007, "learning_rate": 2.961672473867596e-07, "logits/chosen": -2.794625997543335, "logits/rejected": -2.813786268234253, "logps/chosen": -285.979736328125, "logps/rejected": -308.09039306640625, "loss": 0.6168, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.07205596566200256, "rewards/margins": 0.21761822700500488, "rewards/rejected": -0.14556226134300232, "step": 340 }, { "epoch": 0.18315018315018314, "grad_norm": 72.97655641270019, "learning_rate": 3.048780487804878e-07, "logits/chosen": -2.724668025970459, "logits/rejected": -2.8093247413635254, "logps/chosen": -191.9694061279297, "logps/rejected": -229.0023193359375, "loss": 0.6085, "rewards/accuracies": 0.75, "rewards/chosen": 0.04489254206418991, "rewards/margins": 0.38289427757263184, "rewards/rejected": -0.3380017578601837, "step": 350 }, { "epoch": 0.18838304552590268, "grad_norm": 67.08610224330708, "learning_rate": 3.13588850174216e-07, "logits/chosen": -2.804530382156372, "logits/rejected": -2.7216782569885254, "logps/chosen": -271.109619140625, "logps/rejected": -278.29595947265625, "loss": 0.6135, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03914060443639755, "rewards/margins": 0.2538668215274811, "rewards/rejected": -0.21472623944282532, "step": 360 }, { "epoch": 0.1936159079016222, "grad_norm": 68.81979171610475, "learning_rate": 3.2229965156794425e-07, "logits/chosen": -2.731799602508545, "logits/rejected": -2.6366994380950928, "logps/chosen": -303.15216064453125, "logps/rejected": -274.8804016113281, "loss": 0.6031, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.013546006754040718, "rewards/margins": 0.3800819516181946, "rewards/rejected": -0.39362794160842896, "step": 370 }, { "epoch": 0.1988487702773417, "grad_norm": 70.10460080093853, "learning_rate": 3.3101045296167245e-07, "logits/chosen": -2.6483497619628906, "logits/rejected": -2.7345848083496094, "logps/chosen": -207.73190307617188, "logps/rejected": -335.92767333984375, "loss": 0.6155, "rewards/accuracies": 0.75, "rewards/chosen": 0.03089023567736149, "rewards/margins": 0.2889792323112488, "rewards/rejected": -0.25808900594711304, "step": 380 }, { "epoch": 0.20408163265306123, "grad_norm": 71.01223451909964, "learning_rate": 3.3972125435540065e-07, "logits/chosen": -2.8010830879211426, "logits/rejected": -2.641580104827881, "logps/chosen": -393.96368408203125, "logps/rejected": -272.9259033203125, "loss": 0.616, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.14445623755455017, "rewards/margins": 0.19453056156635284, "rewards/rejected": -0.05007433146238327, "step": 390 }, { "epoch": 0.20931449502878074, "grad_norm": 66.6694321514813, "learning_rate": 3.484320557491289e-07, "logits/chosen": -2.8600525856018066, "logits/rejected": -2.6983630657196045, "logps/chosen": -301.33587646484375, "logps/rejected": -254.32473754882812, "loss": 0.6109, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.061463791877031326, "rewards/margins": 0.3578440845012665, "rewards/rejected": -0.29638028144836426, "step": 400 }, { "epoch": 0.21454735740450026, "grad_norm": 69.21792732884464, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -2.744187831878662, "logits/rejected": -2.722655773162842, "logps/chosen": -285.7259826660156, "logps/rejected": -321.0186462402344, "loss": 0.637, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.012902788817882538, "rewards/margins": 0.26276642084121704, "rewards/rejected": -0.2756691873073578, "step": 410 }, { "epoch": 0.21978021978021978, "grad_norm": 70.14765627243237, "learning_rate": 3.6585365853658536e-07, "logits/chosen": -2.809702157974243, "logits/rejected": -2.777172088623047, "logps/chosen": -287.27740478515625, "logps/rejected": -300.46856689453125, "loss": 0.6015, "rewards/accuracies": 0.5, "rewards/chosen": -0.104688361287117, "rewards/margins": 0.07888265699148178, "rewards/rejected": -0.18357104063034058, "step": 420 }, { "epoch": 0.2250130821559393, "grad_norm": 77.0206106620458, "learning_rate": 3.7456445993031356e-07, "logits/chosen": -2.9213788509368896, "logits/rejected": -2.7455177307128906, "logps/chosen": -320.3589172363281, "logps/rejected": -255.3843536376953, "loss": 0.6119, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.08016510307788849, "rewards/margins": 0.522097647190094, "rewards/rejected": -0.4419324994087219, "step": 430 }, { "epoch": 0.2302459445316588, "grad_norm": 83.29817536015507, "learning_rate": 3.832752613240418e-07, "logits/chosen": -2.768024444580078, "logits/rejected": -2.7121284008026123, "logps/chosen": -279.93682861328125, "logps/rejected": -253.1497344970703, "loss": 0.6162, "rewards/accuracies": 0.625, "rewards/chosen": -0.15743741393089294, "rewards/margins": 0.15811249613761902, "rewards/rejected": -0.31554991006851196, "step": 440 }, { "epoch": 0.23547880690737832, "grad_norm": 74.0680805972226, "learning_rate": 3.9198606271777e-07, "logits/chosen": -2.662078380584717, "logits/rejected": -2.679370641708374, "logps/chosen": -240.2390899658203, "logps/rejected": -248.5304718017578, "loss": 0.5777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.15076106786727905, "rewards/margins": 0.22496457397937775, "rewards/rejected": -0.375725656747818, "step": 450 }, { "epoch": 0.24071166928309787, "grad_norm": 68.31121974793525, "learning_rate": 4.006968641114982e-07, "logits/chosen": -2.7916362285614014, "logits/rejected": -2.662065029144287, "logps/chosen": -295.880859375, "logps/rejected": -268.3017578125, "loss": 0.605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.04480681195855141, "rewards/margins": 0.40198850631713867, "rewards/rejected": -0.35718169808387756, "step": 460 }, { "epoch": 0.24594453165881738, "grad_norm": 94.24984253858831, "learning_rate": 4.0940766550522647e-07, "logits/chosen": -2.7813093662261963, "logits/rejected": -2.7455015182495117, "logps/chosen": -293.1402587890625, "logps/rejected": -295.59295654296875, "loss": 0.5783, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1348876804113388, "rewards/margins": 0.3201070725917816, "rewards/rejected": -0.4549947679042816, "step": 470 }, { "epoch": 0.25117739403453687, "grad_norm": 79.26291299471244, "learning_rate": 4.1811846689895467e-07, "logits/chosen": -2.763723850250244, "logits/rejected": -2.769278049468994, "logps/chosen": -247.20993041992188, "logps/rejected": -239.54849243164062, "loss": 0.6004, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.041330404579639435, "rewards/margins": 0.4440532624721527, "rewards/rejected": -0.40272289514541626, "step": 480 }, { "epoch": 0.2564102564102564, "grad_norm": 65.08505114883965, "learning_rate": 4.268292682926829e-07, "logits/chosen": -2.7704079151153564, "logits/rejected": -2.7451767921447754, "logps/chosen": -300.32598876953125, "logps/rejected": -264.40692138671875, "loss": 0.576, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03305836394429207, "rewards/margins": 0.30561283230781555, "rewards/rejected": -0.2725544571876526, "step": 490 }, { "epoch": 0.2616431187859759, "grad_norm": 80.1379910637347, "learning_rate": 4.3554006968641113e-07, "logits/chosen": -2.7437386512756348, "logits/rejected": -2.746515989303589, "logps/chosen": -264.9486389160156, "logps/rejected": -279.97015380859375, "loss": 0.5657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.012850580736994743, "rewards/margins": 0.6681180000305176, "rewards/rejected": -0.6552674770355225, "step": 500 }, { "epoch": 0.2668759811616955, "grad_norm": 75.29078775030078, "learning_rate": 4.442508710801394e-07, "logits/chosen": -2.8460960388183594, "logits/rejected": -2.6950936317443848, "logps/chosen": -293.77777099609375, "logps/rejected": -257.11553955078125, "loss": 0.5891, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003244167659431696, "rewards/margins": 0.35232454538345337, "rewards/rejected": -0.3490803837776184, "step": 510 }, { "epoch": 0.272108843537415, "grad_norm": 73.71242826599446, "learning_rate": 4.529616724738676e-07, "logits/chosen": -2.799469232559204, "logits/rejected": -2.779831647872925, "logps/chosen": -274.6687927246094, "logps/rejected": -294.1000671386719, "loss": 0.5846, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.15885034203529358, "rewards/margins": 0.26622113585472107, "rewards/rejected": -0.4250714182853699, "step": 520 }, { "epoch": 0.2773417059131345, "grad_norm": 67.00484336857802, "learning_rate": 4.616724738675958e-07, "logits/chosen": -2.762690305709839, "logits/rejected": -2.7298333644866943, "logps/chosen": -349.2732849121094, "logps/rejected": -257.4483947753906, "loss": 0.5752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.08133769035339355, "rewards/margins": 0.6366723775863647, "rewards/rejected": -0.5553346872329712, "step": 530 }, { "epoch": 0.282574568288854, "grad_norm": 87.37301032099326, "learning_rate": 4.7038327526132404e-07, "logits/chosen": -2.734611988067627, "logits/rejected": -2.682957887649536, "logps/chosen": -300.8519592285156, "logps/rejected": -287.28131103515625, "loss": 0.5654, "rewards/accuracies": 0.625, "rewards/chosen": -0.16697955131530762, "rewards/margins": 0.40015920996665955, "rewards/rejected": -0.5671387910842896, "step": 540 }, { "epoch": 0.28780743066457354, "grad_norm": 75.87340132424606, "learning_rate": 4.790940766550523e-07, "logits/chosen": -2.881843328475952, "logits/rejected": -2.744377374649048, "logps/chosen": -343.0857238769531, "logps/rejected": -284.81341552734375, "loss": 0.623, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0778597965836525, "rewards/margins": 0.3903868496417999, "rewards/rejected": -0.4682466387748718, "step": 550 }, { "epoch": 0.29304029304029305, "grad_norm": 64.37139615879414, "learning_rate": 4.878048780487804e-07, "logits/chosen": -2.8723576068878174, "logits/rejected": -2.8006961345672607, "logps/chosen": -295.2679748535156, "logps/rejected": -279.12310791015625, "loss": 0.5644, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01374953705817461, "rewards/margins": 0.5936576128005981, "rewards/rejected": -0.579908013343811, "step": 560 }, { "epoch": 0.29827315541601257, "grad_norm": 76.23718102747013, "learning_rate": 4.965156794425087e-07, "logits/chosen": -2.9436142444610596, "logits/rejected": -2.799950361251831, "logps/chosen": -290.10546875, "logps/rejected": -265.4535217285156, "loss": 0.5782, "rewards/accuracies": 0.75, "rewards/chosen": -0.015673398971557617, "rewards/margins": 0.5041199922561646, "rewards/rejected": -0.5197933912277222, "step": 570 }, { "epoch": 0.3035060177917321, "grad_norm": 64.81448819369754, "learning_rate": 4.999983312905696e-07, "logits/chosen": -2.8923089504241943, "logits/rejected": -2.7781293392181396, "logps/chosen": -324.54986572265625, "logps/rejected": -225.8867645263672, "loss": 0.582, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1136259064078331, "rewards/margins": 0.435842901468277, "rewards/rejected": -0.5494688153266907, "step": 580 }, { "epoch": 0.3087388801674516, "grad_norm": 64.62422308445984, "learning_rate": 4.999881337025014e-07, "logits/chosen": -2.710305690765381, "logits/rejected": -2.7066807746887207, "logps/chosen": -223.7068328857422, "logps/rejected": -228.61758422851562, "loss": 0.5895, "rewards/accuracies": 0.625, "rewards/chosen": -0.13477621972560883, "rewards/margins": 0.29476475715637207, "rewards/rejected": -0.4295410215854645, "step": 590 }, { "epoch": 0.3139717425431711, "grad_norm": 69.1793063849544, "learning_rate": 4.999686659648518e-07, "logits/chosen": -2.7720563411712646, "logits/rejected": -2.7554426193237305, "logps/chosen": -291.6083679199219, "logps/rejected": -280.64044189453125, "loss": 0.6149, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.017333343625068665, "rewards/margins": 0.35141366720199585, "rewards/rejected": -0.3340803384780884, "step": 600 }, { "epoch": 0.31920460491889063, "grad_norm": 71.16580461514887, "learning_rate": 4.999399287995302e-07, "logits/chosen": -2.8262219429016113, "logits/rejected": -2.720231533050537, "logps/chosen": -211.25503540039062, "logps/rejected": -231.1370086669922, "loss": 0.5364, "rewards/accuracies": 0.75, "rewards/chosen": -0.009411575272679329, "rewards/margins": 0.47361668944358826, "rewards/rejected": -0.48302823305130005, "step": 610 }, { "epoch": 0.32443746729461015, "grad_norm": 69.94030395472988, "learning_rate": 4.999019232721791e-07, "logits/chosen": -2.93601655960083, "logits/rejected": -2.685936450958252, "logps/chosen": -368.6300354003906, "logps/rejected": -219.16061401367188, "loss": 0.5753, "rewards/accuracies": 0.75, "rewards/chosen": 0.16614532470703125, "rewards/margins": 0.7339236736297607, "rewards/rejected": -0.5677784085273743, "step": 620 }, { "epoch": 0.32967032967032966, "grad_norm": 59.26941308002417, "learning_rate": 4.998546507921325e-07, "logits/chosen": -2.7378721237182617, "logits/rejected": -2.7604966163635254, "logps/chosen": -233.4509735107422, "logps/rejected": -278.7649841308594, "loss": 0.6109, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.15916521847248077, "rewards/margins": 0.500957190990448, "rewards/rejected": -0.6601223945617676, "step": 630 }, { "epoch": 0.3349031920460492, "grad_norm": 71.51739828519624, "learning_rate": 4.997981131123656e-07, "logits/chosen": -2.8753511905670166, "logits/rejected": -2.7751049995422363, "logps/chosen": -296.11126708984375, "logps/rejected": -306.3322448730469, "loss": 0.5664, "rewards/accuracies": 0.75, "rewards/chosen": 0.11808276176452637, "rewards/margins": 0.7307382822036743, "rewards/rejected": -0.612655520439148, "step": 640 }, { "epoch": 0.3401360544217687, "grad_norm": 63.71373253795031, "learning_rate": 4.997323123294291e-07, "logits/chosen": -2.8338544368743896, "logits/rejected": -2.7727811336517334, "logps/chosen": -275.8819274902344, "logps/rejected": -252.6679229736328, "loss": 0.5886, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.004488348960876465, "rewards/margins": 0.5318983197212219, "rewards/rejected": -0.5274099707603455, "step": 650 }, { "epoch": 0.3453689167974882, "grad_norm": 68.36944996832233, "learning_rate": 4.99657250883371e-07, "logits/chosen": -2.7883529663085938, "logits/rejected": -2.727940797805786, "logps/chosen": -244.5763702392578, "logps/rejected": -236.46859741210938, "loss": 0.5638, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09980012476444244, "rewards/margins": 0.3573591113090515, "rewards/rejected": -0.4571591913700104, "step": 660 }, { "epoch": 0.35060177917320773, "grad_norm": 83.50524399420453, "learning_rate": 4.995729315576468e-07, "logits/chosen": -2.6909804344177246, "logits/rejected": -2.654859781265259, "logps/chosen": -270.6436462402344, "logps/rejected": -242.53622436523438, "loss": 0.5697, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08693110942840576, "rewards/margins": 0.42284971475601196, "rewards/rejected": -0.5097808241844177, "step": 670 }, { "epoch": 0.35583464154892724, "grad_norm": 56.509818255549085, "learning_rate": 4.99479357479016e-07, "logits/chosen": -2.649606943130493, "logits/rejected": -2.6005656719207764, "logps/chosen": -251.02987670898438, "logps/rejected": -230.5948486328125, "loss": 0.5576, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2845116853713989, "rewards/margins": 0.42809027433395386, "rewards/rejected": -0.7126020193099976, "step": 680 }, { "epoch": 0.36106750392464676, "grad_norm": 63.6023380966652, "learning_rate": 4.993765321174261e-07, "logits/chosen": -2.8176732063293457, "logits/rejected": -2.7134623527526855, "logps/chosen": -247.86801147460938, "logps/rejected": -237.22915649414062, "loss": 0.5227, "rewards/accuracies": 0.75, "rewards/chosen": 0.0951828807592392, "rewards/margins": 0.6302462816238403, "rewards/rejected": -0.5350633859634399, "step": 690 }, { "epoch": 0.3663003663003663, "grad_norm": 72.72474978480933, "learning_rate": 4.992644592858842e-07, "logits/chosen": -2.7131919860839844, "logits/rejected": -2.6791841983795166, "logps/chosen": -265.3692932128906, "logps/rejected": -248.10281372070312, "loss": 0.5801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20485153794288635, "rewards/margins": 0.4856719374656677, "rewards/rejected": -0.6905235052108765, "step": 700 }, { "epoch": 0.3715332286760858, "grad_norm": 77.60639105406315, "learning_rate": 4.991431431403148e-07, "logits/chosen": -2.802262783050537, "logits/rejected": -2.6851892471313477, "logps/chosen": -340.07659912109375, "logps/rejected": -285.58380126953125, "loss": 0.5271, "rewards/accuracies": 0.75, "rewards/chosen": -0.07976872473955154, "rewards/margins": 0.7483940720558167, "rewards/rejected": -0.8281628489494324, "step": 710 }, { "epoch": 0.37676609105180536, "grad_norm": 64.78758225968055, "learning_rate": 4.99012588179407e-07, "logits/chosen": -2.6652474403381348, "logits/rejected": -2.7214195728302, "logps/chosen": -220.8406219482422, "logps/rejected": -237.08779907226562, "loss": 0.5494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.024497246369719505, "rewards/margins": 0.5303259491920471, "rewards/rejected": -0.5548231601715088, "step": 720 }, { "epoch": 0.3819989534275249, "grad_norm": 65.34881911690515, "learning_rate": 4.988727992444467e-07, "logits/chosen": -2.7291436195373535, "logits/rejected": -2.7527527809143066, "logps/chosen": -272.0304870605469, "logps/rejected": -307.57769775390625, "loss": 0.5785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.24241900444030762, "rewards/margins": 0.6918538808822632, "rewards/rejected": -0.9342729449272156, "step": 730 }, { "epoch": 0.3872318158032444, "grad_norm": 68.99686104085876, "learning_rate": 4.98723781519137e-07, "logits/chosen": -2.7245595455169678, "logits/rejected": -2.7607216835021973, "logps/chosen": -246.9656524658203, "logps/rejected": -233.1698455810547, "loss": 0.5419, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11478378623723984, "rewards/margins": 0.5238386392593384, "rewards/rejected": -0.6386224031448364, "step": 740 }, { "epoch": 0.3924646781789639, "grad_norm": 73.8726217450531, "learning_rate": 4.98565540529407e-07, "logits/chosen": -2.645646572113037, "logits/rejected": -2.5875344276428223, "logps/chosen": -288.4378356933594, "logps/rejected": -307.13775634765625, "loss": 0.5328, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03566960245370865, "rewards/margins": 0.5397855639457703, "rewards/rejected": -0.5041159987449646, "step": 750 }, { "epoch": 0.3976975405546834, "grad_norm": 70.42082071557074, "learning_rate": 4.983980821432054e-07, "logits/chosen": -2.692359447479248, "logits/rejected": -2.6449592113494873, "logps/chosen": -238.6630859375, "logps/rejected": -204.46713256835938, "loss": 0.5833, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.18362995982170105, "rewards/margins": 0.5266052484512329, "rewards/rejected": -0.7102352380752563, "step": 760 }, { "epoch": 0.40293040293040294, "grad_norm": 59.90591958586741, "learning_rate": 4.982214125702845e-07, "logits/chosen": -2.688281297683716, "logits/rejected": -2.6551880836486816, "logps/chosen": -249.35751342773438, "logps/rejected": -288.7128601074219, "loss": 0.603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2397182285785675, "rewards/margins": 0.5740364789962769, "rewards/rejected": -0.8137545585632324, "step": 770 }, { "epoch": 0.40816326530612246, "grad_norm": 85.68097562700152, "learning_rate": 4.980355383619684e-07, "logits/chosen": -2.683443069458008, "logits/rejected": -2.6418633460998535, "logps/chosen": -244.8722381591797, "logps/rejected": -213.88235473632812, "loss": 0.5469, "rewards/accuracies": 0.75, "rewards/chosen": -0.2509283721446991, "rewards/margins": 0.6306599974632263, "rewards/rejected": -0.8815882802009583, "step": 780 }, { "epoch": 0.413396127681842, "grad_norm": 70.10204209462056, "learning_rate": 4.978404664109113e-07, "logits/chosen": -2.6783461570739746, "logits/rejected": -2.700972080230713, "logps/chosen": -231.48727416992188, "logps/rejected": -295.9053649902344, "loss": 0.5492, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4138612747192383, "rewards/margins": 0.3425648808479309, "rewards/rejected": -0.7564261555671692, "step": 790 }, { "epoch": 0.4186289900575615, "grad_norm": 81.25028374863271, "learning_rate": 4.97636203950841e-07, "logits/chosen": -2.6948161125183105, "logits/rejected": -2.6822495460510254, "logps/chosen": -304.7908630371094, "logps/rejected": -309.82183837890625, "loss": 0.6007, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.19240470230579376, "rewards/margins": 0.5690978765487671, "rewards/rejected": -0.7615026235580444, "step": 800 }, { "epoch": 0.423861852433281, "grad_norm": 71.71281427481196, "learning_rate": 4.974227585562916e-07, "logits/chosen": -2.68057918548584, "logits/rejected": -2.581986427307129, "logps/chosen": -313.5508117675781, "logps/rejected": -278.8346862792969, "loss": 0.5493, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.42161816358566284, "rewards/margins": 0.6813873648643494, "rewards/rejected": -1.1030056476593018, "step": 810 }, { "epoch": 0.4290947148090005, "grad_norm": 54.56461253021572, "learning_rate": 4.972001381423214e-07, "logits/chosen": -2.74100399017334, "logits/rejected": -2.6623387336730957, "logps/chosen": -292.7416076660156, "logps/rejected": -246.9758758544922, "loss": 0.5516, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.38912060856819153, "rewards/margins": 0.729678213596344, "rewards/rejected": -1.118798851966858, "step": 820 }, { "epoch": 0.43432757718472004, "grad_norm": 76.67534708024763, "learning_rate": 4.969683509642206e-07, "logits/chosen": -2.8405652046203613, "logits/rejected": -2.7737975120544434, "logps/chosen": -243.72720336914062, "logps/rejected": -246.10107421875, "loss": 0.6121, "rewards/accuracies": 0.75, "rewards/chosen": -0.42866888642311096, "rewards/margins": 0.4650656282901764, "rewards/rejected": -0.8937345743179321, "step": 830 }, { "epoch": 0.43956043956043955, "grad_norm": 64.79089069454054, "learning_rate": 4.967274056172044e-07, "logits/chosen": -2.8649442195892334, "logits/rejected": -2.603626251220703, "logps/chosen": -413.15191650390625, "logps/rejected": -301.858642578125, "loss": 0.5475, "rewards/accuracies": 0.75, "rewards/chosen": -0.2905053496360779, "rewards/margins": 0.8141521215438843, "rewards/rejected": -1.1046574115753174, "step": 840 }, { "epoch": 0.44479330193615907, "grad_norm": 82.06458260690994, "learning_rate": 4.964773110360944e-07, "logits/chosen": -2.750856876373291, "logits/rejected": -2.5980868339538574, "logps/chosen": -262.92462158203125, "logps/rejected": -228.48593139648438, "loss": 0.5918, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.44581151008605957, "rewards/margins": 0.5122817754745483, "rewards/rejected": -0.9580932855606079, "step": 850 }, { "epoch": 0.4500261643118786, "grad_norm": 89.51335715472315, "learning_rate": 4.962180764949876e-07, "logits/chosen": -2.764763355255127, "logits/rejected": -2.7525763511657715, "logps/chosen": -194.84877014160156, "logps/rejected": -269.7641296386719, "loss": 0.562, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1448897421360016, "rewards/margins": 0.8955299258232117, "rewards/rejected": -1.040419578552246, "step": 860 }, { "epoch": 0.4552590266875981, "grad_norm": 58.74124593162302, "learning_rate": 4.959497116069122e-07, "logits/chosen": -2.4614791870117188, "logits/rejected": -2.5169665813446045, "logps/chosen": -229.3387908935547, "logps/rejected": -241.858154296875, "loss": 0.5677, "rewards/accuracies": 0.75, "rewards/chosen": -0.39239010214805603, "rewards/margins": 0.721211314201355, "rewards/rejected": -1.1136014461517334, "step": 870 }, { "epoch": 0.4604918890633176, "grad_norm": 77.15915949137676, "learning_rate": 4.956722263234711e-07, "logits/chosen": -2.7549219131469727, "logits/rejected": -2.714980363845825, "logps/chosen": -282.6243896484375, "logps/rejected": -263.4341735839844, "loss": 0.5399, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.48048511147499084, "rewards/margins": 0.4717450737953186, "rewards/rejected": -0.9522300958633423, "step": 880 }, { "epoch": 0.46572475143903713, "grad_norm": 73.9394228989263, "learning_rate": 4.95385630934473e-07, "logits/chosen": -2.8114538192749023, "logits/rejected": -2.7364768981933594, "logps/chosen": -319.81170654296875, "logps/rejected": -247.07186889648438, "loss": 0.5491, "rewards/accuracies": 0.75, "rewards/chosen": -0.2333296239376068, "rewards/margins": 0.6494348645210266, "rewards/rejected": -0.8827645182609558, "step": 890 }, { "epoch": 0.47095761381475665, "grad_norm": 96.7409696372864, "learning_rate": 4.950899360675511e-07, "logits/chosen": -2.672738552093506, "logits/rejected": -2.633599042892456, "logps/chosen": -256.09771728515625, "logps/rejected": -304.5975646972656, "loss": 0.5862, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6127587556838989, "rewards/margins": 0.8507426977157593, "rewards/rejected": -1.4635014533996582, "step": 900 }, { "epoch": 0.47619047619047616, "grad_norm": 86.8453977856647, "learning_rate": 4.947851526877681e-07, "logits/chosen": -2.6899325847625732, "logits/rejected": -2.6272430419921875, "logps/chosen": -185.05880737304688, "logps/rejected": -197.94110107421875, "loss": 0.5669, "rewards/accuracies": 0.75, "rewards/chosen": -0.34028100967407227, "rewards/margins": 0.9680425524711609, "rewards/rejected": -1.308323621749878, "step": 910 }, { "epoch": 0.48142333856619574, "grad_norm": 70.8964866934123, "learning_rate": 4.944712920972108e-07, "logits/chosen": -2.8275413513183594, "logits/rejected": -2.68281888961792, "logps/chosen": -314.7341613769531, "logps/rejected": -263.71209716796875, "loss": 0.5696, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4585978090763092, "rewards/margins": 0.6378489136695862, "rewards/rejected": -1.0964467525482178, "step": 920 }, { "epoch": 0.48665620094191525, "grad_norm": 77.19210699477038, "learning_rate": 4.9414836593457e-07, "logits/chosen": -2.6968119144439697, "logits/rejected": -2.6806297302246094, "logps/chosen": -274.0387268066406, "logps/rejected": -273.5841979980469, "loss": 0.5291, "rewards/accuracies": 0.5, "rewards/chosen": -0.5575527548789978, "rewards/margins": 0.4331057071685791, "rewards/rejected": -0.9906584620475769, "step": 930 }, { "epoch": 0.49188906331763477, "grad_norm": 63.91924634228564, "learning_rate": 4.938163861747094e-07, "logits/chosen": -2.7842488288879395, "logits/rejected": -2.6924023628234863, "logps/chosen": -305.1507568359375, "logps/rejected": -256.9998779296875, "loss": 0.5008, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4544626772403717, "rewards/margins": 0.8790037035942078, "rewards/rejected": -1.3334662914276123, "step": 940 }, { "epoch": 0.4971219256933543, "grad_norm": 74.90193395092072, "learning_rate": 4.934753651282215e-07, "logits/chosen": -2.7118842601776123, "logits/rejected": -2.5842690467834473, "logps/chosen": -295.0773620605469, "logps/rejected": -286.43194580078125, "loss": 0.5441, "rewards/accuracies": 0.75, "rewards/chosen": -0.3371061384677887, "rewards/margins": 0.9876976013183594, "rewards/rejected": -1.3248035907745361, "step": 950 }, { "epoch": 0.5023547880690737, "grad_norm": 68.4374675451541, "learning_rate": 4.93125315440971e-07, "logits/chosen": -2.7849133014678955, "logits/rejected": -2.670518159866333, "logps/chosen": -293.0834045410156, "logps/rejected": -283.2849426269531, "loss": 0.5556, "rewards/accuracies": 0.875, "rewards/chosen": -0.5373663902282715, "rewards/margins": 0.6889173984527588, "rewards/rejected": -1.2262837886810303, "step": 960 }, { "epoch": 0.5075876504447933, "grad_norm": 71.79203747934943, "learning_rate": 4.92766250093626e-07, "logits/chosen": -2.679919958114624, "logits/rejected": -2.5925917625427246, "logps/chosen": -302.0339050292969, "logps/rejected": -267.0357360839844, "loss": 0.5464, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4768869876861572, "rewards/margins": 1.1046160459518433, "rewards/rejected": -1.581502914428711, "step": 970 }, { "epoch": 0.5128205128205128, "grad_norm": 46.31017463847887, "learning_rate": 4.92398182401176e-07, "logits/chosen": -2.788491725921631, "logits/rejected": -2.5828099250793457, "logps/chosen": -315.658203125, "logps/rejected": -256.7175598144531, "loss": 0.5354, "rewards/accuracies": 0.875, "rewards/chosen": -0.16082218289375305, "rewards/margins": 1.0569546222686768, "rewards/rejected": -1.2177767753601074, "step": 980 }, { "epoch": 0.5180533751962323, "grad_norm": 59.880098009341154, "learning_rate": 4.920211260124395e-07, "logits/chosen": -2.675851583480835, "logits/rejected": -2.5794055461883545, "logps/chosen": -254.67105102539062, "logps/rejected": -239.4071044921875, "loss": 0.5319, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.3605723977088928, "rewards/margins": 0.8557994961738586, "rewards/rejected": -1.216371774673462, "step": 990 }, { "epoch": 0.5232862375719518, "grad_norm": 72.44078518479554, "learning_rate": 4.916350949095566e-07, "logits/chosen": -2.706974506378174, "logits/rejected": -2.6369376182556152, "logps/chosen": -247.0148468017578, "logps/rejected": -239.7918243408203, "loss": 0.557, "rewards/accuracies": 0.75, "rewards/chosen": -0.6575168371200562, "rewards/margins": 0.6040263772010803, "rewards/rejected": -1.2615431547164917, "step": 1000 }, { "epoch": 0.5285190999476713, "grad_norm": 65.88304223948026, "learning_rate": 4.912401034074708e-07, "logits/chosen": -2.6823890209198, "logits/rejected": -2.649326801300049, "logps/chosen": -243.98629760742188, "logps/rejected": -268.6001892089844, "loss": 0.5605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3754280209541321, "rewards/margins": 0.6592222452163696, "rewards/rejected": -1.034650206565857, "step": 1010 }, { "epoch": 0.533751962323391, "grad_norm": 67.67480154154286, "learning_rate": 4.908361661533989e-07, "logits/chosen": -2.740328788757324, "logits/rejected": -2.6959586143493652, "logps/chosen": -302.77130126953125, "logps/rejected": -265.9853210449219, "loss": 0.5287, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.13270536065101624, "rewards/margins": 1.0353169441223145, "rewards/rejected": -1.1680222749710083, "step": 1020 }, { "epoch": 0.5389848246991105, "grad_norm": 63.344533946977315, "learning_rate": 4.904232981262866e-07, "logits/chosen": -2.7135579586029053, "logits/rejected": -2.6516432762145996, "logps/chosen": -269.3987121582031, "logps/rejected": -233.5369110107422, "loss": 0.5625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6198391318321228, "rewards/margins": 0.37689751386642456, "rewards/rejected": -0.9967366456985474, "step": 1030 }, { "epoch": 0.54421768707483, "grad_norm": 67.80712674562271, "learning_rate": 4.900015146362544e-07, "logits/chosen": -2.7513439655303955, "logits/rejected": -2.8110382556915283, "logps/chosen": -259.6036071777344, "logps/rejected": -279.2042541503906, "loss": 0.547, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4451045095920563, "rewards/margins": 0.7799221277236938, "rewards/rejected": -1.2250266075134277, "step": 1040 }, { "epoch": 0.5494505494505495, "grad_norm": 84.53833487464527, "learning_rate": 4.895708313240285e-07, "logits/chosen": -2.8216004371643066, "logits/rejected": -2.696685314178467, "logps/chosen": -341.482421875, "logps/rejected": -309.2986755371094, "loss": 0.5597, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.31436505913734436, "rewards/margins": 0.9949013590812683, "rewards/rejected": -1.3092663288116455, "step": 1050 }, { "epoch": 0.554683411826269, "grad_norm": 71.06241027666316, "learning_rate": 4.891312641603623e-07, "logits/chosen": -2.6985182762145996, "logits/rejected": -2.704181432723999, "logps/chosen": -273.2723693847656, "logps/rejected": -284.52557373046875, "loss": 0.5384, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1811438351869583, "rewards/margins": 1.0559108257293701, "rewards/rejected": -1.2370548248291016, "step": 1060 }, { "epoch": 0.5599162742019885, "grad_norm": 60.42257436255781, "learning_rate": 4.886828294454426e-07, "logits/chosen": -2.7313995361328125, "logits/rejected": -2.717003107070923, "logps/chosen": -338.99810791015625, "logps/rejected": -283.36285400390625, "loss": 0.5669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2885645925998688, "rewards/margins": 0.6337456107139587, "rewards/rejected": -0.9223102331161499, "step": 1070 }, { "epoch": 0.565149136577708, "grad_norm": 65.79545292284378, "learning_rate": 4.882255438082863e-07, "logits/chosen": -2.7875468730926514, "logits/rejected": -2.7002549171447754, "logps/chosen": -242.84634399414062, "logps/rejected": -242.5367889404297, "loss": 0.5585, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3658864200115204, "rewards/margins": 0.8278558850288391, "rewards/rejected": -1.193742275238037, "step": 1080 }, { "epoch": 0.5703819989534276, "grad_norm": 107.1693968556409, "learning_rate": 4.877594242061233e-07, "logits/chosen": -2.7447891235351562, "logits/rejected": -2.5921199321746826, "logps/chosen": -318.3252258300781, "logps/rejected": -203.28018188476562, "loss": 0.595, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.646048903465271, "rewards/margins": 0.5135602951049805, "rewards/rejected": -1.1596091985702515, "step": 1090 }, { "epoch": 0.5756148613291471, "grad_norm": 62.65334429545523, "learning_rate": 4.87284487923768e-07, "logits/chosen": -2.6979050636291504, "logits/rejected": -2.6344146728515625, "logps/chosen": -273.78277587890625, "logps/rejected": -289.87689208984375, "loss": 0.52, "rewards/accuracies": 0.625, "rewards/chosen": -0.4887138307094574, "rewards/margins": 0.7608338594436646, "rewards/rejected": -1.2495477199554443, "step": 1100 }, { "epoch": 0.5808477237048666, "grad_norm": 75.4519203317871, "learning_rate": 4.868007525729775e-07, "logits/chosen": -2.473546028137207, "logits/rejected": -2.4677727222442627, "logps/chosen": -183.38070678710938, "logps/rejected": -213.664794921875, "loss": 0.589, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3946594297885895, "rewards/margins": 0.7618762254714966, "rewards/rejected": -1.1565356254577637, "step": 1110 }, { "epoch": 0.5860805860805861, "grad_norm": 77.31972728639262, "learning_rate": 4.863082360917998e-07, "logits/chosen": -2.6780097484588623, "logits/rejected": -2.617384433746338, "logps/chosen": -287.62469482421875, "logps/rejected": -266.36541748046875, "loss": 0.5183, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.42290663719177246, "rewards/margins": 0.6520038843154907, "rewards/rejected": -1.0749104022979736, "step": 1120 }, { "epoch": 0.5913134484563056, "grad_norm": 81.30630179926432, "learning_rate": 4.858069567439072e-07, "logits/chosen": -2.590245246887207, "logits/rejected": -2.565310478210449, "logps/chosen": -232.45999145507812, "logps/rejected": -280.054931640625, "loss": 0.566, "rewards/accuracies": 0.625, "rewards/chosen": -0.8007476925849915, "rewards/margins": 0.5347889065742493, "rewards/rejected": -1.3355367183685303, "step": 1130 }, { "epoch": 0.5965463108320251, "grad_norm": 68.70456072258055, "learning_rate": 4.852969331179206e-07, "logits/chosen": -2.8644909858703613, "logits/rejected": -2.809370756149292, "logps/chosen": -266.9808349609375, "logps/rejected": -286.6571350097656, "loss": 0.5083, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2700379490852356, "rewards/margins": 0.9382151365280151, "rewards/rejected": -1.2082529067993164, "step": 1140 }, { "epoch": 0.6017791732077447, "grad_norm": 87.24229108451696, "learning_rate": 4.847781841267185e-07, "logits/chosen": -2.828613042831421, "logits/rejected": -2.640158176422119, "logps/chosen": -279.5322570800781, "logps/rejected": -249.63223266601562, "loss": 0.5443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4706583023071289, "rewards/margins": 0.6991799473762512, "rewards/rejected": -1.169838309288025, "step": 1150 }, { "epoch": 0.6070120355834642, "grad_norm": 70.40297431881797, "learning_rate": 4.842507290067374e-07, "logits/chosen": -2.5668742656707764, "logits/rejected": -2.577543258666992, "logps/chosen": -230.92886352539062, "logps/rejected": -199.5034942626953, "loss": 0.5571, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5253152251243591, "rewards/margins": 0.41240495443344116, "rewards/rejected": -0.9377201795578003, "step": 1160 }, { "epoch": 0.6122448979591837, "grad_norm": 78.31414773218133, "learning_rate": 4.837145873172567e-07, "logits/chosen": -2.739893913269043, "logits/rejected": -2.6579720973968506, "logps/chosen": -284.1490173339844, "logps/rejected": -297.1120300292969, "loss": 0.563, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.15587486326694489, "rewards/margins": 1.329700231552124, "rewards/rejected": -1.4855751991271973, "step": 1170 }, { "epoch": 0.6174777603349032, "grad_norm": 53.670817774925865, "learning_rate": 4.83169778939675e-07, "logits/chosen": -2.7941668033599854, "logits/rejected": -2.727989673614502, "logps/chosen": -326.0960693359375, "logps/rejected": -282.9150085449219, "loss": 0.5015, "rewards/accuracies": 0.625, "rewards/chosen": -0.48152121901512146, "rewards/margins": 0.5162063837051392, "rewards/rejected": -0.9977277517318726, "step": 1180 }, { "epoch": 0.6227106227106227, "grad_norm": 70.60560817856916, "learning_rate": 4.826163240767716e-07, "logits/chosen": -2.8005542755126953, "logits/rejected": -2.71057391166687, "logps/chosen": -372.0180969238281, "logps/rejected": -289.7819519042969, "loss": 0.4971, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12531840801239014, "rewards/margins": 0.7546466588973999, "rewards/rejected": -0.8799650073051453, "step": 1190 }, { "epoch": 0.6279434850863422, "grad_norm": 58.79565369701411, "learning_rate": 4.820542432519584e-07, "logits/chosen": -2.5247480869293213, "logits/rejected": -2.4183754920959473, "logps/chosen": -315.38555908203125, "logps/rejected": -283.47479248046875, "loss": 0.5096, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6097077131271362, "rewards/margins": 0.6779664158821106, "rewards/rejected": -1.2876741886138916, "step": 1200 }, { "epoch": 0.6331763474620618, "grad_norm": 56.823488768307215, "learning_rate": 4.814835573085176e-07, "logits/chosen": -2.818470001220703, "logits/rejected": -2.7555344104766846, "logps/chosen": -304.1570129394531, "logps/rejected": -276.91168212890625, "loss": 0.5567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4183574616909027, "rewards/margins": 0.8381322026252747, "rewards/rejected": -1.256489634513855, "step": 1210 }, { "epoch": 0.6384092098377813, "grad_norm": 75.5878427723409, "learning_rate": 4.809042874088304e-07, "logits/chosen": -2.78570556640625, "logits/rejected": -2.745189666748047, "logps/chosen": -333.1932373046875, "logps/rejected": -298.15252685546875, "loss": 0.5416, "rewards/accuracies": 0.75, "rewards/chosen": -0.5144246220588684, "rewards/margins": 0.9846269488334656, "rewards/rejected": -1.4990516901016235, "step": 1220 }, { "epoch": 0.6436420722135008, "grad_norm": 67.0851501322478, "learning_rate": 4.803164550335905e-07, "logits/chosen": -2.6935131549835205, "logits/rejected": -2.560260534286499, "logps/chosen": -352.7795104980469, "logps/rejected": -254.9468231201172, "loss": 0.4895, "rewards/accuracies": 0.75, "rewards/chosen": -0.4828592836856842, "rewards/margins": 1.221268892288208, "rewards/rejected": -1.7041280269622803, "step": 1230 }, { "epoch": 0.6488749345892203, "grad_norm": 55.066837550240976, "learning_rate": 4.797200819810089e-07, "logits/chosen": -2.7344272136688232, "logits/rejected": -2.714303493499756, "logps/chosen": -251.71859741210938, "logps/rejected": -230.49465942382812, "loss": 0.5232, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6297019720077515, "rewards/margins": 0.7004765272140503, "rewards/rejected": -1.3301784992218018, "step": 1240 }, { "epoch": 0.6541077969649398, "grad_norm": 70.88810338766321, "learning_rate": 4.79115190366005e-07, "logits/chosen": -2.7695717811584473, "logits/rejected": -2.6756415367126465, "logps/chosen": -281.10321044921875, "logps/rejected": -303.2259216308594, "loss": 0.5633, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5369859337806702, "rewards/margins": 0.8963001370429993, "rewards/rejected": -1.4332859516143799, "step": 1250 }, { "epoch": 0.6593406593406593, "grad_norm": 58.6326409397173, "learning_rate": 4.785018026193862e-07, "logits/chosen": -2.721715211868286, "logits/rejected": -2.6623756885528564, "logps/chosen": -276.13470458984375, "logps/rejected": -191.18539428710938, "loss": 0.5178, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.45906153321266174, "rewards/margins": 0.90235835313797, "rewards/rejected": -1.361419916152954, "step": 1260 }, { "epoch": 0.6645735217163788, "grad_norm": 64.32538103431956, "learning_rate": 4.77879941487017e-07, "logits/chosen": -2.6778359413146973, "logits/rejected": -2.616879463195801, "logps/chosen": -243.45571899414062, "logps/rejected": -230.1975860595703, "loss": 0.5098, "rewards/accuracies": 0.75, "rewards/chosen": -0.4738277792930603, "rewards/margins": 1.05690598487854, "rewards/rejected": -1.5307337045669556, "step": 1270 }, { "epoch": 0.6698063840920984, "grad_norm": 83.93310106998426, "learning_rate": 4.772496300289748e-07, "logits/chosen": -2.718397855758667, "logits/rejected": -2.5793282985687256, "logps/chosen": -258.0218200683594, "logps/rejected": -233.40280151367188, "loss": 0.5174, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8180875778198242, "rewards/margins": 0.8874326944351196, "rewards/rejected": -1.7055202722549438, "step": 1280 }, { "epoch": 0.6750392464678179, "grad_norm": 71.21172285981052, "learning_rate": 4.766108916186949e-07, "logits/chosen": -2.6712429523468018, "logits/rejected": -2.6534574031829834, "logps/chosen": -252.097412109375, "logps/rejected": -296.68096923828125, "loss": 0.5653, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.21600313484668732, "rewards/margins": 1.2593408823013306, "rewards/rejected": -1.475343942642212, "step": 1290 }, { "epoch": 0.6802721088435374, "grad_norm": 70.06827063972005, "learning_rate": 4.759637499421042e-07, "logits/chosen": -2.6452338695526123, "logits/rejected": -2.6825003623962402, "logps/chosen": -275.8140563964844, "logps/rejected": -300.26690673828125, "loss": 0.5194, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6661376953125, "rewards/margins": 0.8043373823165894, "rewards/rejected": -1.4704749584197998, "step": 1300 }, { "epoch": 0.6855049712192569, "grad_norm": 80.34333468537173, "learning_rate": 4.7530822899674207e-07, "logits/chosen": -2.863783836364746, "logits/rejected": -2.825059413909912, "logps/chosen": -254.4219207763672, "logps/rejected": -225.2198486328125, "loss": 0.4917, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.39257654547691345, "rewards/margins": 1.1294041872024536, "rewards/rejected": -1.5219806432724, "step": 1310 }, { "epoch": 0.6907378335949764, "grad_norm": 101.81615459041603, "learning_rate": 4.7464435309087137e-07, "logits/chosen": -2.750309467315674, "logits/rejected": -2.73734712600708, "logps/chosen": -300.87109375, "logps/rejected": -313.13214111328125, "loss": 0.5185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.704337477684021, "rewards/margins": 0.6578517556190491, "rewards/rejected": -1.3621892929077148, "step": 1320 }, { "epoch": 0.6959706959706959, "grad_norm": 71.48706237174991, "learning_rate": 4.739721468425763e-07, "logits/chosen": -2.741844654083252, "logits/rejected": -2.7405097484588623, "logps/chosen": -275.5357971191406, "logps/rejected": -317.1656188964844, "loss": 0.4807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.26859548687934875, "rewards/margins": 1.2116283178329468, "rewards/rejected": -1.4802236557006836, "step": 1330 }, { "epoch": 0.7012035583464155, "grad_norm": 49.2715636482983, "learning_rate": 4.7329163517885e-07, "logits/chosen": -2.718933582305908, "logits/rejected": -2.6036744117736816, "logps/chosen": -272.68865966796875, "logps/rejected": -228.1563720703125, "loss": 0.4875, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.37356796860694885, "rewards/margins": 1.084763765335083, "rewards/rejected": -1.4583313465118408, "step": 1340 }, { "epoch": 0.706436420722135, "grad_norm": 70.78565545572422, "learning_rate": 4.7260284333466973e-07, "logits/chosen": -2.8037075996398926, "logits/rejected": -2.768467426300049, "logps/chosen": -298.2441711425781, "logps/rejected": -270.0771179199219, "loss": 0.5372, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5205308198928833, "rewards/margins": 0.7837169766426086, "rewards/rejected": -1.3042476177215576, "step": 1350 }, { "epoch": 0.7116692830978545, "grad_norm": 84.19956425522926, "learning_rate": 4.719057968520617e-07, "logits/chosen": -2.6150059700012207, "logits/rejected": -2.578220844268799, "logps/chosen": -350.07525634765625, "logps/rejected": -318.77899169921875, "loss": 0.5837, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7203130722045898, "rewards/margins": 0.6178886294364929, "rewards/rejected": -1.3382017612457275, "step": 1360 }, { "epoch": 0.716902145473574, "grad_norm": 71.32390552400585, "learning_rate": 4.7120052157915345e-07, "logits/chosen": -2.8357949256896973, "logits/rejected": -2.632596254348755, "logps/chosen": -324.78045654296875, "logps/rejected": -229.32681274414062, "loss": 0.4823, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5584132671356201, "rewards/margins": 0.9202592968940735, "rewards/rejected": -1.4786723852157593, "step": 1370 }, { "epoch": 0.7221350078492935, "grad_norm": 69.3531933754779, "learning_rate": 4.7048704366921537e-07, "logits/chosen": -2.802358627319336, "logits/rejected": -2.7255759239196777, "logps/chosen": -224.1294708251953, "logps/rejected": -263.2400207519531, "loss": 0.5116, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.37427574396133423, "rewards/margins": 1.2040784358978271, "rewards/rejected": -1.5783542394638062, "step": 1380 }, { "epoch": 0.727367870225013, "grad_norm": 57.566188366989095, "learning_rate": 4.6976538957969114e-07, "logits/chosen": -2.695610523223877, "logits/rejected": -2.584303379058838, "logps/chosen": -267.1957092285156, "logps/rejected": -235.9038543701172, "loss": 0.5166, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5423551201820374, "rewards/margins": 1.2531085014343262, "rewards/rejected": -1.7954635620117188, "step": 1390 }, { "epoch": 0.7326007326007326, "grad_norm": 75.33520016312441, "learning_rate": 4.690355860712163e-07, "logits/chosen": -2.690704822540283, "logits/rejected": -2.6887764930725098, "logps/chosen": -244.60501098632812, "logps/rejected": -269.39752197265625, "loss": 0.5501, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7962597608566284, "rewards/margins": 0.6051412224769592, "rewards/rejected": -1.4014009237289429, "step": 1400 }, { "epoch": 0.7378335949764521, "grad_norm": 84.04471122585693, "learning_rate": 4.682976602066262e-07, "logits/chosen": -2.6003170013427734, "logits/rejected": -2.5461487770080566, "logps/chosen": -260.6416931152344, "logps/rejected": -262.48638916015625, "loss": 0.5595, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6298843622207642, "rewards/margins": 1.3133777379989624, "rewards/rejected": -1.9432621002197266, "step": 1410 }, { "epoch": 0.7430664573521716, "grad_norm": 83.86573291558813, "learning_rate": 4.6755163934995224e-07, "logits/chosen": -2.7487921714782715, "logits/rejected": -2.669774055480957, "logps/chosen": -320.82354736328125, "logps/rejected": -268.43853759765625, "loss": 0.5686, "rewards/accuracies": 0.75, "rewards/chosen": -0.5370005965232849, "rewards/margins": 0.9020074009895325, "rewards/rejected": -1.4390079975128174, "step": 1420 }, { "epoch": 0.7482993197278912, "grad_norm": 55.25890313638634, "learning_rate": 4.667975511654072e-07, "logits/chosen": -2.7589454650878906, "logits/rejected": -2.6432929039001465, "logps/chosen": -304.049560546875, "logps/rejected": -270.5584411621094, "loss": 0.4897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4467310309410095, "rewards/margins": 1.0153915882110596, "rewards/rejected": -1.4621226787567139, "step": 1430 }, { "epoch": 0.7535321821036107, "grad_norm": 84.87447605280406, "learning_rate": 4.660354236163595e-07, "logits/chosen": -2.8089873790740967, "logits/rejected": -2.702833414077759, "logps/chosen": -362.2886962890625, "logps/rejected": -322.1484680175781, "loss": 0.5516, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.03776503726840019, "rewards/margins": 1.153167486190796, "rewards/rejected": -1.1909326314926147, "step": 1440 }, { "epoch": 0.7587650444793302, "grad_norm": 62.630365097419194, "learning_rate": 4.6526528496429606e-07, "logits/chosen": -2.7678020000457764, "logits/rejected": -2.6640100479125977, "logps/chosen": -304.62579345703125, "logps/rejected": -283.0849609375, "loss": 0.5381, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6487807631492615, "rewards/margins": 1.1606749296188354, "rewards/rejected": -1.8094558715820312, "step": 1450 }, { "epoch": 0.7639979068550498, "grad_norm": 57.142971090558646, "learning_rate": 4.644871637677745e-07, "logits/chosen": -2.6548714637756348, "logits/rejected": -2.6578633785247803, "logps/chosen": -221.1512908935547, "logps/rejected": -237.13601684570312, "loss": 0.5584, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4943154752254486, "rewards/margins": 0.6672312021255493, "rewards/rejected": -1.1615464687347412, "step": 1460 }, { "epoch": 0.7692307692307693, "grad_norm": 56.71803507289668, "learning_rate": 4.637010888813638e-07, "logits/chosen": -2.7698307037353516, "logits/rejected": -2.6017284393310547, "logps/chosen": -339.74176025390625, "logps/rejected": -253.5137481689453, "loss": 0.4972, "rewards/accuracies": 0.75, "rewards/chosen": -0.47833189368247986, "rewards/margins": 0.9227139353752136, "rewards/rejected": -1.4010460376739502, "step": 1470 }, { "epoch": 0.7744636316064888, "grad_norm": 59.84525328670555, "learning_rate": 4.6290708945457493e-07, "logits/chosen": -2.658843517303467, "logits/rejected": -2.6468276977539062, "logps/chosen": -258.4879455566406, "logps/rejected": -245.56283569335938, "loss": 0.5689, "rewards/accuracies": 0.625, "rewards/chosen": -0.7929760217666626, "rewards/margins": 0.5606399178504944, "rewards/rejected": -1.3536159992218018, "step": 1480 }, { "epoch": 0.7796964939822083, "grad_norm": 80.8077698995965, "learning_rate": 4.6210519493077887e-07, "logits/chosen": -2.454429864883423, "logits/rejected": -2.4639267921447754, "logps/chosen": -291.4048156738281, "logps/rejected": -282.96368408203125, "loss": 0.5162, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7014742493629456, "rewards/margins": 0.7910742163658142, "rewards/rejected": -1.4925483465194702, "step": 1490 }, { "epoch": 0.7849293563579278, "grad_norm": 66.51030465502379, "learning_rate": 4.6129543504611607e-07, "logits/chosen": -2.6922767162323, "logits/rejected": -2.643317461013794, "logps/chosen": -220.99935913085938, "logps/rejected": -281.58038330078125, "loss": 0.4868, "rewards/accuracies": 0.625, "rewards/chosen": -0.8320234417915344, "rewards/margins": 1.1816871166229248, "rewards/rejected": -2.0137104988098145, "step": 1500 }, { "epoch": 0.7901622187336473, "grad_norm": 53.99248963391964, "learning_rate": 4.604778398283927e-07, "logits/chosen": -2.622396469116211, "logits/rejected": -2.6255924701690674, "logps/chosen": -271.9859619140625, "logps/rejected": -316.2346496582031, "loss": 0.5778, "rewards/accuracies": 0.625, "rewards/chosen": -1.1589412689208984, "rewards/margins": 0.8864221572875977, "rewards/rejected": -2.045363426208496, "step": 1510 }, { "epoch": 0.7953950811093669, "grad_norm": 56.49013069960226, "learning_rate": 4.596524395959678e-07, "logits/chosen": -2.6993722915649414, "logits/rejected": -2.637159824371338, "logps/chosen": -231.75247192382812, "logps/rejected": -268.9724426269531, "loss": 0.52, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6520730257034302, "rewards/margins": 1.2404626607894897, "rewards/rejected": -1.8925358057022095, "step": 1520 }, { "epoch": 0.8006279434850864, "grad_norm": 67.5153392153966, "learning_rate": 4.588192649566285e-07, "logits/chosen": -2.8271138668060303, "logits/rejected": -2.790332794189453, "logps/chosen": -330.3937683105469, "logps/rejected": -391.48492431640625, "loss": 0.489, "rewards/accuracies": 0.75, "rewards/chosen": -0.7549493312835693, "rewards/margins": 0.98554927110672, "rewards/rejected": -1.7404985427856445, "step": 1530 }, { "epoch": 0.8058608058608059, "grad_norm": 54.8196546474935, "learning_rate": 4.5797834680645553e-07, "logits/chosen": -2.734687566757202, "logits/rejected": -2.7426223754882812, "logps/chosen": -369.28692626953125, "logps/rejected": -329.3565979003906, "loss": 0.5567, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7556370496749878, "rewards/margins": 0.6318597793579102, "rewards/rejected": -1.3874967098236084, "step": 1540 }, { "epoch": 0.8110936682365254, "grad_norm": 63.54050063708392, "learning_rate": 4.5712971632867715e-07, "logits/chosen": -2.7076220512390137, "logits/rejected": -2.5632991790771484, "logps/chosen": -329.8143310546875, "logps/rejected": -240.5828094482422, "loss": 0.5079, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.22298932075500488, "rewards/margins": 1.1761170625686646, "rewards/rejected": -1.3991062641143799, "step": 1550 }, { "epoch": 0.8163265306122449, "grad_norm": 57.19800278535567, "learning_rate": 4.562734049925129e-07, "logits/chosen": -2.7164652347564697, "logits/rejected": -2.6165108680725098, "logps/chosen": -356.35986328125, "logps/rejected": -309.109130859375, "loss": 0.5143, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5088120698928833, "rewards/margins": 1.126982569694519, "rewards/rejected": -1.6357946395874023, "step": 1560 }, { "epoch": 0.8215593929879644, "grad_norm": 64.23711870846407, "learning_rate": 4.5540944455200663e-07, "logits/chosen": -2.697077512741089, "logits/rejected": -2.6353626251220703, "logps/chosen": -244.924072265625, "logps/rejected": -262.2737121582031, "loss": 0.4999, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5048820972442627, "rewards/margins": 1.0286469459533691, "rewards/rejected": -1.5335289239883423, "step": 1570 }, { "epoch": 0.826792255363684, "grad_norm": 76.683910640695, "learning_rate": 4.545378670448492e-07, "logits/chosen": -2.749056816101074, "logits/rejected": -2.5895638465881348, "logps/chosen": -298.3635559082031, "logps/rejected": -256.74139404296875, "loss": 0.5736, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8221967816352844, "rewards/margins": 0.8676072359085083, "rewards/rejected": -1.6898040771484375, "step": 1580 }, { "epoch": 0.8320251177394035, "grad_norm": 78.25471044424866, "learning_rate": 4.5365870479119014e-07, "logits/chosen": -2.612903356552124, "logits/rejected": -2.4850778579711914, "logps/chosen": -246.4439697265625, "logps/rejected": -225.95718383789062, "loss": 0.4848, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.502545177936554, "rewards/margins": 1.2864090204238892, "rewards/rejected": -1.7889540195465088, "step": 1590 }, { "epoch": 0.837257980115123, "grad_norm": 73.52834770469094, "learning_rate": 4.5277199039243917e-07, "logits/chosen": -2.6008079051971436, "logits/rejected": -2.623173475265503, "logps/chosen": -266.70184326171875, "logps/rejected": -297.88494873046875, "loss": 0.5043, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8646143078804016, "rewards/margins": 0.8069089651107788, "rewards/rejected": -1.6715233325958252, "step": 1600 }, { "epoch": 0.8424908424908425, "grad_norm": 72.95183490586152, "learning_rate": 4.5187775673005744e-07, "logits/chosen": -2.783461093902588, "logits/rejected": -2.638617753982544, "logps/chosen": -384.44561767578125, "logps/rejected": -337.8665466308594, "loss": 0.5219, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5897997617721558, "rewards/margins": 1.0438328981399536, "rewards/rejected": -1.6336326599121094, "step": 1610 }, { "epoch": 0.847723704866562, "grad_norm": 53.15633631946215, "learning_rate": 4.509760369643384e-07, "logits/chosen": -2.680973529815674, "logits/rejected": -2.54726243019104, "logps/chosen": -295.14544677734375, "logps/rejected": -250.4654998779297, "loss": 0.5654, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.894762396812439, "rewards/margins": 0.6794382333755493, "rewards/rejected": -1.5742003917694092, "step": 1620 }, { "epoch": 0.8529565672422815, "grad_norm": 71.14002291944531, "learning_rate": 4.5006686453317734e-07, "logits/chosen": -2.824927806854248, "logits/rejected": -2.827450752258301, "logps/chosen": -247.74349975585938, "logps/rejected": -260.1580505371094, "loss": 0.5422, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5920854806900024, "rewards/margins": 0.9330133199691772, "rewards/rejected": -1.5250988006591797, "step": 1630 }, { "epoch": 0.858189429618001, "grad_norm": 74.27481044176433, "learning_rate": 4.4915027315083243e-07, "logits/chosen": -2.720771074295044, "logits/rejected": -2.7052905559539795, "logps/chosen": -318.4162902832031, "logps/rejected": -299.65234375, "loss": 0.5582, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5697705745697021, "rewards/margins": 0.8112031817436218, "rewards/rejected": -1.3809736967086792, "step": 1640 }, { "epoch": 0.8634222919937206, "grad_norm": 71.63285416460687, "learning_rate": 4.482262968066737e-07, "logits/chosen": -2.69111704826355, "logits/rejected": -2.627052068710327, "logps/chosen": -290.1894836425781, "logps/rejected": -296.7138366699219, "loss": 0.5193, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6927599906921387, "rewards/margins": 0.6575680375099182, "rewards/rejected": -1.3503280878067017, "step": 1650 }, { "epoch": 0.8686551543694401, "grad_norm": 63.09467029844952, "learning_rate": 4.4729496976392324e-07, "logits/chosen": -2.6803653240203857, "logits/rejected": -2.628178596496582, "logps/chosen": -222.30813598632812, "logps/rejected": -258.65277099609375, "loss": 0.5362, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6566628813743591, "rewards/margins": 0.8711689114570618, "rewards/rejected": -1.5278319120407104, "step": 1660 }, { "epoch": 0.8738880167451596, "grad_norm": 88.87966503532546, "learning_rate": 4.463563265583843e-07, "logits/chosen": -2.821124315261841, "logits/rejected": -2.6980767250061035, "logps/chosen": -271.47381591796875, "logps/rejected": -271.1028747558594, "loss": 0.5337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6489533185958862, "rewards/margins": 0.9798796772956848, "rewards/rejected": -1.6288330554962158, "step": 1670 }, { "epoch": 0.8791208791208791, "grad_norm": 62.47341360667123, "learning_rate": 4.4541040199716063e-07, "logits/chosen": -2.639287233352661, "logits/rejected": -2.6262917518615723, "logps/chosen": -263.83380126953125, "logps/rejected": -286.5048828125, "loss": 0.4636, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5826305150985718, "rewards/margins": 1.0568525791168213, "rewards/rejected": -1.639483094215393, "step": 1680 }, { "epoch": 0.8843537414965986, "grad_norm": 76.00018439668553, "learning_rate": 4.4445723115736587e-07, "logits/chosen": -2.6233866214752197, "logits/rejected": -2.572371244430542, "logps/chosen": -259.8497619628906, "logps/rejected": -245.5025634765625, "loss": 0.4983, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5224615335464478, "rewards/margins": 1.167961597442627, "rewards/rejected": -1.6904230117797852, "step": 1690 }, { "epoch": 0.8895866038723181, "grad_norm": 66.43267421558812, "learning_rate": 4.434968493848228e-07, "logits/chosen": -2.676736354827881, "logits/rejected": -2.6138041019439697, "logps/chosen": -284.3275146484375, "logps/rejected": -275.8121032714844, "loss": 0.5111, "rewards/accuracies": 0.625, "rewards/chosen": -0.8793582916259766, "rewards/margins": 0.8545160293579102, "rewards/rejected": -1.7338743209838867, "step": 1700 }, { "epoch": 0.8948194662480377, "grad_norm": 80.56001605676286, "learning_rate": 4.425292922927525e-07, "logits/chosen": -2.70340895652771, "logits/rejected": -2.6097865104675293, "logps/chosen": -342.5941162109375, "logps/rejected": -331.31744384765625, "loss": 0.521, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5397204756736755, "rewards/margins": 0.9078131914138794, "rewards/rejected": -1.4475336074829102, "step": 1710 }, { "epoch": 0.9000523286237572, "grad_norm": 60.35160268438662, "learning_rate": 4.41554595760454e-07, "logits/chosen": -2.775596857070923, "logits/rejected": -2.607522487640381, "logps/chosen": -302.60076904296875, "logps/rejected": -264.5050354003906, "loss": 0.5385, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9988101720809937, "rewards/margins": 0.4468079209327698, "rewards/rejected": -1.4456180334091187, "step": 1720 }, { "epoch": 0.9052851909994767, "grad_norm": 75.91645655507445, "learning_rate": 4.4057279593197326e-07, "logits/chosen": -2.7369158267974854, "logits/rejected": -2.657431125640869, "logps/chosen": -237.8176727294922, "logps/rejected": -199.45449829101562, "loss": 0.5123, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5934224724769592, "rewards/margins": 0.9565766453742981, "rewards/rejected": -1.5499989986419678, "step": 1730 }, { "epoch": 0.9105180533751962, "grad_norm": 87.85842673612727, "learning_rate": 4.395839292147637e-07, "logits/chosen": -2.774777412414551, "logits/rejected": -2.6052792072296143, "logps/chosen": -262.7558288574219, "logps/rejected": -229.07363891601562, "loss": 0.564, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9033422470092773, "rewards/margins": 0.7843619585037231, "rewards/rejected": -1.68770432472229, "step": 1740 }, { "epoch": 0.9157509157509157, "grad_norm": 69.63472701534312, "learning_rate": 4.3858803227833526e-07, "logits/chosen": -2.750851631164551, "logits/rejected": -2.7061922550201416, "logps/chosen": -333.9599914550781, "logps/rejected": -302.74261474609375, "loss": 0.5569, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.48318320512771606, "rewards/margins": 1.0502393245697021, "rewards/rejected": -1.5334227085113525, "step": 1750 }, { "epoch": 0.9209837781266352, "grad_norm": 77.24089380633716, "learning_rate": 4.375851420528951e-07, "logits/chosen": -2.7777843475341797, "logits/rejected": -2.7423300743103027, "logps/chosen": -234.72314453125, "logps/rejected": -225.20687866210938, "loss": 0.4943, "rewards/accuracies": 0.625, "rewards/chosen": -0.7239280343055725, "rewards/margins": 0.6287710070610046, "rewards/rejected": -1.3526990413665771, "step": 1760 }, { "epoch": 0.9262166405023547, "grad_norm": 73.03220449201594, "learning_rate": 4.36575295727978e-07, "logits/chosen": -2.6381800174713135, "logits/rejected": -2.5485033988952637, "logps/chosen": -306.7220764160156, "logps/rejected": -267.9762268066406, "loss": 0.5263, "rewards/accuracies": 0.75, "rewards/chosen": -0.7330409288406372, "rewards/margins": 0.9656373262405396, "rewards/rejected": -1.6986782550811768, "step": 1770 }, { "epoch": 0.9314495028780743, "grad_norm": 58.57471867811895, "learning_rate": 4.355585307510675e-07, "logits/chosen": -2.619809865951538, "logits/rejected": -2.5870203971862793, "logps/chosen": -253.3949432373047, "logps/rejected": -229.19570922851562, "loss": 0.5223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8501235842704773, "rewards/margins": 0.9419552683830261, "rewards/rejected": -1.792078971862793, "step": 1780 }, { "epoch": 0.9366823652537938, "grad_norm": 76.61140880327854, "learning_rate": 4.345348848262068e-07, "logits/chosen": -2.713588237762451, "logits/rejected": -2.7371697425842285, "logps/chosen": -337.8150329589844, "logps/rejected": -341.7342224121094, "loss": 0.5213, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4116416573524475, "rewards/margins": 0.8930836915969849, "rewards/rejected": -1.3047252893447876, "step": 1790 }, { "epoch": 0.9419152276295133, "grad_norm": 84.84756785476401, "learning_rate": 4.33504395912601e-07, "logits/chosen": -2.5321755409240723, "logits/rejected": -2.4565305709838867, "logps/chosen": -244.6804962158203, "logps/rejected": -289.1626892089844, "loss": 0.5003, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.976446807384491, "rewards/margins": 1.38921058177948, "rewards/rejected": -2.365657329559326, "step": 1800 }, { "epoch": 0.9471480900052328, "grad_norm": 76.15234371104347, "learning_rate": 4.324671022232095e-07, "logits/chosen": -2.739748477935791, "logits/rejected": -2.6528263092041016, "logps/chosen": -266.6080017089844, "logps/rejected": -242.7993927001953, "loss": 0.5395, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6612841486930847, "rewards/margins": 1.1026535034179688, "rewards/rejected": -1.7639377117156982, "step": 1810 }, { "epoch": 0.9523809523809523, "grad_norm": 59.54173043288726, "learning_rate": 4.314230422233286e-07, "logits/chosen": -2.656968593597412, "logits/rejected": -2.564446449279785, "logps/chosen": -226.46499633789062, "logps/rejected": -199.26419067382812, "loss": 0.554, "rewards/accuracies": 0.75, "rewards/chosen": -0.7357887029647827, "rewards/margins": 0.8301893472671509, "rewards/rejected": -1.565977931022644, "step": 1820 }, { "epoch": 0.957613814756672, "grad_norm": 90.01723409563466, "learning_rate": 4.303722546291655e-07, "logits/chosen": -2.8147635459899902, "logits/rejected": -2.748579502105713, "logps/chosen": -293.80706787109375, "logps/rejected": -253.5705108642578, "loss": 0.5496, "rewards/accuracies": 0.625, "rewards/chosen": -0.6129995584487915, "rewards/margins": 0.715448260307312, "rewards/rejected": -1.3284478187561035, "step": 1830 }, { "epoch": 0.9628466771323915, "grad_norm": 66.06404656121876, "learning_rate": 4.2931477840640243e-07, "logits/chosen": -2.7455453872680664, "logits/rejected": -2.5494322776794434, "logps/chosen": -342.5234069824219, "logps/rejected": -287.9923095703125, "loss": 0.5235, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6869091391563416, "rewards/margins": 0.9585253000259399, "rewards/rejected": -1.6454343795776367, "step": 1840 }, { "epoch": 0.968079539508111, "grad_norm": 70.59339859610029, "learning_rate": 4.282506527687517e-07, "logits/chosen": -2.6523709297180176, "logits/rejected": -2.596681833267212, "logps/chosen": -384.2684326171875, "logps/rejected": -315.46728515625, "loss": 0.5384, "rewards/accuracies": 0.75, "rewards/chosen": -0.2603829503059387, "rewards/margins": 1.1419236660003662, "rewards/rejected": -1.4023066759109497, "step": 1850 }, { "epoch": 0.9733124018838305, "grad_norm": 56.435468927974156, "learning_rate": 4.271799171765016e-07, "logits/chosen": -2.6715855598449707, "logits/rejected": -2.499748706817627, "logps/chosen": -336.3532409667969, "logps/rejected": -253.0960693359375, "loss": 0.5253, "rewards/accuracies": 0.75, "rewards/chosen": -0.9096790552139282, "rewards/margins": 0.7897397875785828, "rewards/rejected": -1.6994187831878662, "step": 1860 }, { "epoch": 0.97854526425955, "grad_norm": 79.21208604111206, "learning_rate": 4.2610261133505323e-07, "logits/chosen": -2.7125673294067383, "logits/rejected": -2.6025753021240234, "logps/chosen": -257.30023193359375, "logps/rejected": -251.1166534423828, "loss": 0.5067, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5088145136833191, "rewards/margins": 0.9294927716255188, "rewards/rejected": -1.438307285308838, "step": 1870 }, { "epoch": 0.9837781266352695, "grad_norm": 67.10889828757449, "learning_rate": 4.250187751934479e-07, "logits/chosen": -2.759009838104248, "logits/rejected": -2.835824966430664, "logps/chosen": -262.1882019042969, "logps/rejected": -329.35089111328125, "loss": 0.513, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.34654396772384644, "rewards/margins": 0.9472799301147461, "rewards/rejected": -1.2938238382339478, "step": 1880 }, { "epoch": 0.989010989010989, "grad_norm": 68.91612684145302, "learning_rate": 4.2392844894288605e-07, "logits/chosen": -2.685486078262329, "logits/rejected": -2.623790740966797, "logps/chosen": -402.46978759765625, "logps/rejected": -347.0303039550781, "loss": 0.5157, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5702678561210632, "rewards/margins": 0.7701340913772583, "rewards/rejected": -1.3404020071029663, "step": 1890 }, { "epoch": 0.9942438513867086, "grad_norm": 63.01428510274556, "learning_rate": 4.2283167301523634e-07, "logits/chosen": -2.8204257488250732, "logits/rejected": -2.680922508239746, "logps/chosen": -238.2408447265625, "logps/rejected": -238.68704223632812, "loss": 0.5127, "rewards/accuracies": 0.75, "rewards/chosen": -0.9641359448432922, "rewards/margins": 0.9542892575263977, "rewards/rejected": -1.91842520236969, "step": 1900 }, { "epoch": 0.9994767137624281, "grad_norm": 82.43417623887333, "learning_rate": 4.217284880815369e-07, "logits/chosen": -2.667724370956421, "logits/rejected": -2.6773922443389893, "logps/chosen": -338.7535705566406, "logps/rejected": -349.4644470214844, "loss": 0.5238, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.45123291015625, "rewards/margins": 1.566555380821228, "rewards/rejected": -2.0177884101867676, "step": 1910 }, { "epoch": 1.0047095761381475, "grad_norm": 41.11062818319924, "learning_rate": 4.2061893505048694e-07, "logits/chosen": -2.667792558670044, "logits/rejected": -2.6502389907836914, "logps/chosen": -202.4060821533203, "logps/rejected": -271.0544128417969, "loss": 0.1721, "rewards/accuracies": 0.875, "rewards/chosen": 0.08865700662136078, "rewards/margins": 2.546302318572998, "rewards/rejected": -2.4576454162597656, "step": 1920 }, { "epoch": 1.0099424385138671, "grad_norm": 16.348882259373745, "learning_rate": 4.1950305506692967e-07, "logits/chosen": -2.8264620304107666, "logits/rejected": -2.6654040813446045, "logps/chosen": -318.1976013183594, "logps/rejected": -296.55914306640625, "loss": 0.1149, "rewards/accuracies": 1.0, "rewards/chosen": 1.2823026180267334, "rewards/margins": 4.868466377258301, "rewards/rejected": -3.5861637592315674, "step": 1930 }, { "epoch": 1.0151753008895865, "grad_norm": 20.175971999612642, "learning_rate": 4.1838088951032656e-07, "logits/chosen": -2.559612989425659, "logits/rejected": -2.506347179412842, "logps/chosen": -345.3162536621094, "logps/rejected": -316.6770935058594, "loss": 0.1262, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7579983472824097, "rewards/margins": 4.48593807220459, "rewards/rejected": -3.7279388904571533, "step": 1940 }, { "epoch": 1.0204081632653061, "grad_norm": 15.520890307252973, "learning_rate": 4.172524799932231e-07, "logits/chosen": -2.6519670486450195, "logits/rejected": -2.6060433387756348, "logps/chosen": -217.2434539794922, "logps/rejected": -279.20989990234375, "loss": 0.1184, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.11215732246637344, "rewards/margins": 3.2862472534179688, "rewards/rejected": -3.1740899085998535, "step": 1950 }, { "epoch": 1.0256410256410255, "grad_norm": 13.628134487221653, "learning_rate": 4.161178683597054e-07, "logits/chosen": -2.804748773574829, "logits/rejected": -2.621863842010498, "logps/chosen": -257.8443603515625, "logps/rejected": -234.2767333984375, "loss": 0.0981, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.13752229511737823, "rewards/margins": 3.750183582305908, "rewards/rejected": -3.612661361694336, "step": 1960 }, { "epoch": 1.0308738880167452, "grad_norm": 24.172632805742488, "learning_rate": 4.1497709668384885e-07, "logits/chosen": -2.7590997219085693, "logits/rejected": -2.6684045791625977, "logps/chosen": -342.3634338378906, "logps/rejected": -321.36480712890625, "loss": 0.1018, "rewards/accuracies": 1.0, "rewards/chosen": 0.8379875421524048, "rewards/margins": 4.788851737976074, "rewards/rejected": -3.95086407661438, "step": 1970 }, { "epoch": 1.0361067503924646, "grad_norm": 33.73220087524629, "learning_rate": 4.1383020726815745e-07, "logits/chosen": -2.7631189823150635, "logits/rejected": -2.651642084121704, "logps/chosen": -245.7153778076172, "logps/rejected": -274.5694885253906, "loss": 0.1152, "rewards/accuracies": 0.875, "rewards/chosen": -0.14847412705421448, "rewards/margins": 3.823739528656006, "rewards/rejected": -3.9722137451171875, "step": 1980 }, { "epoch": 1.0413396127681842, "grad_norm": 22.264648453297795, "learning_rate": 4.126772426419959e-07, "logits/chosen": -2.642127275466919, "logits/rejected": -2.661806583404541, "logps/chosen": -261.2215881347656, "logps/rejected": -302.48541259765625, "loss": 0.1445, "rewards/accuracies": 0.875, "rewards/chosen": -0.09080305695533752, "rewards/margins": 3.6442935466766357, "rewards/rejected": -3.7350971698760986, "step": 1990 }, { "epoch": 1.0465724751439036, "grad_norm": 13.044758008413115, "learning_rate": 4.1151824556001145e-07, "logits/chosen": -2.7277259826660156, "logits/rejected": -2.6704182624816895, "logps/chosen": -231.52023315429688, "logps/rejected": -295.1512145996094, "loss": 0.1219, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.004637342877686024, "rewards/margins": 4.0213823318481445, "rewards/rejected": -4.016745090484619, "step": 2000 }, { "epoch": 1.0465724751439036, "eval_logits/chosen": -2.681251049041748, "eval_logits/rejected": -2.62642502784729, "eval_logps/chosen": -280.007568359375, "eval_logps/rejected": -295.7981872558594, "eval_loss": 0.5597859025001526, "eval_rewards/accuracies": 0.75390625, "eval_rewards/chosen": -1.2750576734542847, "eval_rewards/margins": 1.320380449295044, "eval_rewards/rejected": -2.595438003540039, "eval_runtime": 101.4599, "eval_samples_per_second": 19.712, "eval_steps_per_second": 0.315, "step": 2000 }, { "epoch": 1.0518053375196232, "grad_norm": 5.757416919621772, "learning_rate": 4.103532590005495e-07, "logits/chosen": -2.795039653778076, "logits/rejected": -2.6831464767456055, "logps/chosen": -273.0514831542969, "logps/rejected": -247.02188110351562, "loss": 0.0915, "rewards/accuracies": 1.0, "rewards/chosen": 0.48246365785598755, "rewards/margins": 4.765566825866699, "rewards/rejected": -4.283102512359619, "step": 2010 }, { "epoch": 1.0570381998953426, "grad_norm": 14.083494320034111, "learning_rate": 4.091823261640592e-07, "logits/chosen": -2.7425522804260254, "logits/rejected": -2.6334598064422607, "logps/chosen": -253.7746124267578, "logps/rejected": -252.0037078857422, "loss": 0.117, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.34954172372817993, "rewards/margins": 4.962024211883545, "rewards/rejected": -4.612483024597168, "step": 2020 }, { "epoch": 1.0622710622710623, "grad_norm": 19.89811596162904, "learning_rate": 4.080054904714917e-07, "logits/chosen": -2.693772792816162, "logits/rejected": -2.6306285858154297, "logps/chosen": -240.82015991210938, "logps/rejected": -271.550048828125, "loss": 0.106, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5872729420661926, "rewards/margins": 3.7091548442840576, "rewards/rejected": -4.296427249908447, "step": 2030 }, { "epoch": 1.0675039246467817, "grad_norm": 18.48630840724053, "learning_rate": 4.0682279556268993e-07, "logits/chosen": -2.687135696411133, "logits/rejected": -2.6527206897735596, "logps/chosen": -321.5030822753906, "logps/rejected": -355.2987365722656, "loss": 0.1024, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.37303978204727173, "rewards/margins": 5.5488667488098145, "rewards/rejected": -5.175827503204346, "step": 2040 }, { "epoch": 1.0727367870225013, "grad_norm": 72.20430790189955, "learning_rate": 4.056342852947706e-07, "logits/chosen": -2.857966661453247, "logits/rejected": -2.657653331756592, "logps/chosen": -358.5530700683594, "logps/rejected": -333.0049743652344, "loss": 0.1054, "rewards/accuracies": 1.0, "rewards/chosen": 0.41347384452819824, "rewards/margins": 5.176710605621338, "rewards/rejected": -4.763236999511719, "step": 2050 }, { "epoch": 1.077969649398221, "grad_norm": 15.825306334222, "learning_rate": 4.044400037404973e-07, "logits/chosen": -2.7194700241088867, "logits/rejected": -2.6635639667510986, "logps/chosen": -209.5430450439453, "logps/rejected": -231.8815155029297, "loss": 0.0934, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1871255487203598, "rewards/margins": 3.7858691215515137, "rewards/rejected": -3.598743438720703, "step": 2060 }, { "epoch": 1.0832025117739403, "grad_norm": 24.60722719393446, "learning_rate": 4.032399951866468e-07, "logits/chosen": -2.6166274547576904, "logits/rejected": -2.493044853210449, "logps/chosen": -220.2571563720703, "logps/rejected": -224.3196563720703, "loss": 0.1081, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.0679265633225441, "rewards/margins": 3.6435153484344482, "rewards/rejected": -3.7114417552948, "step": 2070 }, { "epoch": 1.08843537414966, "grad_norm": 11.124081035150457, "learning_rate": 4.0203430413236637e-07, "logits/chosen": -2.7528700828552246, "logits/rejected": -2.6914639472961426, "logps/chosen": -290.67840576171875, "logps/rejected": -337.6015625, "loss": 0.1114, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.13888368010520935, "rewards/margins": 4.532259464263916, "rewards/rejected": -4.393376350402832, "step": 2080 }, { "epoch": 1.0936682365253794, "grad_norm": 26.233852464976422, "learning_rate": 4.0082297528752407e-07, "logits/chosen": -2.6731064319610596, "logits/rejected": -2.5635428428649902, "logps/chosen": -190.65933227539062, "logps/rejected": -245.5774688720703, "loss": 0.1073, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.025271248072385788, "rewards/margins": 4.773611545562744, "rewards/rejected": -4.798882484436035, "step": 2090 }, { "epoch": 1.098901098901099, "grad_norm": 26.69458051765813, "learning_rate": 3.9960605357105e-07, "logits/chosen": -2.754662275314331, "logits/rejected": -2.672816753387451, "logps/chosen": -271.67108154296875, "logps/rejected": -290.9272155761719, "loss": 0.1017, "rewards/accuracies": 0.875, "rewards/chosen": -0.26762229204177856, "rewards/margins": 4.417377471923828, "rewards/rejected": -4.684999465942383, "step": 2100 }, { "epoch": 1.1041339612768184, "grad_norm": 15.294518524435599, "learning_rate": 3.983835841092716e-07, "logits/chosen": -2.7272603511810303, "logits/rejected": -2.5137414932250977, "logps/chosen": -303.4950256347656, "logps/rejected": -244.9526824951172, "loss": 0.112, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.03372827544808388, "rewards/margins": 4.539273262023926, "rewards/rejected": -4.573000907897949, "step": 2110 }, { "epoch": 1.109366823652538, "grad_norm": 22.579864784334834, "learning_rate": 3.971556122342398e-07, "logits/chosen": -2.7080860137939453, "logits/rejected": -2.598214626312256, "logps/chosen": -262.7431945800781, "logps/rejected": -255.9542999267578, "loss": 0.1188, "rewards/accuracies": 1.0, "rewards/chosen": -0.2957598567008972, "rewards/margins": 3.6235549449920654, "rewards/rejected": -3.9193148612976074, "step": 2120 }, { "epoch": 1.1145996860282574, "grad_norm": 32.3871406216201, "learning_rate": 3.9592218348204766e-07, "logits/chosen": -2.754770278930664, "logits/rejected": -2.6260383129119873, "logps/chosen": -282.0311584472656, "logps/rejected": -283.6141052246094, "loss": 0.094, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.2089064121246338, "rewards/margins": 3.732781171798706, "rewards/rejected": -3.9416871070861816, "step": 2130 }, { "epoch": 1.119832548403977, "grad_norm": 26.50872335052292, "learning_rate": 3.946833435911423e-07, "logits/chosen": -2.7639455795288086, "logits/rejected": -2.5854954719543457, "logps/chosen": -241.7471923828125, "logps/rejected": -255.3394775390625, "loss": 0.1284, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.10643292963504791, "rewards/margins": 5.131852149963379, "rewards/rejected": -5.025418281555176, "step": 2140 }, { "epoch": 1.1250654107796965, "grad_norm": 13.796588381910421, "learning_rate": 3.9343913850062856e-07, "logits/chosen": -2.653146982192993, "logits/rejected": -2.7795047760009766, "logps/chosen": -219.7615966796875, "logps/rejected": -321.1592712402344, "loss": 0.1194, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6270453929901123, "rewards/margins": 4.35850715637207, "rewards/rejected": -4.985552787780762, "step": 2150 }, { "epoch": 1.130298273155416, "grad_norm": 18.129359689281564, "learning_rate": 3.921896143485657e-07, "logits/chosen": -2.6613283157348633, "logits/rejected": -2.5742406845092773, "logps/chosen": -274.45330810546875, "logps/rejected": -293.07037353515625, "loss": 0.1367, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5230139493942261, "rewards/margins": 4.180461883544922, "rewards/rejected": -4.703475475311279, "step": 2160 }, { "epoch": 1.1355311355311355, "grad_norm": 31.59182203614497, "learning_rate": 3.9093481747025615e-07, "logits/chosen": -2.8258137702941895, "logits/rejected": -2.696892023086548, "logps/chosen": -304.492919921875, "logps/rejected": -299.60235595703125, "loss": 0.1077, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1446405053138733, "rewards/margins": 4.210850715637207, "rewards/rejected": -4.3554911613464355, "step": 2170 }, { "epoch": 1.1407639979068551, "grad_norm": 16.54672073873017, "learning_rate": 3.896747943965275e-07, "logits/chosen": -2.798146963119507, "logits/rejected": -2.6052534580230713, "logps/chosen": -246.67373657226562, "logps/rejected": -273.1455993652344, "loss": 0.1118, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.24677284061908722, "rewards/margins": 4.432964324951172, "rewards/rejected": -4.679737567901611, "step": 2180 }, { "epoch": 1.1459968602825745, "grad_norm": 11.0165801283736, "learning_rate": 3.8840959185200717e-07, "logits/chosen": -2.6495652198791504, "logits/rejected": -2.696471691131592, "logps/chosen": -264.830322265625, "logps/rejected": -284.7891540527344, "loss": 0.1036, "rewards/accuracies": 0.875, "rewards/chosen": 0.07012466341257095, "rewards/margins": 4.172719955444336, "rewards/rejected": -4.102595329284668, "step": 2190 }, { "epoch": 1.1512297226582942, "grad_norm": 25.08165850770016, "learning_rate": 3.871392567533893e-07, "logits/chosen": -2.758479595184326, "logits/rejected": -2.6219115257263184, "logps/chosen": -309.43341064453125, "logps/rejected": -308.2267761230469, "loss": 0.0932, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03831760957837105, "rewards/margins": 4.258543968200684, "rewards/rejected": -4.220226764678955, "step": 2200 }, { "epoch": 1.1564625850340136, "grad_norm": 19.945909449447672, "learning_rate": 3.858638362076953e-07, "logits/chosen": -2.649031162261963, "logits/rejected": -2.521435260772705, "logps/chosen": -267.05474853515625, "logps/rejected": -279.1736755371094, "loss": 0.0981, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2997003197669983, "rewards/margins": 4.197286128997803, "rewards/rejected": -3.8975861072540283, "step": 2210 }, { "epoch": 1.1616954474097332, "grad_norm": 8.116785041619545, "learning_rate": 3.845833775105272e-07, "logits/chosen": -2.716078996658325, "logits/rejected": -2.6938929557800293, "logps/chosen": -250.26473999023438, "logps/rejected": -304.3114013671875, "loss": 0.0701, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.47546157240867615, "rewards/margins": 4.779170989990234, "rewards/rejected": -4.303709030151367, "step": 2220 }, { "epoch": 1.1669283097854526, "grad_norm": 52.93274090068003, "learning_rate": 3.832979281443133e-07, "logits/chosen": -2.7947661876678467, "logits/rejected": -2.7615036964416504, "logps/chosen": -251.92333984375, "logps/rejected": -279.75067138671875, "loss": 0.1208, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4818398952484131, "rewards/margins": 4.38036584854126, "rewards/rejected": -3.8985257148742676, "step": 2230 }, { "epoch": 1.1721611721611722, "grad_norm": 14.730493618178814, "learning_rate": 3.8200753577654765e-07, "logits/chosen": -2.7494163513183594, "logits/rejected": -2.624601364135742, "logps/chosen": -240.26797485351562, "logps/rejected": -286.29998779296875, "loss": 0.12, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.16159066557884216, "rewards/margins": 4.487453460693359, "rewards/rejected": -4.649044513702393, "step": 2240 }, { "epoch": 1.1773940345368916, "grad_norm": 20.5831665916214, "learning_rate": 3.8071224825802273e-07, "logits/chosen": -2.821059226989746, "logits/rejected": -2.809110641479492, "logps/chosen": -298.7447814941406, "logps/rejected": -364.2626647949219, "loss": 0.0916, "rewards/accuracies": 1.0, "rewards/chosen": 0.017295408993959427, "rewards/margins": 4.2460737228393555, "rewards/rejected": -4.228778839111328, "step": 2250 }, { "epoch": 1.1826268969126112, "grad_norm": 21.38960333109125, "learning_rate": 3.7941211362105453e-07, "logits/chosen": -2.8299174308776855, "logits/rejected": -2.7152066230773926, "logps/chosen": -307.16558837890625, "logps/rejected": -360.36529541015625, "loss": 0.1011, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2989030182361603, "rewards/margins": 4.798842430114746, "rewards/rejected": -4.499939918518066, "step": 2260 }, { "epoch": 1.1878597592883307, "grad_norm": 24.962369035454596, "learning_rate": 3.781071800777017e-07, "logits/chosen": -2.6308321952819824, "logits/rejected": -2.6083009243011475, "logps/chosen": -299.7275390625, "logps/rejected": -337.92413330078125, "loss": 0.105, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05873597413301468, "rewards/margins": 5.657949924468994, "rewards/rejected": -5.599213123321533, "step": 2270 }, { "epoch": 1.1930926216640503, "grad_norm": 30.89580342581049, "learning_rate": 3.767974960179776e-07, "logits/chosen": -2.758455514907837, "logits/rejected": -2.7137277126312256, "logps/chosen": -249.8431854248047, "logps/rejected": -286.2932434082031, "loss": 0.0806, "rewards/accuracies": 1.0, "rewards/chosen": -0.5361717939376831, "rewards/margins": 4.667669773101807, "rewards/rejected": -5.203841686248779, "step": 2280 }, { "epoch": 1.1983254840397697, "grad_norm": 30.283751234272458, "learning_rate": 3.7548311000805605e-07, "logits/chosen": -2.6532771587371826, "logits/rejected": -2.6755683422088623, "logps/chosen": -265.91107177734375, "logps/rejected": -353.2947692871094, "loss": 0.1133, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.3525296151638031, "rewards/margins": 4.7483320236206055, "rewards/rejected": -5.100861549377441, "step": 2290 }, { "epoch": 1.2035583464154893, "grad_norm": 9.859761545067276, "learning_rate": 3.7416407078847015e-07, "logits/chosen": -2.8084030151367188, "logits/rejected": -2.7766425609588623, "logps/chosen": -293.06781005859375, "logps/rejected": -352.6015319824219, "loss": 0.1005, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.38035669922828674, "rewards/margins": 4.947328567504883, "rewards/rejected": -5.327685356140137, "step": 2300 }, { "epoch": 1.2087912087912087, "grad_norm": 18.03917985289818, "learning_rate": 3.7284042727230506e-07, "logits/chosen": -2.787038803100586, "logits/rejected": -2.644801616668701, "logps/chosen": -216.9444122314453, "logps/rejected": -270.38037109375, "loss": 0.1084, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.41174182295799255, "rewards/margins": 5.080317497253418, "rewards/rejected": -5.492059230804443, "step": 2310 }, { "epoch": 1.2140240711669283, "grad_norm": 23.654847155769655, "learning_rate": 3.7151222854338413e-07, "logits/chosen": -2.8254880905151367, "logits/rejected": -2.6135454177856445, "logps/chosen": -311.6871032714844, "logps/rejected": -318.1632080078125, "loss": 0.1159, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1879824846982956, "rewards/margins": 5.800552845001221, "rewards/rejected": -5.612570762634277, "step": 2320 }, { "epoch": 1.2192569335426477, "grad_norm": 35.04893748896877, "learning_rate": 3.701795238544488e-07, "logits/chosen": -2.763002395629883, "logits/rejected": -2.6701645851135254, "logps/chosen": -298.36529541015625, "logps/rejected": -325.75146484375, "loss": 0.1037, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.038963984698057175, "rewards/margins": 5.101754665374756, "rewards/rejected": -5.062790870666504, "step": 2330 }, { "epoch": 1.2244897959183674, "grad_norm": 7.125936445736221, "learning_rate": 3.688423626253318e-07, "logits/chosen": -2.615354299545288, "logits/rejected": -2.674558401107788, "logps/chosen": -211.46652221679688, "logps/rejected": -271.6877136230469, "loss": 0.1071, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3125256896018982, "rewards/margins": 4.879128456115723, "rewards/rejected": -5.191654682159424, "step": 2340 }, { "epoch": 1.2297226582940868, "grad_norm": 21.624796429967716, "learning_rate": 3.675007944411253e-07, "logits/chosen": -2.78330659866333, "logits/rejected": -2.6863913536071777, "logps/chosen": -288.99591064453125, "logps/rejected": -279.0249938964844, "loss": 0.1455, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3831740617752075, "rewards/margins": 4.454309940338135, "rewards/rejected": -4.071135520935059, "step": 2350 }, { "epoch": 1.2349555206698064, "grad_norm": 27.255995328708035, "learning_rate": 3.6615486905034167e-07, "logits/chosen": -2.7918009757995605, "logits/rejected": -2.698138475418091, "logps/chosen": -304.58099365234375, "logps/rejected": -295.0577392578125, "loss": 0.104, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02554398775100708, "rewards/margins": 4.034098148345947, "rewards/rejected": -4.008553981781006, "step": 2360 }, { "epoch": 1.2401883830455258, "grad_norm": 61.76929752931291, "learning_rate": 3.6480463636306846e-07, "logits/chosen": -2.792266368865967, "logits/rejected": -2.713168144226074, "logps/chosen": -319.86151123046875, "logps/rejected": -338.32769775390625, "loss": 0.1403, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04835640266537666, "rewards/margins": 4.156170845031738, "rewards/rejected": -4.204527854919434, "step": 2370 }, { "epoch": 1.2454212454212454, "grad_norm": 15.002806398318375, "learning_rate": 3.634501464491183e-07, "logits/chosen": -2.7825894355773926, "logits/rejected": -2.7090325355529785, "logps/chosen": -247.15719604492188, "logps/rejected": -306.4307556152344, "loss": 0.088, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2551979422569275, "rewards/margins": 4.776516437530518, "rewards/rejected": -4.5213189125061035, "step": 2380 }, { "epoch": 1.250654107796965, "grad_norm": 13.601330406207861, "learning_rate": 3.6209144953617175e-07, "logits/chosen": -2.5528080463409424, "logits/rejected": -2.5722885131835938, "logps/chosen": -350.04766845703125, "logps/rejected": -439.79510498046875, "loss": 0.0795, "rewards/accuracies": 1.0, "rewards/chosen": 0.187852144241333, "rewards/margins": 5.684319496154785, "rewards/rejected": -5.496466636657715, "step": 2390 }, { "epoch": 1.2558869701726845, "grad_norm": 31.574093609154257, "learning_rate": 3.607285960079146e-07, "logits/chosen": -2.7976462841033936, "logits/rejected": -2.683702230453491, "logps/chosen": -323.38592529296875, "logps/rejected": -345.6888122558594, "loss": 0.1042, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.437208890914917, "rewards/margins": 5.890275001525879, "rewards/rejected": -5.453065395355225, "step": 2400 }, { "epoch": 1.2611198325484039, "grad_norm": 36.854098923875576, "learning_rate": 3.593616364021701e-07, "logits/chosen": -2.840149402618408, "logits/rejected": -2.7021851539611816, "logps/chosen": -299.9290771484375, "logps/rejected": -336.386962890625, "loss": 0.1085, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.1426982879638672, "rewards/margins": 4.883759021759033, "rewards/rejected": -5.0264573097229, "step": 2410 }, { "epoch": 1.2663526949241235, "grad_norm": 39.2447615051275, "learning_rate": 3.5799062140902413e-07, "logits/chosen": -2.70076322555542, "logits/rejected": -2.5660500526428223, "logps/chosen": -319.2273254394531, "logps/rejected": -310.43475341796875, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 0.06131463497877121, "rewards/margins": 5.338719367980957, "rewards/rejected": -5.277405261993408, "step": 2420 }, { "epoch": 1.2715855572998431, "grad_norm": 25.822343644852324, "learning_rate": 3.566156018689462e-07, "logits/chosen": -2.757275342941284, "logits/rejected": -2.4784798622131348, "logps/chosen": -279.9647521972656, "logps/rejected": -255.85922241210938, "loss": 0.1481, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0078494548797607, "rewards/margins": 4.228066444396973, "rewards/rejected": -5.2359161376953125, "step": 2430 }, { "epoch": 1.2768184196755625, "grad_norm": 31.654947438021473, "learning_rate": 3.552366287709038e-07, "logits/chosen": -2.631016254425049, "logits/rejected": -2.719831943511963, "logps/chosen": -314.0375061035156, "logps/rejected": -351.51788330078125, "loss": 0.0966, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.245171457529068, "rewards/margins": 6.115281105041504, "rewards/rejected": -6.360452651977539, "step": 2440 }, { "epoch": 1.282051282051282, "grad_norm": 44.6834650269034, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -2.6979851722717285, "logits/rejected": -2.687857151031494, "logps/chosen": -265.47509765625, "logps/rejected": -299.4295654296875, "loss": 0.0971, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.41958731412887573, "rewards/margins": 4.668161392211914, "rewards/rejected": -5.087749481201172, "step": 2450 }, { "epoch": 1.2872841444270016, "grad_norm": 16.30151054829352, "learning_rate": 3.524670265879353e-07, "logits/chosen": -2.729936122894287, "logits/rejected": -2.6401467323303223, "logps/chosen": -230.04672241210938, "logps/rejected": -255.76913452148438, "loss": 0.1051, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.09567772597074509, "rewards/margins": 4.742767810821533, "rewards/rejected": -4.83844518661499, "step": 2460 }, { "epoch": 1.2925170068027212, "grad_norm": 11.429201947078635, "learning_rate": 3.510765002063901e-07, "logits/chosen": -2.7029623985290527, "logits/rejected": -2.683657646179199, "logps/chosen": -256.6478576660156, "logps/rejected": -329.90338134765625, "loss": 0.0976, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.30263346433639526, "rewards/margins": 5.271126747131348, "rewards/rejected": -5.573760032653809, "step": 2470 }, { "epoch": 1.2977498691784406, "grad_norm": 12.885983344976001, "learning_rate": 3.4968222566983367e-07, "logits/chosen": -2.8190150260925293, "logits/rejected": -2.648803234100342, "logps/chosen": -260.2120666503906, "logps/rejected": -254.6842041015625, "loss": 0.1215, "rewards/accuracies": 0.875, "rewards/chosen": -0.6284938454627991, "rewards/margins": 4.271325588226318, "rewards/rejected": -4.899819374084473, "step": 2480 }, { "epoch": 1.30298273155416, "grad_norm": 22.628591752205843, "learning_rate": 3.482842546812543e-07, "logits/chosen": -2.772792339324951, "logits/rejected": -2.627633571624756, "logps/chosen": -348.0332336425781, "logps/rejected": -337.7981262207031, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": -0.25340384244918823, "rewards/margins": 5.028731346130371, "rewards/rejected": -5.282134056091309, "step": 2490 }, { "epoch": 1.3082155939298796, "grad_norm": 9.442159742249636, "learning_rate": 3.4688263908071307e-07, "logits/chosen": -2.669041156768799, "logits/rejected": -2.577634334564209, "logps/chosen": -236.87039184570312, "logps/rejected": -274.69769287109375, "loss": 0.1074, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6515072584152222, "rewards/margins": 4.638780117034912, "rewards/rejected": -5.290286540985107, "step": 2500 }, { "epoch": 1.3134484563055993, "grad_norm": 21.334255686437192, "learning_rate": 3.454774308434222e-07, "logits/chosen": -2.718568801879883, "logits/rejected": -2.6813645362854004, "logps/chosen": -252.12698364257812, "logps/rejected": -354.6980285644531, "loss": 0.1008, "rewards/accuracies": 1.0, "rewards/chosen": -0.1903163492679596, "rewards/margins": 5.55415153503418, "rewards/rejected": -5.7444682121276855, "step": 2510 }, { "epoch": 1.3186813186813187, "grad_norm": 23.562620607422794, "learning_rate": 3.4406868207781725e-07, "logits/chosen": -2.7359769344329834, "logits/rejected": -2.6525063514709473, "logps/chosen": -250.84408569335938, "logps/rejected": -237.7877655029297, "loss": 0.1239, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.18924112617969513, "rewards/margins": 4.812147617340088, "rewards/rejected": -5.001389026641846, "step": 2520 }, { "epoch": 1.323914181057038, "grad_norm": 34.75893074799262, "learning_rate": 3.426564450236249e-07, "logits/chosen": -2.726229429244995, "logits/rejected": -2.536306142807007, "logps/chosen": -262.1828918457031, "logps/rejected": -262.5714416503906, "loss": 0.1047, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.25592824816703796, "rewards/margins": 4.934244155883789, "rewards/rejected": -5.1901726722717285, "step": 2530 }, { "epoch": 1.3291470434327577, "grad_norm": 41.053053847742135, "learning_rate": 3.4124077204992576e-07, "logits/chosen": -2.5745468139648438, "logits/rejected": -2.548739194869995, "logps/chosen": -198.30911254882812, "logps/rejected": -282.16412353515625, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": 0.39724957942962646, "rewards/margins": 5.94715690612793, "rewards/rejected": -5.549907684326172, "step": 2540 }, { "epoch": 1.3343799058084773, "grad_norm": 16.140284709994052, "learning_rate": 3.398217156532125e-07, "logits/chosen": -2.7923262119293213, "logits/rejected": -2.654839038848877, "logps/chosen": -292.51959228515625, "logps/rejected": -311.15374755859375, "loss": 0.081, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.07992169260978699, "rewards/margins": 5.313913822174072, "rewards/rejected": -5.393835067749023, "step": 2550 }, { "epoch": 1.3396127681841967, "grad_norm": 21.152699853059136, "learning_rate": 3.383993284554431e-07, "logits/chosen": -2.7588140964508057, "logits/rejected": -2.6776671409606934, "logps/chosen": -268.39166259765625, "logps/rejected": -297.9541931152344, "loss": 0.0903, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.17908921837806702, "rewards/margins": 5.116482734680176, "rewards/rejected": -5.295571327209473, "step": 2560 }, { "epoch": 1.3448456305599163, "grad_norm": 31.131582166759053, "learning_rate": 3.3697366320208955e-07, "logits/chosen": -2.6680309772491455, "logits/rejected": -2.5980541706085205, "logps/chosen": -304.59014892578125, "logps/rejected": -317.4519958496094, "loss": 0.0835, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.25217151641845703, "rewards/margins": 4.94983434677124, "rewards/rejected": -5.202005863189697, "step": 2570 }, { "epoch": 1.3500784929356358, "grad_norm": 23.014778992969436, "learning_rate": 3.355447727601816e-07, "logits/chosen": -2.6754276752471924, "logits/rejected": -2.528729200363159, "logps/chosen": -262.725341796875, "logps/rejected": -312.8125305175781, "loss": 0.1066, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5141451358795166, "rewards/margins": 5.295892238616943, "rewards/rejected": -5.810038089752197, "step": 2580 }, { "epoch": 1.3553113553113554, "grad_norm": 32.05773769051717, "learning_rate": 3.3411271011634697e-07, "logits/chosen": -2.6619973182678223, "logits/rejected": -2.695253610610962, "logps/chosen": -320.0899658203125, "logps/rejected": -372.51043701171875, "loss": 0.1251, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9629742503166199, "rewards/margins": 4.419332027435303, "rewards/rejected": -5.382306098937988, "step": 2590 }, { "epoch": 1.3605442176870748, "grad_norm": 25.542660245310532, "learning_rate": 3.3267752837484587e-07, "logits/chosen": -2.617685556411743, "logits/rejected": -2.566694498062134, "logps/chosen": -241.1839599609375, "logps/rejected": -275.4180908203125, "loss": 0.1188, "rewards/accuracies": 1.0, "rewards/chosen": -0.5535500645637512, "rewards/margins": 4.251681327819824, "rewards/rejected": -4.805230140686035, "step": 2600 }, { "epoch": 1.3657770800627944, "grad_norm": 43.00702974824487, "learning_rate": 3.31239280755602e-07, "logits/chosen": -2.699061870574951, "logits/rejected": -2.583082675933838, "logps/chosen": -310.5912170410156, "logps/rejected": -307.32171630859375, "loss": 0.1019, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.540542483329773, "rewards/margins": 4.341391086578369, "rewards/rejected": -4.881933689117432, "step": 2610 }, { "epoch": 1.3710099424385138, "grad_norm": 18.9420177678843, "learning_rate": 3.2979802059222936e-07, "logits/chosen": -2.711057662963867, "logits/rejected": -2.570793867111206, "logps/chosen": -295.284423828125, "logps/rejected": -278.60174560546875, "loss": 0.1096, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2152744084596634, "rewards/margins": 4.452646732330322, "rewards/rejected": -4.6679205894470215, "step": 2620 }, { "epoch": 1.3762428048142334, "grad_norm": 30.053089697599695, "learning_rate": 3.283538013300537e-07, "logits/chosen": -2.5685324668884277, "logits/rejected": -2.5950772762298584, "logps/chosen": -223.4586944580078, "logps/rejected": -321.4066467285156, "loss": 0.0852, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5875605344772339, "rewards/margins": 4.971282005310059, "rewards/rejected": -5.558842658996582, "step": 2630 }, { "epoch": 1.3814756671899528, "grad_norm": 17.373110941185526, "learning_rate": 3.269066765241314e-07, "logits/chosen": -2.748260974884033, "logits/rejected": -2.680387496948242, "logps/chosen": -284.13665771484375, "logps/rejected": -295.009033203125, "loss": 0.092, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7688173055648804, "rewards/margins": 4.525506019592285, "rewards/rejected": -5.294323921203613, "step": 2640 }, { "epoch": 1.3867085295656725, "grad_norm": 41.146953621312406, "learning_rate": 3.254566998372634e-07, "logits/chosen": -2.5892138481140137, "logits/rejected": -2.644347667694092, "logps/chosen": -216.08114624023438, "logps/rejected": -313.13763427734375, "loss": 0.1371, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8023455739021301, "rewards/margins": 5.653847694396973, "rewards/rejected": -6.456193447113037, "step": 2650 }, { "epoch": 1.3919413919413919, "grad_norm": 25.812159427086776, "learning_rate": 3.2400392503800477e-07, "logits/chosen": -2.6864781379699707, "logits/rejected": -2.681652069091797, "logps/chosen": -313.7518005371094, "logps/rejected": -412.9150390625, "loss": 0.1002, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4538533091545105, "rewards/margins": 4.896275520324707, "rewards/rejected": -5.350129127502441, "step": 2660 }, { "epoch": 1.3971742543171115, "grad_norm": 11.560347498002699, "learning_rate": 3.225484059986715e-07, "logits/chosen": -2.7019784450531006, "logits/rejected": -2.574402093887329, "logps/chosen": -263.1709899902344, "logps/rejected": -301.37896728515625, "loss": 0.0824, "rewards/accuracies": 1.0, "rewards/chosen": -0.5435574054718018, "rewards/margins": 4.627568244934082, "rewards/rejected": -5.1711249351501465, "step": 2670 }, { "epoch": 1.402407116692831, "grad_norm": 36.77469677124266, "learning_rate": 3.2109019669334215e-07, "logits/chosen": -2.6268675327301025, "logits/rejected": -2.5382723808288574, "logps/chosen": -345.9691467285156, "logps/rejected": -365.1632080078125, "loss": 0.1124, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5941765904426575, "rewards/margins": 5.553042411804199, "rewards/rejected": -6.147218704223633, "step": 2680 }, { "epoch": 1.4076399790685505, "grad_norm": 20.51859627224848, "learning_rate": 3.19629351195857e-07, "logits/chosen": -2.6670846939086914, "logits/rejected": -2.5615813732147217, "logps/chosen": -259.65057373046875, "logps/rejected": -329.07183837890625, "loss": 0.1092, "rewards/accuracies": 1.0, "rewards/chosen": -0.14442971348762512, "rewards/margins": 5.420745372772217, "rewards/rejected": -5.565175533294678, "step": 2690 }, { "epoch": 1.41287284144427, "grad_norm": 22.506796085146252, "learning_rate": 3.1816592367781236e-07, "logits/chosen": -2.629802942276001, "logits/rejected": -2.4548943042755127, "logps/chosen": -325.84893798828125, "logps/rejected": -310.5500183105469, "loss": 0.0889, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2448773384094238, "rewards/margins": 4.603526592254639, "rewards/rejected": -5.848404407501221, "step": 2700 }, { "epoch": 1.4181057038199896, "grad_norm": 30.24510148007379, "learning_rate": 3.166999684065521e-07, "logits/chosen": -2.6534583568573, "logits/rejected": -2.5290088653564453, "logps/chosen": -265.690185546875, "logps/rejected": -276.3821716308594, "loss": 0.1173, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9110490679740906, "rewards/margins": 4.266907691955566, "rewards/rejected": -5.177957057952881, "step": 2710 }, { "epoch": 1.423338566195709, "grad_norm": 14.93135102150587, "learning_rate": 3.1523153974315497e-07, "logits/chosen": -2.67543625831604, "logits/rejected": -2.6198840141296387, "logps/chosen": -276.8697814941406, "logps/rejected": -302.6800842285156, "loss": 0.1174, "rewards/accuracies": 0.875, "rewards/chosen": -0.37720829248428345, "rewards/margins": 5.005181312561035, "rewards/rejected": -5.382389545440674, "step": 2720 }, { "epoch": 1.4285714285714286, "grad_norm": 63.23687917036645, "learning_rate": 3.137606921404191e-07, "logits/chosen": -2.613619327545166, "logits/rejected": -2.511970043182373, "logps/chosen": -288.4800109863281, "logps/rejected": -270.8289489746094, "loss": 0.1467, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8941848874092102, "rewards/margins": 3.921539783477783, "rewards/rejected": -4.815724849700928, "step": 2730 }, { "epoch": 1.433804290947148, "grad_norm": 41.96001880791081, "learning_rate": 3.1228748014084243e-07, "logits/chosen": -2.45249342918396, "logits/rejected": -2.4123005867004395, "logps/chosen": -288.26263427734375, "logps/rejected": -298.2550354003906, "loss": 0.1287, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5363945960998535, "rewards/margins": 4.373542785644531, "rewards/rejected": -4.909936904907227, "step": 2740 }, { "epoch": 1.4390371533228676, "grad_norm": 11.588371849208805, "learning_rate": 3.108119583746005e-07, "logits/chosen": -2.555427312850952, "logits/rejected": -2.5331249237060547, "logps/chosen": -234.6470947265625, "logps/rejected": -295.04730224609375, "loss": 0.1127, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.062489427626132965, "rewards/margins": 5.1241350173950195, "rewards/rejected": -5.061646461486816, "step": 2750 }, { "epoch": 1.4442700156985873, "grad_norm": 28.25560385042178, "learning_rate": 3.093341815575202e-07, "logits/chosen": -2.606266498565674, "logits/rejected": -2.4748575687408447, "logps/chosen": -277.20196533203125, "logps/rejected": -246.5891571044922, "loss": 0.089, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3125464916229248, "rewards/margins": 4.526204586029053, "rewards/rejected": -4.838751792907715, "step": 2760 }, { "epoch": 1.4495028780743067, "grad_norm": 13.776303617966635, "learning_rate": 3.078542044890513e-07, "logits/chosen": -2.6697518825531006, "logits/rejected": -2.4930496215820312, "logps/chosen": -332.73040771484375, "logps/rejected": -347.8982238769531, "loss": 0.1193, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4222649037837982, "rewards/margins": 5.524443626403809, "rewards/rejected": -5.946708679199219, "step": 2770 }, { "epoch": 1.454735740450026, "grad_norm": 19.363326441674072, "learning_rate": 3.0637208205023386e-07, "logits/chosen": -2.771562099456787, "logits/rejected": -2.588222026824951, "logps/chosen": -310.1404724121094, "logps/rejected": -287.75677490234375, "loss": 0.1085, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3928970396518707, "rewards/margins": 4.459607124328613, "rewards/rejected": -4.852504730224609, "step": 2780 }, { "epoch": 1.4599686028257457, "grad_norm": 15.266068407971215, "learning_rate": 3.0488786920166343e-07, "logits/chosen": -2.6312241554260254, "logits/rejected": -2.6789093017578125, "logps/chosen": -300.2938232421875, "logps/rejected": -374.1767883300781, "loss": 0.0948, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.1141195073723793, "rewards/margins": 5.68836784362793, "rewards/rejected": -5.574248313903809, "step": 2790 }, { "epoch": 1.4652014652014653, "grad_norm": 25.032616813940045, "learning_rate": 3.034016209814529e-07, "logits/chosen": -2.6470537185668945, "logits/rejected": -2.5894312858581543, "logps/chosen": -265.81683349609375, "logps/rejected": -308.41424560546875, "loss": 0.1024, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5616492033004761, "rewards/margins": 4.933659076690674, "rewards/rejected": -5.495308876037598, "step": 2800 }, { "epoch": 1.4704343275771847, "grad_norm": 17.419430600072115, "learning_rate": 3.0191339250319147e-07, "logits/chosen": -2.668083906173706, "logits/rejected": -2.698418378829956, "logps/chosen": -287.32818603515625, "logps/rejected": -362.6585388183594, "loss": 0.0869, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.04601895809173584, "rewards/margins": 5.480045795440674, "rewards/rejected": -5.526064872741699, "step": 2810 }, { "epoch": 1.4756671899529041, "grad_norm": 17.132397160309466, "learning_rate": 3.004232389539011e-07, "logits/chosen": -2.7672345638275146, "logits/rejected": -2.6934361457824707, "logps/chosen": -272.5774841308594, "logps/rejected": -327.59991455078125, "loss": 0.0906, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.19477109611034393, "rewards/margins": 6.1975226402282715, "rewards/rejected": -6.392293930053711, "step": 2820 }, { "epoch": 1.4809000523286238, "grad_norm": 6.197135471084547, "learning_rate": 2.989312155919898e-07, "logits/chosen": -2.6563096046447754, "logits/rejected": -2.5722525119781494, "logps/chosen": -264.55035400390625, "logps/rejected": -326.4817199707031, "loss": 0.1031, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.24614736437797546, "rewards/margins": 4.663209915161133, "rewards/rejected": -4.909357070922852, "step": 2830 }, { "epoch": 1.4861329147043434, "grad_norm": 23.032654080389246, "learning_rate": 2.9743737774520266e-07, "logits/chosen": -2.6749980449676514, "logits/rejected": -2.6537280082702637, "logps/chosen": -272.3828430175781, "logps/rejected": -325.3900451660156, "loss": 0.1257, "rewards/accuracies": 1.0, "rewards/chosen": 0.11704929172992706, "rewards/margins": 6.074965953826904, "rewards/rejected": -5.957917213439941, "step": 2840 }, { "epoch": 1.4913657770800628, "grad_norm": 21.090303903647936, "learning_rate": 2.959417808085702e-07, "logits/chosen": -2.61649751663208, "logits/rejected": -2.6308839321136475, "logps/chosen": -227.5465850830078, "logps/rejected": -272.5039367675781, "loss": 0.0964, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8368734121322632, "rewards/margins": 4.769274711608887, "rewards/rejected": -5.6061482429504395, "step": 2850 }, { "epoch": 1.4965986394557822, "grad_norm": 29.6313494223259, "learning_rate": 2.944444802423542e-07, "logits/chosen": -2.805844306945801, "logits/rejected": -2.721168041229248, "logps/chosen": -317.97711181640625, "logps/rejected": -380.91412353515625, "loss": 0.0985, "rewards/accuracies": 1.0, "rewards/chosen": -0.21231107413768768, "rewards/margins": 5.917481899261475, "rewards/rejected": -6.129792213439941, "step": 2860 }, { "epoch": 1.5018315018315018, "grad_norm": 22.819722944342093, "learning_rate": 2.929455315699908e-07, "logits/chosen": -2.660499334335327, "logits/rejected": -2.4537293910980225, "logps/chosen": -310.8629455566406, "logps/rejected": -357.47894287109375, "loss": 0.1059, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.07246246188879013, "rewards/margins": 6.156960964202881, "rewards/rejected": -6.229423522949219, "step": 2870 }, { "epoch": 1.5070643642072215, "grad_norm": 15.35991384284126, "learning_rate": 2.9144499037603204e-07, "logits/chosen": -2.739063262939453, "logits/rejected": -2.6308000087738037, "logps/chosen": -257.1328430175781, "logps/rejected": -290.7419128417969, "loss": 0.1176, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5068913698196411, "rewards/margins": 5.11258602142334, "rewards/rejected": -5.61947774887085, "step": 2880 }, { "epoch": 1.5122972265829409, "grad_norm": 17.13609231258807, "learning_rate": 2.899429123040843e-07, "logits/chosen": -2.7439727783203125, "logits/rejected": -2.716459274291992, "logps/chosen": -262.11224365234375, "logps/rejected": -316.2181091308594, "loss": 0.0886, "rewards/accuracies": 1.0, "rewards/chosen": -0.5449376702308655, "rewards/margins": 4.632278919219971, "rewards/rejected": -5.177216529846191, "step": 2890 }, { "epoch": 1.5175300889586603, "grad_norm": 42.3086232496891, "learning_rate": 2.884393530547452e-07, "logits/chosen": -2.8514835834503174, "logits/rejected": -2.7157387733459473, "logps/chosen": -298.56719970703125, "logps/rejected": -331.87890625, "loss": 0.1074, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.17806626856327057, "rewards/margins": 4.861222267150879, "rewards/rejected": -4.6831560134887695, "step": 2900 }, { "epoch": 1.5227629513343799, "grad_norm": 23.22923804226432, "learning_rate": 2.869343683835376e-07, "logits/chosen": -2.709092140197754, "logits/rejected": -2.59346342086792, "logps/chosen": -239.79421997070312, "logps/rejected": -342.177978515625, "loss": 0.09, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3563653826713562, "rewards/margins": 5.73477840423584, "rewards/rejected": -6.09114408493042, "step": 2910 }, { "epoch": 1.5279958137100995, "grad_norm": 6.575306568756043, "learning_rate": 2.8542801409884253e-07, "logits/chosen": -2.766979932785034, "logits/rejected": -2.666149139404297, "logps/chosen": -330.9672546386719, "logps/rejected": -371.9808349609375, "loss": 0.0779, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5268090963363647, "rewards/margins": 4.619814872741699, "rewards/rejected": -5.1466240882873535, "step": 2920 }, { "epoch": 1.533228676085819, "grad_norm": 29.973424802649863, "learning_rate": 2.839203460598297e-07, "logits/chosen": -2.7906851768493652, "logits/rejected": -2.756540298461914, "logps/chosen": -349.83331298828125, "logps/rejected": -379.7663269042969, "loss": 0.1136, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.3085012137889862, "rewards/margins": 5.198960304260254, "rewards/rejected": -5.5074615478515625, "step": 2930 }, { "epoch": 1.5384615384615383, "grad_norm": 11.458737713264366, "learning_rate": 2.8241142017438557e-07, "logits/chosen": -2.7677807807922363, "logits/rejected": -2.7230429649353027, "logps/chosen": -320.4291687011719, "logps/rejected": -336.20355224609375, "loss": 0.0997, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.15537458658218384, "rewards/margins": 5.858824253082275, "rewards/rejected": -5.7034502029418945, "step": 2940 }, { "epoch": 1.543694400837258, "grad_norm": 22.881053961998443, "learning_rate": 2.8090129239704083e-07, "logits/chosen": -2.7229790687561035, "logits/rejected": -2.5809950828552246, "logps/chosen": -307.8843078613281, "logps/rejected": -268.83984375, "loss": 0.1282, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.1330840587615967, "rewards/margins": 4.647091388702393, "rewards/rejected": -5.780175685882568, "step": 2950 }, { "epoch": 1.5489272632129776, "grad_norm": 14.391989006889863, "learning_rate": 2.7939001872689496e-07, "logits/chosen": -2.6156535148620605, "logits/rejected": -2.547813653945923, "logps/chosen": -214.3231658935547, "logps/rejected": -244.09432983398438, "loss": 0.1027, "rewards/accuracies": 0.875, "rewards/chosen": -0.9809154272079468, "rewards/margins": 4.217734336853027, "rewards/rejected": -5.198649883270264, "step": 2960 }, { "epoch": 1.554160125588697, "grad_norm": 47.42639642849719, "learning_rate": 2.778776552055398e-07, "logits/chosen": -2.628086566925049, "logits/rejected": -2.4464826583862305, "logps/chosen": -306.44354248046875, "logps/rejected": -310.10003662109375, "loss": 0.0903, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7761075496673584, "rewards/margins": 4.894813537597656, "rewards/rejected": -5.6709208488464355, "step": 2970 }, { "epoch": 1.5593929879644164, "grad_norm": 26.12349026205923, "learning_rate": 2.763642579149817e-07, "logits/chosen": -2.5449676513671875, "logits/rejected": -2.521742343902588, "logps/chosen": -250.126953125, "logps/rejected": -307.7526550292969, "loss": 0.1035, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5303369760513306, "rewards/margins": 4.585556983947754, "rewards/rejected": -5.115893840789795, "step": 2980 }, { "epoch": 1.564625850340136, "grad_norm": 55.23272129571936, "learning_rate": 2.748498829755615e-07, "logits/chosen": -2.6269307136535645, "logits/rejected": -2.5769810676574707, "logps/chosen": -268.09967041015625, "logps/rejected": -364.55712890625, "loss": 0.0956, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.2840079367160797, "rewards/margins": 5.313620090484619, "rewards/rejected": -5.597627639770508, "step": 2990 }, { "epoch": 1.5698587127158556, "grad_norm": 16.075641960102608, "learning_rate": 2.7333458654387344e-07, "logits/chosen": -2.7007858753204346, "logits/rejected": -2.648383855819702, "logps/chosen": -303.09234619140625, "logps/rejected": -318.8572082519531, "loss": 0.0843, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.17780855298042297, "rewards/margins": 5.029606819152832, "rewards/rejected": -5.207415580749512, "step": 3000 }, { "epoch": 1.575091575091575, "grad_norm": 23.826973600299446, "learning_rate": 2.718184248106828e-07, "logits/chosen": -2.797697067260742, "logits/rejected": -2.664506435394287, "logps/chosen": -342.29132080078125, "logps/rejected": -380.3262023925781, "loss": 0.0953, "rewards/accuracies": 1.0, "rewards/chosen": -0.23088698089122772, "rewards/margins": 5.9646759033203125, "rewards/rejected": -6.195563316345215, "step": 3010 }, { "epoch": 1.5803244374672945, "grad_norm": 23.5189098138642, "learning_rate": 2.7030145399884275e-07, "logits/chosen": -2.7051501274108887, "logits/rejected": -2.552473783493042, "logps/chosen": -363.2586364746094, "logps/rejected": -355.2478942871094, "loss": 0.1077, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.46280232071876526, "rewards/margins": 4.690756320953369, "rewards/rejected": -5.153558254241943, "step": 3020 }, { "epoch": 1.585557299843014, "grad_norm": 19.565421791870527, "learning_rate": 2.687837303612085e-07, "logits/chosen": -2.8115456104278564, "logits/rejected": -2.6637587547302246, "logps/chosen": -344.15252685546875, "logps/rejected": -364.0646057128906, "loss": 0.1045, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.13493283092975616, "rewards/margins": 5.432315349578857, "rewards/rejected": -5.567248344421387, "step": 3030 }, { "epoch": 1.5907901622187337, "grad_norm": 21.638865724708594, "learning_rate": 2.672653101785519e-07, "logits/chosen": -2.5982906818389893, "logits/rejected": -2.5806329250335693, "logps/chosen": -305.8501892089844, "logps/rejected": -348.97747802734375, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": -0.1506006270647049, "rewards/margins": 5.457698345184326, "rewards/rejected": -5.6082987785339355, "step": 3040 }, { "epoch": 1.5960230245944533, "grad_norm": 21.673302232000832, "learning_rate": 2.657462497574747e-07, "logits/chosen": -2.7238268852233887, "logits/rejected": -2.709031581878662, "logps/chosen": -238.9906005859375, "logps/rejected": -276.9991760253906, "loss": 0.0811, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5081533193588257, "rewards/margins": 4.087741374969482, "rewards/rejected": -4.595894813537598, "step": 3050 }, { "epoch": 1.6012558869701727, "grad_norm": 5.575995707313206, "learning_rate": 2.642266054283198e-07, "logits/chosen": -2.7808871269226074, "logits/rejected": -2.5478625297546387, "logps/chosen": -366.7637023925781, "logps/rejected": -281.3974914550781, "loss": 0.0798, "rewards/accuracies": 1.0, "rewards/chosen": -0.06902514398097992, "rewards/margins": 5.459024429321289, "rewards/rejected": -5.528049468994141, "step": 3060 }, { "epoch": 1.6064887493458921, "grad_norm": 24.531441519428913, "learning_rate": 2.627064335430829e-07, "logits/chosen": -2.713057279586792, "logits/rejected": -2.571733236312866, "logps/chosen": -321.02972412109375, "logps/rejected": -332.35870361328125, "loss": 0.0769, "rewards/accuracies": 1.0, "rewards/chosen": -0.3989798128604889, "rewards/margins": 5.438555717468262, "rewards/rejected": -5.837535858154297, "step": 3070 }, { "epoch": 1.6117216117216118, "grad_norm": 32.83788593503646, "learning_rate": 2.611857904733227e-07, "logits/chosen": -2.697803497314453, "logits/rejected": -2.5155491828918457, "logps/chosen": -309.11212158203125, "logps/rejected": -297.1321716308594, "loss": 0.0916, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6532170176506042, "rewards/margins": 5.0370306968688965, "rewards/rejected": -5.690248012542725, "step": 3080 }, { "epoch": 1.6169544740973314, "grad_norm": 35.85854683798888, "learning_rate": 2.5966473260807076e-07, "logits/chosen": -2.7497193813323975, "logits/rejected": -2.6400251388549805, "logps/chosen": -353.42706298828125, "logps/rejected": -394.00653076171875, "loss": 0.0988, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.09235533326864243, "rewards/margins": 6.314970970153809, "rewards/rejected": -6.407326698303223, "step": 3090 }, { "epoch": 1.6221873364730508, "grad_norm": 24.863574110452976, "learning_rate": 2.5814331635173987e-07, "logits/chosen": -2.6918444633483887, "logits/rejected": -2.622985601425171, "logps/chosen": -312.4832763671875, "logps/rejected": -347.6034240722656, "loss": 0.1311, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5141621828079224, "rewards/margins": 4.177474498748779, "rewards/rejected": -4.6916375160217285, "step": 3100 }, { "epoch": 1.6274201988487702, "grad_norm": 43.07833472677117, "learning_rate": 2.566215981220331e-07, "logits/chosen": -2.6178548336029053, "logits/rejected": -2.5270726680755615, "logps/chosen": -311.21502685546875, "logps/rejected": -358.38018798828125, "loss": 0.105, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.01169753074646, "rewards/margins": 4.856429100036621, "rewards/rejected": -5.868125915527344, "step": 3110 }, { "epoch": 1.6326530612244898, "grad_norm": 26.45332527831414, "learning_rate": 2.550996343478514e-07, "logits/chosen": -2.633014440536499, "logits/rejected": -2.5973496437072754, "logps/chosen": -304.20880126953125, "logps/rejected": -340.53369140625, "loss": 0.1131, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.1011536568403244, "rewards/margins": 5.776803016662598, "rewards/rejected": -5.877956390380859, "step": 3120 }, { "epoch": 1.6378859236002095, "grad_norm": 10.618056558670146, "learning_rate": 2.5357748146720076e-07, "logits/chosen": -2.6644821166992188, "logits/rejected": -2.5023844242095947, "logps/chosen": -209.9169158935547, "logps/rejected": -265.97967529296875, "loss": 0.0813, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.550481915473938, "rewards/margins": 5.262048721313477, "rewards/rejected": -5.812530517578125, "step": 3130 }, { "epoch": 1.6431187859759289, "grad_norm": 28.997064937091526, "learning_rate": 2.5205519592509993e-07, "logits/chosen": -2.6454596519470215, "logits/rejected": -2.534219264984131, "logps/chosen": -272.67926025390625, "logps/rejected": -313.68365478515625, "loss": 0.1097, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5680716633796692, "rewards/margins": 5.149653434753418, "rewards/rejected": -5.717724800109863, "step": 3140 }, { "epoch": 1.6483516483516483, "grad_norm": 29.98175418753011, "learning_rate": 2.505328341714873e-07, "logits/chosen": -2.7715792655944824, "logits/rejected": -2.577012538909912, "logps/chosen": -317.41448974609375, "logps/rejected": -342.84649658203125, "loss": 0.0798, "rewards/accuracies": 1.0, "rewards/chosen": 0.03935631364583969, "rewards/margins": 6.34078311920166, "rewards/rejected": -6.301426410675049, "step": 3150 }, { "epoch": 1.653584510727368, "grad_norm": 30.141923742595296, "learning_rate": 2.4901045265912687e-07, "logits/chosen": -2.7321019172668457, "logits/rejected": -2.657607078552246, "logps/chosen": -314.8836975097656, "logps/rejected": -374.3489685058594, "loss": 0.1033, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.007449048571288586, "rewards/margins": 6.138455390930176, "rewards/rejected": -6.145905017852783, "step": 3160 }, { "epoch": 1.6588173731030875, "grad_norm": 24.24799993269991, "learning_rate": 2.4748810784151555e-07, "logits/chosen": -2.685410976409912, "logits/rejected": -2.6096012592315674, "logps/chosen": -332.5256042480469, "logps/rejected": -311.61358642578125, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": -0.7060562372207642, "rewards/margins": 4.939300537109375, "rewards/rejected": -5.645357131958008, "step": 3170 }, { "epoch": 1.664050235478807, "grad_norm": 38.71590851754405, "learning_rate": 2.459658561707898e-07, "logits/chosen": -2.694617986679077, "logits/rejected": -2.61140513420105, "logps/chosen": -319.1321716308594, "logps/rejected": -358.1539611816406, "loss": 0.0954, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.4491146504878998, "rewards/margins": 4.88021993637085, "rewards/rejected": -5.329334735870361, "step": 3180 }, { "epoch": 1.6692830978545263, "grad_norm": 27.476389343518058, "learning_rate": 2.4444375409563145e-07, "logits/chosen": -2.716930627822876, "logits/rejected": -2.5793745517730713, "logps/chosen": -314.3849182128906, "logps/rejected": -336.908203125, "loss": 0.0794, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6793092489242554, "rewards/margins": 5.401398658752441, "rewards/rejected": -6.080708026885986, "step": 3190 }, { "epoch": 1.674515960230246, "grad_norm": 25.09742034299905, "learning_rate": 2.429218580591753e-07, "logits/chosen": -2.5837604999542236, "logits/rejected": -2.4644665718078613, "logps/chosen": -329.11737060546875, "logps/rejected": -298.93109130859375, "loss": 0.1147, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.27274736762046814, "rewards/margins": 5.840318202972412, "rewards/rejected": -6.113064765930176, "step": 3200 }, { "epoch": 1.6797488226059656, "grad_norm": 48.48521359435761, "learning_rate": 2.414002244969158e-07, "logits/chosen": -2.553994655609131, "logits/rejected": -2.5049333572387695, "logps/chosen": -283.4375, "logps/rejected": -335.94732666015625, "loss": 0.1058, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.013826847076416, "rewards/margins": 6.045845031738281, "rewards/rejected": -7.059671878814697, "step": 3210 }, { "epoch": 1.684981684981685, "grad_norm": 25.83087216367204, "learning_rate": 2.3987890983461403e-07, "logits/chosen": -2.7287845611572266, "logits/rejected": -2.6246705055236816, "logps/chosen": -321.7281799316406, "logps/rejected": -388.2299499511719, "loss": 0.0967, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6154420375823975, "rewards/margins": 5.903476238250732, "rewards/rejected": -6.518918037414551, "step": 3220 }, { "epoch": 1.6902145473574044, "grad_norm": 26.08572202767128, "learning_rate": 2.3835797048620564e-07, "logits/chosen": -2.754236936569214, "logits/rejected": -2.670435905456543, "logps/chosen": -292.9166564941406, "logps/rejected": -303.14361572265625, "loss": 0.1092, "rewards/accuracies": 1.0, "rewards/chosen": -0.6038793325424194, "rewards/margins": 4.955503940582275, "rewards/rejected": -5.559382915496826, "step": 3230 }, { "epoch": 1.695447409733124, "grad_norm": 52.292973177723304, "learning_rate": 2.368374628517088e-07, "logits/chosen": -2.5498924255371094, "logits/rejected": -2.4609293937683105, "logps/chosen": -298.534423828125, "logps/rejected": -317.09130859375, "loss": 0.121, "rewards/accuracies": 1.0, "rewards/chosen": -0.5181295275688171, "rewards/margins": 5.679882049560547, "rewards/rejected": -6.198011875152588, "step": 3240 }, { "epoch": 1.7006802721088436, "grad_norm": 6.4370144608987845, "learning_rate": 2.3531744331513247e-07, "logits/chosen": -2.6444571018218994, "logits/rejected": -2.669032335281372, "logps/chosen": -245.9746856689453, "logps/rejected": -309.67620849609375, "loss": 0.0912, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6865631341934204, "rewards/margins": 5.3827409744262695, "rewards/rejected": -6.0693039894104, "step": 3250 }, { "epoch": 1.705913134484563, "grad_norm": 54.288581382516014, "learning_rate": 2.3379796824238608e-07, "logits/chosen": -2.6092073917388916, "logits/rejected": -2.5859627723693848, "logps/chosen": -235.48385620117188, "logps/rejected": -263.9149169921875, "loss": 0.1273, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.5413236618041992, "rewards/margins": 4.505577564239502, "rewards/rejected": -6.046900749206543, "step": 3260 }, { "epoch": 1.7111459968602825, "grad_norm": 20.066394526978875, "learning_rate": 2.3227909397918894e-07, "logits/chosen": -2.861052989959717, "logits/rejected": -2.7720844745635986, "logps/chosen": -339.01605224609375, "logps/rejected": -389.1539001464844, "loss": 0.0829, "rewards/accuracies": 1.0, "rewards/chosen": -0.003049397375434637, "rewards/margins": 6.6782331466674805, "rewards/rejected": -6.681282997131348, "step": 3270 }, { "epoch": 1.716378859236002, "grad_norm": 36.56452804031963, "learning_rate": 2.3076087684898076e-07, "logits/chosen": -2.6908583641052246, "logits/rejected": -2.5621867179870605, "logps/chosen": -282.6653137207031, "logps/rejected": -338.23291015625, "loss": 0.1145, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5438255667686462, "rewards/margins": 5.641651153564453, "rewards/rejected": -6.185477256774902, "step": 3280 }, { "epoch": 1.7216117216117217, "grad_norm": 58.679378010104585, "learning_rate": 2.2924337315083353e-07, "logits/chosen": -2.7397890090942383, "logits/rejected": -2.5919830799102783, "logps/chosen": -377.1933898925781, "logps/rejected": -388.566162109375, "loss": 0.0779, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.14386649429798126, "rewards/margins": 5.803525447845459, "rewards/rejected": -5.947391986846924, "step": 3290 }, { "epoch": 1.7268445839874411, "grad_norm": 22.7359891911816, "learning_rate": 2.277266391573633e-07, "logits/chosen": -2.6971657276153564, "logits/rejected": -2.660691738128662, "logps/chosen": -338.55120849609375, "logps/rejected": -340.1263732910156, "loss": 0.0776, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.48269566893577576, "rewards/margins": 7.384807586669922, "rewards/rejected": -6.902112007141113, "step": 3300 }, { "epoch": 1.7320774463631605, "grad_norm": 51.56529486380081, "learning_rate": 2.2621073111264357e-07, "logits/chosen": -2.5473222732543945, "logits/rejected": -2.5487756729125977, "logps/chosen": -273.0016784667969, "logps/rejected": -290.00872802734375, "loss": 0.0931, "rewards/accuracies": 1.0, "rewards/chosen": -0.2249443233013153, "rewards/margins": 5.341296672821045, "rewards/rejected": -5.5662407875061035, "step": 3310 }, { "epoch": 1.7373103087388801, "grad_norm": 29.535893549004907, "learning_rate": 2.2469570523011993e-07, "logits/chosen": -2.6070313453674316, "logits/rejected": -2.6149723529815674, "logps/chosen": -270.3302917480469, "logps/rejected": -319.1665344238281, "loss": 0.096, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9645935893058777, "rewards/margins": 4.5530548095703125, "rewards/rejected": -5.517648696899414, "step": 3320 }, { "epoch": 1.7425431711145998, "grad_norm": 4.27376987274256, "learning_rate": 2.2318161769052525e-07, "logits/chosen": -2.659609317779541, "logits/rejected": -2.533578395843506, "logps/chosen": -279.2534484863281, "logps/rejected": -334.467041015625, "loss": 0.1114, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5925628542900085, "rewards/margins": 5.468548774719238, "rewards/rejected": -6.0611114501953125, "step": 3330 }, { "epoch": 1.7477760334903192, "grad_norm": 11.254161444342154, "learning_rate": 2.2166852463979624e-07, "logits/chosen": -2.5818896293640137, "logits/rejected": -2.4699618816375732, "logps/chosen": -266.83355712890625, "logps/rejected": -278.3815002441406, "loss": 0.1153, "rewards/accuracies": 1.0, "rewards/chosen": -0.40772876143455505, "rewards/margins": 5.459908485412598, "rewards/rejected": -5.867638111114502, "step": 3340 }, { "epoch": 1.7530088958660386, "grad_norm": 9.929057072796711, "learning_rate": 2.20156482186992e-07, "logits/chosen": -2.58548903465271, "logits/rejected": -2.5589373111724854, "logps/chosen": -282.1081237792969, "logps/rejected": -333.6109924316406, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": -0.4569730758666992, "rewards/margins": 5.3031792640686035, "rewards/rejected": -5.7601518630981445, "step": 3350 }, { "epoch": 1.7582417582417582, "grad_norm": 14.366597370754016, "learning_rate": 2.1864554640221244e-07, "logits/chosen": -2.486529588699341, "logits/rejected": -2.5694050788879395, "logps/chosen": -220.8901824951172, "logps/rejected": -341.2018127441406, "loss": 0.1006, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.232269048690796, "rewards/margins": 5.7011613845825195, "rewards/rejected": -6.933429718017578, "step": 3360 }, { "epoch": 1.7634746206174778, "grad_norm": 9.734727207350373, "learning_rate": 2.1713577331452016e-07, "logits/chosen": -2.7107510566711426, "logits/rejected": -2.590179681777954, "logps/chosen": -282.2343444824219, "logps/rejected": -292.1159362792969, "loss": 0.0792, "rewards/accuracies": 1.0, "rewards/chosen": -0.47104889154434204, "rewards/margins": 5.032910346984863, "rewards/rejected": -5.5039591789245605, "step": 3370 }, { "epoch": 1.7687074829931972, "grad_norm": 30.73138043602282, "learning_rate": 2.1562721890986199e-07, "logits/chosen": -2.5853171348571777, "logits/rejected": -2.427089214324951, "logps/chosen": -258.68096923828125, "logps/rejected": -263.1603698730469, "loss": 0.0943, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8167365789413452, "rewards/margins": 4.762653350830078, "rewards/rejected": -5.579390048980713, "step": 3380 }, { "epoch": 1.7739403453689166, "grad_norm": 6.900770481857403, "learning_rate": 2.1411993912899285e-07, "logits/chosen": -2.6068625450134277, "logits/rejected": -2.7255072593688965, "logps/chosen": -251.8668670654297, "logps/rejected": -388.23553466796875, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": -0.3402079641819, "rewards/margins": 5.3291215896606445, "rewards/rejected": -5.669328689575195, "step": 3390 }, { "epoch": 1.7791732077446363, "grad_norm": 32.88005271542385, "learning_rate": 2.126139898654021e-07, "logits/chosen": -2.6007299423217773, "logits/rejected": -2.564429759979248, "logps/chosen": -247.95761108398438, "logps/rejected": -304.67218017578125, "loss": 0.1105, "rewards/accuracies": 1.0, "rewards/chosen": -1.0324971675872803, "rewards/margins": 4.652859687805176, "rewards/rejected": -5.685357093811035, "step": 3400 }, { "epoch": 1.784406070120356, "grad_norm": 14.049845011208202, "learning_rate": 2.1110942696324012e-07, "logits/chosen": -2.8334975242614746, "logits/rejected": -2.7319045066833496, "logps/chosen": -330.34368896484375, "logps/rejected": -337.543701171875, "loss": 0.1172, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.16336342692375183, "rewards/margins": 5.068566799163818, "rewards/rejected": -5.231931209564209, "step": 3410 }, { "epoch": 1.7896389324960753, "grad_norm": 35.23957207470887, "learning_rate": 2.0960630621524762e-07, "logits/chosen": -2.604213237762451, "logits/rejected": -2.5069069862365723, "logps/chosen": -327.94659423828125, "logps/rejected": -286.60223388671875, "loss": 0.0987, "rewards/accuracies": 1.0, "rewards/chosen": -0.45767927169799805, "rewards/margins": 4.978998184204102, "rewards/rejected": -5.436676979064941, "step": 3420 }, { "epoch": 1.7948717948717947, "grad_norm": 22.48343495873177, "learning_rate": 2.0810468336068697e-07, "logits/chosen": -2.60780668258667, "logits/rejected": -2.6505560874938965, "logps/chosen": -256.4802551269531, "logps/rejected": -312.28045654296875, "loss": 0.1047, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9076415300369263, "rewards/margins": 5.676671028137207, "rewards/rejected": -6.584311485290527, "step": 3430 }, { "epoch": 1.8001046572475143, "grad_norm": 24.57145458361626, "learning_rate": 2.0660461408327535e-07, "logits/chosen": -2.759155511856079, "logits/rejected": -2.6342787742614746, "logps/chosen": -315.87750244140625, "logps/rejected": -288.4949951171875, "loss": 0.079, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7314565181732178, "rewards/margins": 4.713875770568848, "rewards/rejected": -5.445333003997803, "step": 3440 }, { "epoch": 1.805337519623234, "grad_norm": 12.426591771574008, "learning_rate": 2.0510615400911906e-07, "logits/chosen": -2.7850677967071533, "logits/rejected": -2.7042925357818604, "logps/chosen": -288.1392517089844, "logps/rejected": -294.2490539550781, "loss": 0.1018, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.1649543195962906, "rewards/margins": 5.316279888153076, "rewards/rejected": -5.481234550476074, "step": 3450 }, { "epoch": 1.8105703819989536, "grad_norm": 24.464622606853844, "learning_rate": 2.0360935870465185e-07, "logits/chosen": -2.7435317039489746, "logits/rejected": -2.550192356109619, "logps/chosen": -350.0984802246094, "logps/rejected": -342.10076904296875, "loss": 0.0851, "rewards/accuracies": 1.0, "rewards/chosen": 0.36250513792037964, "rewards/margins": 6.055690765380859, "rewards/rejected": -5.693184852600098, "step": 3460 }, { "epoch": 1.815803244374673, "grad_norm": 37.4562098904198, "learning_rate": 2.021142836745739e-07, "logits/chosen": -2.6443090438842773, "logits/rejected": -2.5326123237609863, "logps/chosen": -294.5022888183594, "logps/rejected": -312.35699462890625, "loss": 0.0874, "rewards/accuracies": 1.0, "rewards/chosen": -0.3399999141693115, "rewards/margins": 4.801590919494629, "rewards/rejected": -5.1415910720825195, "step": 3470 }, { "epoch": 1.8210361067503924, "grad_norm": 39.078137882685, "learning_rate": 2.0062098435979308e-07, "logits/chosen": -2.508788585662842, "logits/rejected": -2.467001438140869, "logps/chosen": -313.6659851074219, "logps/rejected": -303.41864013671875, "loss": 0.1166, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8432501554489136, "rewards/margins": 4.548348426818848, "rewards/rejected": -5.391598701477051, "step": 3480 }, { "epoch": 1.826268969126112, "grad_norm": 20.477804157343822, "learning_rate": 1.9912951613536997e-07, "logits/chosen": -2.7366108894348145, "logits/rejected": -2.5674004554748535, "logps/chosen": -307.49871826171875, "logps/rejected": -292.94140625, "loss": 0.0804, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4221610426902771, "rewards/margins": 5.448634147644043, "rewards/rejected": -5.870795249938965, "step": 3490 }, { "epoch": 1.8315018315018317, "grad_norm": 45.80462658045936, "learning_rate": 1.9763993430846392e-07, "logits/chosen": -2.7082409858703613, "logits/rejected": -2.4875802993774414, "logps/chosen": -282.53564453125, "logps/rejected": -249.9893798828125, "loss": 0.1, "rewards/accuracies": 1.0, "rewards/chosen": -0.5224814414978027, "rewards/margins": 4.795421600341797, "rewards/rejected": -5.317903518676758, "step": 3500 }, { "epoch": 1.836734693877551, "grad_norm": 25.28320711526729, "learning_rate": 1.9615229411628212e-07, "logits/chosen": -2.5923116207122803, "logits/rejected": -2.5757219791412354, "logps/chosen": -211.8919677734375, "logps/rejected": -322.93310546875, "loss": 0.0908, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9356802701950073, "rewards/margins": 5.03036642074585, "rewards/rejected": -5.966047286987305, "step": 3510 }, { "epoch": 1.8419675562532705, "grad_norm": 12.228474470448438, "learning_rate": 1.946666507240314e-07, "logits/chosen": -2.6713318824768066, "logits/rejected": -2.5800724029541016, "logps/chosen": -322.16314697265625, "logps/rejected": -353.5315856933594, "loss": 0.0878, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.723814845085144, "rewards/margins": 5.294778347015381, "rewards/rejected": -6.018592834472656, "step": 3520 }, { "epoch": 1.84720041862899, "grad_norm": 17.755134246003884, "learning_rate": 1.9318305922287268e-07, "logits/chosen": -2.5991291999816895, "logits/rejected": -2.5720906257629395, "logps/chosen": -276.58441162109375, "logps/rejected": -305.8017883300781, "loss": 0.0765, "rewards/accuracies": 1.0, "rewards/chosen": -0.35180261731147766, "rewards/margins": 6.045158386230469, "rewards/rejected": -6.396960735321045, "step": 3530 }, { "epoch": 1.8524332810047097, "grad_norm": 31.75867086902602, "learning_rate": 1.9170157462787762e-07, "logits/chosen": -2.6986277103424072, "logits/rejected": -2.5714402198791504, "logps/chosen": -349.29437255859375, "logps/rejected": -312.58148193359375, "loss": 0.0887, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.348706990480423, "rewards/margins": 5.397832870483398, "rewards/rejected": -5.74653959274292, "step": 3540 }, { "epoch": 1.8576661433804291, "grad_norm": 23.05380759982723, "learning_rate": 1.902222518759891e-07, "logits/chosen": -2.819509267807007, "logits/rejected": -2.602756977081299, "logps/chosen": -371.6258239746094, "logps/rejected": -367.76983642578125, "loss": 0.1148, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.30471277236938477, "rewards/margins": 5.3604960441589355, "rewards/rejected": -5.6652092933654785, "step": 3550 }, { "epoch": 1.8628990057561485, "grad_norm": 17.855176900716568, "learning_rate": 1.8874514582398368e-07, "logits/chosen": -2.6623425483703613, "logits/rejected": -2.7280948162078857, "logps/chosen": -315.5396423339844, "logps/rejected": -357.8938903808594, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": -0.47962045669555664, "rewards/margins": 6.602120876312256, "rewards/rejected": -7.081740379333496, "step": 3560 }, { "epoch": 1.8681318681318682, "grad_norm": 10.749236071749618, "learning_rate": 1.8727031124643738e-07, "logits/chosen": -2.668679714202881, "logits/rejected": -2.608853578567505, "logps/chosen": -242.77444458007812, "logps/rejected": -285.3056335449219, "loss": 0.098, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6894394755363464, "rewards/margins": 5.269094944000244, "rewards/rejected": -5.958534240722656, "step": 3570 }, { "epoch": 1.8733647305075878, "grad_norm": 22.614211293694257, "learning_rate": 1.8579780283369472e-07, "logits/chosen": -2.591667652130127, "logits/rejected": -2.419919967651367, "logps/chosen": -305.64959716796875, "logps/rejected": -277.2184143066406, "loss": 0.0907, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9471152424812317, "rewards/margins": 4.886394500732422, "rewards/rejected": -5.83350944519043, "step": 3580 }, { "epoch": 1.8785975928833072, "grad_norm": 25.65636556195965, "learning_rate": 1.8432767518984043e-07, "logits/chosen": -2.623939037322998, "logits/rejected": -2.5226635932922363, "logps/chosen": -309.31317138671875, "logps/rejected": -318.83453369140625, "loss": 0.1102, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6891399621963501, "rewards/margins": 5.144829273223877, "rewards/rejected": -5.833970069885254, "step": 3590 }, { "epoch": 1.8838304552590266, "grad_norm": 63.30301150806038, "learning_rate": 1.8285998283067478e-07, "logits/chosen": -2.726407289505005, "logits/rejected": -2.6477532386779785, "logps/chosen": -278.20355224609375, "logps/rejected": -310.3710021972656, "loss": 0.108, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6373482942581177, "rewards/margins": 5.8870849609375, "rewards/rejected": -6.524433135986328, "step": 3600 }, { "epoch": 1.8890633176347462, "grad_norm": 24.79124814354351, "learning_rate": 1.8139478018169197e-07, "logits/chosen": -2.6029670238494873, "logits/rejected": -2.540987014770508, "logps/chosen": -258.3710021972656, "logps/rejected": -285.44000244140625, "loss": 0.0944, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.813319981098175, "rewards/margins": 4.891506671905518, "rewards/rejected": -5.704827308654785, "step": 3610 }, { "epoch": 1.8942961800104658, "grad_norm": 14.002181196398896, "learning_rate": 1.799321215760617e-07, "logits/chosen": -2.6256556510925293, "logits/rejected": -2.5938618183135986, "logps/chosen": -276.6966247558594, "logps/rejected": -285.2062683105469, "loss": 0.124, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1776187419891357, "rewards/margins": 4.646222114562988, "rewards/rejected": -5.823840141296387, "step": 3620 }, { "epoch": 1.8995290423861853, "grad_norm": 44.217394704413046, "learning_rate": 1.7847206125261476e-07, "logits/chosen": -2.6064388751983643, "logits/rejected": -2.598203659057617, "logps/chosen": -248.5193328857422, "logps/rejected": -276.5867614746094, "loss": 0.121, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.913386344909668, "rewards/margins": 5.072216987609863, "rewards/rejected": -5.985602855682373, "step": 3630 }, { "epoch": 1.9047619047619047, "grad_norm": 29.521492574877282, "learning_rate": 1.7701465335383148e-07, "logits/chosen": -2.717740535736084, "logits/rejected": -2.582475185394287, "logps/chosen": -292.51507568359375, "logps/rejected": -279.9605712890625, "loss": 0.0892, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.993351936340332, "rewards/margins": 4.013619422912598, "rewards/rejected": -5.00697135925293, "step": 3640 }, { "epoch": 1.9099947671376243, "grad_norm": 37.81792131343119, "learning_rate": 1.7555995192383377e-07, "logits/chosen": -2.6133148670196533, "logits/rejected": -2.720210552215576, "logps/chosen": -251.6924285888672, "logps/rejected": -428.26251220703125, "loss": 0.0763, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.34535667300224304, "rewards/margins": 5.914216041564941, "rewards/rejected": -6.259573459625244, "step": 3650 }, { "epoch": 1.915227629513344, "grad_norm": 45.15317768585093, "learning_rate": 1.7410801090638166e-07, "logits/chosen": -2.6592507362365723, "logits/rejected": -2.568499803543091, "logps/chosen": -309.73883056640625, "logps/rejected": -313.0323181152344, "loss": 0.1276, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4672146439552307, "rewards/margins": 5.2041521072387695, "rewards/rejected": -5.6713666915893555, "step": 3660 }, { "epoch": 1.9204604918890633, "grad_norm": 14.599937120025656, "learning_rate": 1.7265888414287245e-07, "logits/chosen": -2.712362766265869, "logits/rejected": -2.679932117462158, "logps/chosen": -296.2633056640625, "logps/rejected": -332.5452575683594, "loss": 0.1004, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4468228816986084, "rewards/margins": 6.227715015411377, "rewards/rejected": -6.674537658691406, "step": 3670 }, { "epoch": 1.9256933542647827, "grad_norm": 42.370504342423075, "learning_rate": 1.7121262537034396e-07, "logits/chosen": -2.75547456741333, "logits/rejected": -2.597177028656006, "logps/chosen": -320.3974609375, "logps/rejected": -316.8236389160156, "loss": 0.1182, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6133803129196167, "rewards/margins": 4.7373881340026855, "rewards/rejected": -5.350768566131592, "step": 3680 }, { "epoch": 1.9309262166405023, "grad_norm": 24.109588654694512, "learning_rate": 1.697692882194826e-07, "logits/chosen": -2.544801712036133, "logits/rejected": -2.5433974266052246, "logps/chosen": -236.13455200195312, "logps/rejected": -306.4762268066406, "loss": 0.0824, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6495293378829956, "rewards/margins": 5.061087608337402, "rewards/rejected": -5.710616588592529, "step": 3690 }, { "epoch": 1.936159079016222, "grad_norm": 27.927751296332747, "learning_rate": 1.6832892621263406e-07, "logits/chosen": -2.9226527214050293, "logits/rejected": -2.6958699226379395, "logps/chosen": -353.6954040527344, "logps/rejected": -371.58599853515625, "loss": 0.1095, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.03616750240325928, "rewards/margins": 5.9524006843566895, "rewards/rejected": -5.988568305969238, "step": 3700 }, { "epoch": 1.9413919413919414, "grad_norm": 10.612592863711846, "learning_rate": 1.668915927618183e-07, "logits/chosen": -2.614467144012451, "logits/rejected": -2.618373394012451, "logps/chosen": -222.18557739257812, "logps/rejected": -293.89935302734375, "loss": 0.0847, "rewards/accuracies": 1.0, "rewards/chosen": -0.7807853817939758, "rewards/margins": 4.5860748291015625, "rewards/rejected": -5.366860389709473, "step": 3710 }, { "epoch": 1.9466248037676608, "grad_norm": 8.767377121592377, "learning_rate": 1.6545734116674965e-07, "logits/chosen": -2.7589080333709717, "logits/rejected": -2.6732800006866455, "logps/chosen": -295.6678161621094, "logps/rejected": -279.3349609375, "loss": 0.1025, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.009974551387131214, "rewards/margins": 5.947516918182373, "rewards/rejected": -5.937542915344238, "step": 3720 }, { "epoch": 1.9518576661433804, "grad_norm": 27.89405099547432, "learning_rate": 1.6402622461286e-07, "logits/chosen": -2.6114070415496826, "logits/rejected": -2.5180039405822754, "logps/chosen": -314.15069580078125, "logps/rejected": -316.9871520996094, "loss": 0.0879, "rewards/accuracies": 1.0, "rewards/chosen": -0.7035598158836365, "rewards/margins": 5.666778087615967, "rewards/rejected": -6.370337963104248, "step": 3730 }, { "epoch": 1.9570905285191, "grad_norm": 28.15538164661059, "learning_rate": 1.625982961693262e-07, "logits/chosen": -2.8003592491149902, "logits/rejected": -2.5596506595611572, "logps/chosen": -346.68701171875, "logps/rejected": -297.2021789550781, "loss": 0.0817, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.3486330807209015, "rewards/margins": 5.3151984214782715, "rewards/rejected": -5.663832187652588, "step": 3740 }, { "epoch": 1.9623233908948194, "grad_norm": 41.38111514577999, "learning_rate": 1.6117360878710266e-07, "logits/chosen": -2.775566339492798, "logits/rejected": -2.5968217849731445, "logps/chosen": -321.11370849609375, "logps/rejected": -345.9370422363281, "loss": 0.116, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.368887722492218, "rewards/margins": 5.136349678039551, "rewards/rejected": -5.505237579345703, "step": 3750 }, { "epoch": 1.9675562532705388, "grad_norm": 24.28749177179343, "learning_rate": 1.5975221529695773e-07, "logits/chosen": -2.661539316177368, "logits/rejected": -2.532170057296753, "logps/chosen": -232.6519317626953, "logps/rejected": -238.15737915039062, "loss": 0.1139, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.823898196220398, "rewards/margins": 4.739381313323975, "rewards/rejected": -5.563279151916504, "step": 3760 }, { "epoch": 1.9727891156462585, "grad_norm": 23.092035590815094, "learning_rate": 1.5833416840751406e-07, "logits/chosen": -2.6281533241271973, "logits/rejected": -2.4063587188720703, "logps/chosen": -248.28378295898438, "logps/rejected": -231.88119506835938, "loss": 0.1124, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9242329597473145, "rewards/margins": 4.783598899841309, "rewards/rejected": -5.707831382751465, "step": 3770 }, { "epoch": 1.978021978021978, "grad_norm": 33.814206997216125, "learning_rate": 1.5691952070329493e-07, "logits/chosen": -2.756333112716675, "logits/rejected": -2.683297634124756, "logps/chosen": -342.19439697265625, "logps/rejected": -398.38116455078125, "loss": 0.1152, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.2830882668495178, "rewards/margins": 5.8143815994262695, "rewards/rejected": -6.097469806671143, "step": 3780 }, { "epoch": 1.9832548403976975, "grad_norm": 32.594756198703855, "learning_rate": 1.555083246427734e-07, "logits/chosen": -2.586789846420288, "logits/rejected": -2.5754427909851074, "logps/chosen": -325.0731201171875, "logps/rejected": -351.6557922363281, "loss": 0.095, "rewards/accuracies": 1.0, "rewards/chosen": -0.6281405687332153, "rewards/margins": 5.96632194519043, "rewards/rejected": -6.594461917877197, "step": 3790 }, { "epoch": 1.988487702773417, "grad_norm": 27.78143850000216, "learning_rate": 1.5410063255642767e-07, "logits/chosen": -2.5559310913085938, "logits/rejected": -2.545598030090332, "logps/chosen": -280.82281494140625, "logps/rejected": -320.7041931152344, "loss": 0.0993, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4847874641418457, "rewards/margins": 5.9905595779418945, "rewards/rejected": -6.47534704208374, "step": 3800 }, { "epoch": 1.9937205651491365, "grad_norm": 9.345286196197668, "learning_rate": 1.5269649664480037e-07, "logits/chosen": -2.5976288318634033, "logits/rejected": -2.568420171737671, "logps/chosen": -328.0233459472656, "logps/rejected": -364.7568054199219, "loss": 0.1041, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7843068838119507, "rewards/margins": 5.072054386138916, "rewards/rejected": -5.856361389160156, "step": 3810 }, { "epoch": 1.9989534275248562, "grad_norm": 22.221429490070964, "learning_rate": 1.5129596897656255e-07, "logits/chosen": -2.5863964557647705, "logits/rejected": -2.479538917541504, "logps/chosen": -294.79095458984375, "logps/rejected": -299.70538330078125, "loss": 0.0743, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4342220425605774, "rewards/margins": 5.279918670654297, "rewards/rejected": -5.714139938354492, "step": 3820 }, { "epoch": 2.004186289900576, "grad_norm": 3.151471772421924, "learning_rate": 1.4989910148658324e-07, "logits/chosen": -2.7527716159820557, "logits/rejected": -2.647697925567627, "logps/chosen": -294.06201171875, "logps/rejected": -342.06939697265625, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -0.6213226318359375, "rewards/margins": 5.365971565246582, "rewards/rejected": -5.987294673919678, "step": 3830 }, { "epoch": 2.009419152276295, "grad_norm": 5.945101789435311, "learning_rate": 1.485059459740035e-07, "logits/chosen": -2.6567201614379883, "logits/rejected": -2.5174503326416016, "logps/chosen": -315.97198486328125, "logps/rejected": -366.2234802246094, "loss": 0.0185, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6567188501358032, "rewards/margins": 6.468540191650391, "rewards/rejected": -7.1252593994140625, "step": 3840 }, { "epoch": 2.0146520146520146, "grad_norm": 4.294861211832759, "learning_rate": 1.4711655410031536e-07, "logits/chosen": -2.6191234588623047, "logits/rejected": -2.5405681133270264, "logps/chosen": -251.25662231445312, "logps/rejected": -297.3587951660156, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -0.7149181365966797, "rewards/margins": 6.6346235275268555, "rewards/rejected": -7.349541664123535, "step": 3850 }, { "epoch": 2.0198848770277342, "grad_norm": 7.023199575059913, "learning_rate": 1.4573097738744623e-07, "logits/chosen": -2.5686943531036377, "logits/rejected": -2.5673136711120605, "logps/chosen": -257.622802734375, "logps/rejected": -332.653564453125, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": -1.0377533435821533, "rewards/margins": 6.569632053375244, "rewards/rejected": -7.607385158538818, "step": 3860 }, { "epoch": 2.025117739403454, "grad_norm": 4.796659300478697, "learning_rate": 1.4434926721584865e-07, "logits/chosen": -2.654580593109131, "logits/rejected": -2.5008137226104736, "logps/chosen": -285.6098937988281, "logps/rejected": -343.22662353515625, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -1.090226650238037, "rewards/margins": 6.6490278244018555, "rewards/rejected": -7.739254951477051, "step": 3870 }, { "epoch": 2.030350601779173, "grad_norm": 2.7489006427714218, "learning_rate": 1.4297147482259424e-07, "logits/chosen": -2.642270803451538, "logits/rejected": -2.5516154766082764, "logps/chosen": -281.71600341796875, "logps/rejected": -305.82373046875, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -1.2659205198287964, "rewards/margins": 7.210963249206543, "rewards/rejected": -8.476883888244629, "step": 3880 }, { "epoch": 2.0355834641548927, "grad_norm": 1.494445263725559, "learning_rate": 1.4159765129947443e-07, "logits/chosen": -2.708300828933716, "logits/rejected": -2.6663706302642822, "logps/chosen": -259.70416259765625, "logps/rejected": -313.52825927734375, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -1.073347806930542, "rewards/margins": 8.003740310668945, "rewards/rejected": -9.07708740234375, "step": 3890 }, { "epoch": 2.0408163265306123, "grad_norm": 3.496998555606532, "learning_rate": 1.4022784759110576e-07, "logits/chosen": -2.5498406887054443, "logits/rejected": -2.4592509269714355, "logps/chosen": -284.31732177734375, "logps/rejected": -343.39630126953125, "loss": 0.0118, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.5686169862747192, "rewards/margins": 6.356328010559082, "rewards/rejected": -7.9249444007873535, "step": 3900 }, { "epoch": 2.046049188906332, "grad_norm": 17.389942240056595, "learning_rate": 1.3886211449304002e-07, "logits/chosen": -2.5802905559539795, "logits/rejected": -2.6096935272216797, "logps/chosen": -255.0489044189453, "logps/rejected": -419.2701110839844, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -1.4543758630752563, "rewards/margins": 7.66497802734375, "rewards/rejected": -9.119354248046875, "step": 3910 }, { "epoch": 2.051282051282051, "grad_norm": 2.150445047937338, "learning_rate": 1.3750050264988172e-07, "logits/chosen": -2.548017740249634, "logits/rejected": -2.626844644546509, "logps/chosen": -199.0328369140625, "logps/rejected": -322.7221984863281, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -0.8127008676528931, "rewards/margins": 8.354021072387695, "rewards/rejected": -9.166723251342773, "step": 3920 }, { "epoch": 2.0565149136577707, "grad_norm": 1.660952211206376, "learning_rate": 1.3614306255340918e-07, "logits/chosen": -2.7289199829101562, "logits/rejected": -2.495772123336792, "logps/chosen": -294.88165283203125, "logps/rejected": -300.2463684082031, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -0.5807745456695557, "rewards/margins": 7.85580587387085, "rewards/rejected": -8.4365816116333, "step": 3930 }, { "epoch": 2.0617477760334904, "grad_norm": 4.318158186232724, "learning_rate": 1.347898445407027e-07, "logits/chosen": -2.6153035163879395, "logits/rejected": -2.520585536956787, "logps/chosen": -312.289794921875, "logps/rejected": -370.9855041503906, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -0.25612935423851013, "rewards/margins": 8.64028263092041, "rewards/rejected": -8.896411895751953, "step": 3940 }, { "epoch": 2.06698063840921, "grad_norm": 2.821049446764142, "learning_rate": 1.3344089879227768e-07, "logits/chosen": -2.6195080280303955, "logits/rejected": -2.528824806213379, "logps/chosen": -330.6834411621094, "logps/rejected": -354.763671875, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -1.2032630443572998, "rewards/margins": 8.363024711608887, "rewards/rejected": -9.56628704071045, "step": 3950 }, { "epoch": 2.072213500784929, "grad_norm": 0.7524659808370527, "learning_rate": 1.3209627533022393e-07, "logits/chosen": -2.459660768508911, "logits/rejected": -2.45542311668396, "logps/chosen": -317.22442626953125, "logps/rejected": -374.64154052734375, "loss": 0.0087, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9240306615829468, "rewards/margins": 8.787074089050293, "rewards/rejected": -9.711104393005371, "step": 3960 }, { "epoch": 2.077446363160649, "grad_norm": 3.885112950468523, "learning_rate": 1.3075602401635056e-07, "logits/chosen": -2.5620055198669434, "logits/rejected": -2.480083465576172, "logps/chosen": -235.87051391601562, "logps/rejected": -239.4990692138672, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -2.0387330055236816, "rewards/margins": 6.51049280166626, "rewards/rejected": -8.549224853515625, "step": 3970 }, { "epoch": 2.0826792255363684, "grad_norm": 5.124178285131869, "learning_rate": 1.2942019455033715e-07, "logits/chosen": -2.595177412033081, "logits/rejected": -2.55249285697937, "logps/chosen": -363.89544677734375, "logps/rejected": -383.08050537109375, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -1.0965200662612915, "rewards/margins": 7.803582668304443, "rewards/rejected": -8.900102615356445, "step": 3980 }, { "epoch": 2.087912087912088, "grad_norm": 1.72781923689808, "learning_rate": 1.2808883646789088e-07, "logits/chosen": -2.6225199699401855, "logits/rejected": -2.5271828174591064, "logps/chosen": -278.67962646484375, "logps/rejected": -342.74578857421875, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -1.2907451391220093, "rewards/margins": 8.017008781433105, "rewards/rejected": -9.307754516601562, "step": 3990 }, { "epoch": 2.0931449502878072, "grad_norm": 2.3452944302465863, "learning_rate": 1.2676199913890933e-07, "logits/chosen": -2.4738712310791016, "logits/rejected": -2.35957670211792, "logps/chosen": -301.91607666015625, "logps/rejected": -312.5462951660156, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -1.5129525661468506, "rewards/margins": 6.101428031921387, "rewards/rejected": -7.614381313323975, "step": 4000 }, { "epoch": 2.0931449502878072, "eval_logits/chosen": -2.5508530139923096, "eval_logits/rejected": -2.4970502853393555, "eval_logps/chosen": -307.0093994140625, "eval_logps/rejected": -333.84161376953125, "eval_loss": 0.6922265887260437, "eval_rewards/accuracies": 0.78515625, "eval_rewards/chosen": -3.975245952606201, "eval_rewards/margins": 2.4245357513427734, "eval_rewards/rejected": -6.399781703948975, "eval_runtime": 95.5712, "eval_samples_per_second": 20.927, "eval_steps_per_second": 0.335, "step": 4000 }, { "epoch": 2.098377812663527, "grad_norm": 3.865042704199612, "learning_rate": 1.2543973176565012e-07, "logits/chosen": -2.5314152240753174, "logits/rejected": -2.4536736011505127, "logps/chosen": -235.0891571044922, "logps/rejected": -319.36614990234375, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": -2.152523994445801, "rewards/margins": 7.690751075744629, "rewards/rejected": -9.843273162841797, "step": 4010 }, { "epoch": 2.1036106750392465, "grad_norm": 7.733074515561227, "learning_rate": 1.2412208338090565e-07, "logits/chosen": -2.6669182777404785, "logits/rejected": -2.5965628623962402, "logps/chosen": -357.357177734375, "logps/rejected": -405.3863525390625, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -1.6724307537078857, "rewards/margins": 7.350949764251709, "rewards/rejected": -9.023381233215332, "step": 4020 }, { "epoch": 2.108843537414966, "grad_norm": 2.108764330623009, "learning_rate": 1.228091028461858e-07, "logits/chosen": -2.63122820854187, "logits/rejected": -2.5630502700805664, "logps/chosen": -276.1968994140625, "logps/rejected": -384.46856689453125, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -1.7323462963104248, "rewards/margins": 8.009744644165039, "rewards/rejected": -9.742091178894043, "step": 4030 }, { "epoch": 2.1140763997906853, "grad_norm": 5.205840373586514, "learning_rate": 1.2150083884990536e-07, "logits/chosen": -2.647475242614746, "logits/rejected": -2.5325350761413574, "logps/chosen": -296.84710693359375, "logps/rejected": -351.752197265625, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -2.0414936542510986, "rewards/margins": 7.303194999694824, "rewards/rejected": -9.34468936920166, "step": 4040 }, { "epoch": 2.119309262166405, "grad_norm": 3.7420471789718266, "learning_rate": 1.201973399055788e-07, "logits/chosen": -2.737910032272339, "logits/rejected": -2.6582093238830566, "logps/chosen": -336.29315185546875, "logps/rejected": -373.5148010253906, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -1.1442760229110718, "rewards/margins": 7.766120910644531, "rewards/rejected": -8.91039752960205, "step": 4050 }, { "epoch": 2.1245421245421245, "grad_norm": 1.4855886157432574, "learning_rate": 1.1889865435002117e-07, "logits/chosen": -2.7083852291107178, "logits/rejected": -2.6565423011779785, "logps/chosen": -302.7213134765625, "logps/rejected": -359.2713928222656, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -1.1232277154922485, "rewards/margins": 7.309815406799316, "rewards/rejected": -8.433042526245117, "step": 4060 }, { "epoch": 2.129774986917844, "grad_norm": 1.138207703913642, "learning_rate": 1.1760483034155588e-07, "logits/chosen": -2.6401944160461426, "logits/rejected": -2.5925869941711426, "logps/chosen": -286.7811279296875, "logps/rejected": -368.1828918457031, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -1.698175072669983, "rewards/margins": 8.620880126953125, "rewards/rejected": -10.31905460357666, "step": 4070 }, { "epoch": 2.1350078492935634, "grad_norm": 1.1583347223952818, "learning_rate": 1.163159158582284e-07, "logits/chosen": -2.4908974170684814, "logits/rejected": -2.490464448928833, "logps/chosen": -299.4886779785156, "logps/rejected": -369.92938232421875, "loss": 0.02, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3759181499481201, "rewards/margins": 8.674256324768066, "rewards/rejected": -10.05017375946045, "step": 4080 }, { "epoch": 2.140240711669283, "grad_norm": 1.292595681436358, "learning_rate": 1.1503195869602766e-07, "logits/chosen": -2.5776543617248535, "logits/rejected": -2.437288522720337, "logps/chosen": -280.73193359375, "logps/rejected": -321.9732666015625, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -1.7706005573272705, "rewards/margins": 8.112302780151367, "rewards/rejected": -9.882905006408691, "step": 4090 }, { "epoch": 2.1454735740450026, "grad_norm": 3.1355222478979137, "learning_rate": 1.137530064671135e-07, "logits/chosen": -2.55924391746521, "logits/rejected": -2.634777545928955, "logps/chosen": -249.09994506835938, "logps/rejected": -356.64141845703125, "loss": 0.0156, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.4390732049942017, "rewards/margins": 7.6839189529418945, "rewards/rejected": -9.122990608215332, "step": 4100 }, { "epoch": 2.1507064364207222, "grad_norm": 5.68759855658989, "learning_rate": 1.1247910659805063e-07, "logits/chosen": -2.6641414165496826, "logits/rejected": -2.5627830028533936, "logps/chosen": -325.3441467285156, "logps/rejected": -298.83612060546875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -1.3880575895309448, "rewards/margins": 7.806868553161621, "rewards/rejected": -9.194926261901855, "step": 4110 }, { "epoch": 2.155939298796442, "grad_norm": 1.9973676382803611, "learning_rate": 1.112103063280509e-07, "logits/chosen": -2.560502052307129, "logits/rejected": -2.4194540977478027, "logps/chosen": -265.48956298828125, "logps/rejected": -404.8538818359375, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -0.9298257827758789, "rewards/margins": 8.695016860961914, "rewards/rejected": -9.624841690063477, "step": 4120 }, { "epoch": 2.161172161172161, "grad_norm": 1.7882510195806205, "learning_rate": 1.099466527072207e-07, "logits/chosen": -2.56492018699646, "logits/rejected": -2.5750932693481445, "logps/chosen": -235.86630249023438, "logps/rejected": -365.64300537109375, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -1.7126439809799194, "rewards/margins": 8.225854873657227, "rewards/rejected": -9.938499450683594, "step": 4130 }, { "epoch": 2.1664050235478807, "grad_norm": 2.72661087691154, "learning_rate": 1.0868819259481638e-07, "logits/chosen": -2.5510354042053223, "logits/rejected": -2.3607094287872314, "logps/chosen": -303.17156982421875, "logps/rejected": -292.5484924316406, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -2.530797004699707, "rewards/margins": 7.4277496337890625, "rewards/rejected": -9.95854663848877, "step": 4140 }, { "epoch": 2.1716378859236003, "grad_norm": 2.48899655088663, "learning_rate": 1.0743497265750701e-07, "logits/chosen": -2.6701772212982178, "logits/rejected": -2.575355052947998, "logps/chosen": -289.3881530761719, "logps/rejected": -360.732177734375, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -1.7940318584442139, "rewards/margins": 7.533411502838135, "rewards/rejected": -9.327444076538086, "step": 4150 }, { "epoch": 2.17687074829932, "grad_norm": 5.329556069710662, "learning_rate": 1.0618703936764359e-07, "logits/chosen": -2.6774773597717285, "logits/rejected": -2.5098540782928467, "logps/chosen": -328.7174072265625, "logps/rejected": -393.78729248046875, "loss": 0.0151, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.827301502227783, "rewards/margins": 8.235798835754395, "rewards/rejected": -11.06309986114502, "step": 4160 }, { "epoch": 2.182103610675039, "grad_norm": 1.9008399468133936, "learning_rate": 1.0494443900153557e-07, "logits/chosen": -2.6789798736572266, "logits/rejected": -2.463243007659912, "logps/chosen": -317.5295715332031, "logps/rejected": -363.3247375488281, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -1.1199839115142822, "rewards/margins": 8.223379135131836, "rewards/rejected": -9.343362808227539, "step": 4170 }, { "epoch": 2.1873364730507587, "grad_norm": 1.7033088321775203, "learning_rate": 1.0370721763773507e-07, "logits/chosen": -2.6211695671081543, "logits/rejected": -2.423337459564209, "logps/chosen": -352.43670654296875, "logps/rejected": -357.2591247558594, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -1.3701328039169312, "rewards/margins": 8.98823356628418, "rewards/rejected": -10.358366966247559, "step": 4180 }, { "epoch": 2.1925693354264784, "grad_norm": 20.353360868351782, "learning_rate": 1.0247542115532845e-07, "logits/chosen": -2.5802175998687744, "logits/rejected": -2.5106277465820312, "logps/chosen": -301.53375244140625, "logps/rejected": -362.0812072753906, "loss": 0.0196, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9264440536499023, "rewards/margins": 8.778693199157715, "rewards/rejected": -10.705137252807617, "step": 4190 }, { "epoch": 2.197802197802198, "grad_norm": 10.710391123253114, "learning_rate": 1.0124909523223418e-07, "logits/chosen": -2.572200059890747, "logits/rejected": -2.5115675926208496, "logps/chosen": -312.39404296875, "logps/rejected": -369.1626892089844, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -1.4592777490615845, "rewards/margins": 8.952848434448242, "rewards/rejected": -10.412126541137695, "step": 4200 }, { "epoch": 2.203035060177917, "grad_norm": 1.4553015565286538, "learning_rate": 1.0002828534350987e-07, "logits/chosen": -2.6698861122131348, "logits/rejected": -2.534928321838379, "logps/chosen": -343.0860595703125, "logps/rejected": -360.8196716308594, "loss": 0.0188, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.8917086124420166, "rewards/margins": 7.229773044586182, "rewards/rejected": -9.121480941772461, "step": 4210 }, { "epoch": 2.208267922553637, "grad_norm": 3.801200126371489, "learning_rate": 9.881303675966524e-08, "logits/chosen": -2.6052310466766357, "logits/rejected": -2.4696848392486572, "logps/chosen": -296.0155334472656, "logps/rejected": -352.32928466796875, "loss": 0.009, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.727985143661499, "rewards/margins": 7.7842817306518555, "rewards/rejected": -9.512266159057617, "step": 4220 }, { "epoch": 2.2135007849293564, "grad_norm": 1.8608998334743396, "learning_rate": 9.760339454498393e-08, "logits/chosen": -2.4305007457733154, "logits/rejected": -2.4329895973205566, "logps/chosen": -256.9643859863281, "logps/rejected": -316.918701171875, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -2.382755756378174, "rewards/margins": 7.783593654632568, "rewards/rejected": -10.166349411010742, "step": 4230 }, { "epoch": 2.218733647305076, "grad_norm": 3.9057026589225528, "learning_rate": 9.639940355585218e-08, "logits/chosen": -2.6919617652893066, "logits/rejected": -2.6409287452697754, "logps/chosen": -295.90374755859375, "logps/rejected": -379.12591552734375, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -2.4674360752105713, "rewards/margins": 7.371214866638184, "rewards/rejected": -9.838650703430176, "step": 4240 }, { "epoch": 2.2239665096807952, "grad_norm": 4.163349722355175, "learning_rate": 9.52011084390954e-08, "logits/chosen": -2.6314711570739746, "logits/rejected": -2.589735746383667, "logps/chosen": -290.67755126953125, "logps/rejected": -351.04119873046875, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -2.112610340118408, "rewards/margins": 7.909343719482422, "rewards/rejected": -10.021953582763672, "step": 4250 }, { "epoch": 2.229199372056515, "grad_norm": 11.426975588603128, "learning_rate": 9.400855363032262e-08, "logits/chosen": -2.6521573066711426, "logits/rejected": -2.6732687950134277, "logps/chosen": -317.30401611328125, "logps/rejected": -395.8109436035156, "loss": 0.0186, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.533034324645996, "rewards/margins": 8.412559509277344, "rewards/rejected": -9.945592880249023, "step": 4260 }, { "epoch": 2.2344322344322345, "grad_norm": 1.0643961722343311, "learning_rate": 9.282178335227883e-08, "logits/chosen": -2.6245615482330322, "logits/rejected": -2.5420284271240234, "logps/chosen": -278.80419921875, "logps/rejected": -372.5382995605469, "loss": 0.0108, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.101954936981201, "rewards/margins": 8.370219230651855, "rewards/rejected": -10.472173690795898, "step": 4270 }, { "epoch": 2.239665096807954, "grad_norm": 1.488877344015738, "learning_rate": 9.164084161320471e-08, "logits/chosen": -2.598118543624878, "logits/rejected": -2.4516539573669434, "logps/chosen": -281.11492919921875, "logps/rejected": -353.3921813964844, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -1.5900779962539673, "rewards/margins": 9.468457221984863, "rewards/rejected": -11.058534622192383, "step": 4280 }, { "epoch": 2.2448979591836733, "grad_norm": 1.484123392002169, "learning_rate": 9.046577220520518e-08, "logits/chosen": -2.589380979537964, "logits/rejected": -2.49733567237854, "logps/chosen": -276.93499755859375, "logps/rejected": -347.20880126953125, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -2.0041606426239014, "rewards/margins": 7.949263095855713, "rewards/rejected": -9.953423500061035, "step": 4290 }, { "epoch": 2.250130821559393, "grad_norm": 18.225211308951582, "learning_rate": 8.929661870262525e-08, "logits/chosen": -2.7803542613983154, "logits/rejected": -2.62983775138855, "logps/chosen": -398.0860900878906, "logps/rejected": -380.4949951171875, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -1.7444343566894531, "rewards/margins": 7.942718505859375, "rewards/rejected": -9.687153816223145, "step": 4300 }, { "epoch": 2.2553636839351126, "grad_norm": 3.199591487607983, "learning_rate": 8.813342446043423e-08, "logits/chosen": -2.6458935737609863, "logits/rejected": -2.5061302185058594, "logps/chosen": -297.6692810058594, "logps/rejected": -316.699462890625, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -2.22013521194458, "rewards/margins": 7.4093337059021, "rewards/rejected": -9.62946891784668, "step": 4310 }, { "epoch": 2.260596546310832, "grad_norm": 1.2640296355883016, "learning_rate": 8.697623261261788e-08, "logits/chosen": -2.575767755508423, "logits/rejected": -2.5581982135772705, "logps/chosen": -264.8335876464844, "logps/rejected": -372.76446533203125, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -1.0652940273284912, "rewards/margins": 9.945289611816406, "rewards/rejected": -11.010583877563477, "step": 4320 }, { "epoch": 2.2658294086865514, "grad_norm": 1.0437475906469669, "learning_rate": 8.58250860705792e-08, "logits/chosen": -2.742846965789795, "logits/rejected": -2.6475348472595215, "logps/chosen": -338.3741760253906, "logps/rejected": -381.1085510253906, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -1.54091477394104, "rewards/margins": 8.124018669128418, "rewards/rejected": -9.664934158325195, "step": 4330 }, { "epoch": 2.271062271062271, "grad_norm": 0.9690911735364153, "learning_rate": 8.468002752154671e-08, "logits/chosen": -2.7305939197540283, "logits/rejected": -2.57236385345459, "logps/chosen": -332.5419006347656, "logps/rejected": -361.2933044433594, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -1.5212883949279785, "rewards/margins": 8.489986419677734, "rewards/rejected": -10.011274337768555, "step": 4340 }, { "epoch": 2.2762951334379906, "grad_norm": 7.071080275710779, "learning_rate": 8.354109942699208e-08, "logits/chosen": -2.6169490814208984, "logits/rejected": -2.551487445831299, "logps/chosen": -297.1095886230469, "logps/rejected": -354.46722412109375, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -1.9308229684829712, "rewards/margins": 7.707949638366699, "rewards/rejected": -9.638772964477539, "step": 4350 }, { "epoch": 2.2815279958137102, "grad_norm": 0.6167801750895417, "learning_rate": 8.240834402105524e-08, "logits/chosen": -2.5586185455322266, "logits/rejected": -2.452181577682495, "logps/chosen": -327.6608581542969, "logps/rejected": -337.8875427246094, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -1.0531277656555176, "rewards/margins": 8.026483535766602, "rewards/rejected": -9.079609870910645, "step": 4360 }, { "epoch": 2.2867608581894294, "grad_norm": 4.717373542275004, "learning_rate": 8.128180330897791e-08, "logits/chosen": -2.503763437271118, "logits/rejected": -2.557732105255127, "logps/chosen": -310.40924072265625, "logps/rejected": -419.1622009277344, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -1.9562809467315674, "rewards/margins": 9.202638626098633, "rewards/rejected": -11.158918380737305, "step": 4370 }, { "epoch": 2.291993720565149, "grad_norm": 1.1978166487584516, "learning_rate": 8.016151906554683e-08, "logits/chosen": -2.621993064880371, "logits/rejected": -2.636842727661133, "logps/chosen": -289.68780517578125, "logps/rejected": -449.16424560546875, "loss": 0.015, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7936582565307617, "rewards/margins": 8.596534729003906, "rewards/rejected": -10.390192985534668, "step": 4380 }, { "epoch": 2.2972265829408687, "grad_norm": 1.2996428679554775, "learning_rate": 7.90475328335439e-08, "logits/chosen": -2.605773448944092, "logits/rejected": -2.5295023918151855, "logps/chosen": -243.2452850341797, "logps/rejected": -304.86346435546875, "loss": 0.0127, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1631274223327637, "rewards/margins": 7.450549125671387, "rewards/rejected": -9.613676071166992, "step": 4390 }, { "epoch": 2.3024594453165883, "grad_norm": 7.310470803121754, "learning_rate": 7.793988592220568e-08, "logits/chosen": -2.5795531272888184, "logits/rejected": -2.4490249156951904, "logps/chosen": -301.59979248046875, "logps/rejected": -335.76605224609375, "loss": 0.0189, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.096242904663086, "rewards/margins": 7.220043182373047, "rewards/rejected": -9.316286087036133, "step": 4400 }, { "epoch": 2.3076923076923075, "grad_norm": 4.938780046264363, "learning_rate": 7.683861940569217e-08, "logits/chosen": -2.605426073074341, "logits/rejected": -2.507072925567627, "logps/chosen": -361.077880859375, "logps/rejected": -363.7043762207031, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -2.157196521759033, "rewards/margins": 7.572789192199707, "rewards/rejected": -9.729986190795898, "step": 4410 }, { "epoch": 2.312925170068027, "grad_norm": 21.771326079556008, "learning_rate": 7.574377412156291e-08, "logits/chosen": -2.6205391883850098, "logits/rejected": -2.4058001041412354, "logps/chosen": -293.1470642089844, "logps/rejected": -329.00177001953125, "loss": 0.0222, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.156372547149658, "rewards/margins": 8.254606246948242, "rewards/rejected": -10.410979270935059, "step": 4420 }, { "epoch": 2.3181580324437467, "grad_norm": 2.3118779959043683, "learning_rate": 7.465539066926322e-08, "logits/chosen": -2.5743918418884277, "logits/rejected": -2.5314648151397705, "logps/chosen": -313.2865905761719, "logps/rejected": -342.49481201171875, "loss": 0.0159, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.2431182861328125, "rewards/margins": 8.129000663757324, "rewards/rejected": -10.372118949890137, "step": 4430 }, { "epoch": 2.3233908948194664, "grad_norm": 4.4751527013255705, "learning_rate": 7.357350940861845e-08, "logits/chosen": -2.6586811542510986, "logits/rejected": -2.613595962524414, "logps/chosen": -352.1698303222656, "logps/rejected": -448.161376953125, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -2.0548558235168457, "rewards/margins": 8.786008834838867, "rewards/rejected": -10.840865135192871, "step": 4440 }, { "epoch": 2.328623757195186, "grad_norm": 3.451212891606658, "learning_rate": 7.249817045833726e-08, "logits/chosen": -2.5706191062927246, "logits/rejected": -2.5019803047180176, "logps/chosen": -305.5481262207031, "logps/rejected": -333.54180908203125, "loss": 0.0169, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8968453407287598, "rewards/margins": 7.913191318511963, "rewards/rejected": -10.810035705566406, "step": 4450 }, { "epoch": 2.333856619570905, "grad_norm": 1.017008288548852, "learning_rate": 7.14294136945241e-08, "logits/chosen": -2.5936567783355713, "logits/rejected": -2.5028891563415527, "logps/chosen": -313.73687744140625, "logps/rejected": -381.7586669921875, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -1.4407036304473877, "rewards/margins": 9.367916107177734, "rewards/rejected": -10.808621406555176, "step": 4460 }, { "epoch": 2.339089481946625, "grad_norm": 2.3105528237228374, "learning_rate": 7.036727874920043e-08, "logits/chosen": -2.4267191886901855, "logits/rejected": -2.4052541255950928, "logps/chosen": -292.8412780761719, "logps/rejected": -387.59417724609375, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -2.7293214797973633, "rewards/margins": 8.727388381958008, "rewards/rejected": -11.456710815429688, "step": 4470 }, { "epoch": 2.3443223443223444, "grad_norm": 3.1199674855344504, "learning_rate": 6.931180500883484e-08, "logits/chosen": -2.5494871139526367, "logits/rejected": -2.4779820442199707, "logps/chosen": -246.21481323242188, "logps/rejected": -301.2300720214844, "loss": 0.0094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.211869716644287, "rewards/margins": 8.118663787841797, "rewards/rejected": -10.330533027648926, "step": 4480 }, { "epoch": 2.3495552066980636, "grad_norm": 2.05083376936891, "learning_rate": 6.826303161288302e-08, "logits/chosen": -2.451153039932251, "logits/rejected": -2.3369247913360596, "logps/chosen": -266.6675720214844, "logps/rejected": -341.8884582519531, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -2.5440659523010254, "rewards/margins": 8.815584182739258, "rewards/rejected": -11.359649658203125, "step": 4490 }, { "epoch": 2.3547880690737832, "grad_norm": 16.55836084559718, "learning_rate": 6.722099745233594e-08, "logits/chosen": -2.7528815269470215, "logits/rejected": -2.578997850418091, "logps/chosen": -346.63006591796875, "logps/rejected": -374.9027099609375, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -2.028027296066284, "rewards/margins": 8.339401245117188, "rewards/rejected": -10.367429733276367, "step": 4500 }, { "epoch": 2.360020931449503, "grad_norm": 6.028287869962129, "learning_rate": 6.618574116827786e-08, "logits/chosen": -2.593658924102783, "logits/rejected": -2.5544655323028564, "logps/chosen": -266.7757568359375, "logps/rejected": -348.6341247558594, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -2.2745323181152344, "rewards/margins": 8.218457221984863, "rewards/rejected": -10.492990493774414, "step": 4510 }, { "epoch": 2.3652537938252225, "grad_norm": 3.0886539584738575, "learning_rate": 6.515730115045339e-08, "logits/chosen": -2.684044361114502, "logits/rejected": -2.547943592071533, "logps/chosen": -336.9212951660156, "logps/rejected": -375.62701416015625, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -1.6338365077972412, "rewards/margins": 9.30997371673584, "rewards/rejected": -10.94381046295166, "step": 4520 }, { "epoch": 2.370486656200942, "grad_norm": 2.6957279498558315, "learning_rate": 6.413571553584399e-08, "logits/chosen": -2.5638465881347656, "logits/rejected": -2.4773478507995605, "logps/chosen": -296.7364196777344, "logps/rejected": -357.34027099609375, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -2.7816386222839355, "rewards/margins": 7.2836761474609375, "rewards/rejected": -10.065314292907715, "step": 4530 }, { "epoch": 2.3757195185766613, "grad_norm": 12.196071977619017, "learning_rate": 6.312102220725346e-08, "logits/chosen": -2.7183175086975098, "logits/rejected": -2.5329537391662598, "logps/chosen": -383.75970458984375, "logps/rejected": -392.5120849609375, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -2.3232245445251465, "rewards/margins": 8.773935317993164, "rewards/rejected": -11.097159385681152, "step": 4540 }, { "epoch": 2.380952380952381, "grad_norm": 3.2820692452777402, "learning_rate": 6.21132587919036e-08, "logits/chosen": -2.678323984146118, "logits/rejected": -2.569042444229126, "logps/chosen": -320.4778137207031, "logps/rejected": -380.771728515625, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -2.385861873626709, "rewards/margins": 7.826394081115723, "rewards/rejected": -10.212255477905273, "step": 4550 }, { "epoch": 2.3861852433281006, "grad_norm": 0.47087194855750114, "learning_rate": 6.111246266003859e-08, "logits/chosen": -2.5360188484191895, "logits/rejected": -2.4553494453430176, "logps/chosen": -358.38580322265625, "logps/rejected": -437.56103515625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -2.868281364440918, "rewards/margins": 9.001848220825195, "rewards/rejected": -11.870129585266113, "step": 4560 }, { "epoch": 2.3914181057038197, "grad_norm": 2.4784234358738675, "learning_rate": 6.011867092353934e-08, "logits/chosen": -2.640843152999878, "logits/rejected": -2.4602251052856445, "logps/chosen": -320.490966796875, "logps/rejected": -328.3841247558594, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -2.0051512718200684, "rewards/margins": 8.660110473632812, "rewards/rejected": -10.665262222290039, "step": 4570 }, { "epoch": 2.3966509680795394, "grad_norm": 1.140894037098134, "learning_rate": 5.9131920434547235e-08, "logits/chosen": -2.5126101970672607, "logits/rejected": -2.5434188842773438, "logps/chosen": -365.4574279785156, "logps/rejected": -453.28094482421875, "loss": 0.0174, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6535847187042236, "rewards/margins": 8.610297203063965, "rewards/rejected": -11.263882637023926, "step": 4580 }, { "epoch": 2.401883830455259, "grad_norm": 1.5548591343733114, "learning_rate": 5.8152247784097664e-08, "logits/chosen": -2.645655632019043, "logits/rejected": -2.5567336082458496, "logps/chosen": -361.2928771972656, "logps/rejected": -426.90740966796875, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -1.3525370359420776, "rewards/margins": 10.510432243347168, "rewards/rejected": -11.862970352172852, "step": 4590 }, { "epoch": 2.4071166928309786, "grad_norm": 1.5180709415239144, "learning_rate": 5.717968930076289e-08, "logits/chosen": -2.6549229621887207, "logits/rejected": -2.6011130809783936, "logps/chosen": -257.218505859375, "logps/rejected": -340.18231201171875, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -2.1633079051971436, "rewards/margins": 9.276578903198242, "rewards/rejected": -11.439887046813965, "step": 4600 }, { "epoch": 2.4123495552066982, "grad_norm": 0.7612063953273129, "learning_rate": 5.621428104930528e-08, "logits/chosen": -2.4110500812530518, "logits/rejected": -2.356020450592041, "logps/chosen": -236.77297973632812, "logps/rejected": -355.3910827636719, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -3.2140114307403564, "rewards/margins": 10.140518188476562, "rewards/rejected": -13.354528427124023, "step": 4610 }, { "epoch": 2.4175824175824174, "grad_norm": 3.6824412193515514, "learning_rate": 5.525605882933965e-08, "logits/chosen": -2.549696922302246, "logits/rejected": -2.5376381874084473, "logps/chosen": -297.99847412109375, "logps/rejected": -380.44793701171875, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -2.1107754707336426, "rewards/margins": 9.53602409362793, "rewards/rejected": -11.646799087524414, "step": 4620 }, { "epoch": 2.422815279958137, "grad_norm": 5.031761601034184, "learning_rate": 5.4305058174005853e-08, "logits/chosen": -2.4668195247650146, "logits/rejected": -2.4291744232177734, "logps/chosen": -418.4002990722656, "logps/rejected": -465.49273681640625, "loss": 0.0107, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6363046169281006, "rewards/margins": 10.962209701538086, "rewards/rejected": -12.598515510559082, "step": 4630 }, { "epoch": 2.4280481423338567, "grad_norm": 1.2710580407310335, "learning_rate": 5.33613143486511e-08, "logits/chosen": -2.5716395378112793, "logits/rejected": -2.3918890953063965, "logps/chosen": -351.7398986816406, "logps/rejected": -351.0377197265625, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -1.486716628074646, "rewards/margins": 9.630250930786133, "rewards/rejected": -11.116966247558594, "step": 4640 }, { "epoch": 2.4332810047095763, "grad_norm": 0.6204091483102194, "learning_rate": 5.242486234952206e-08, "logits/chosen": -2.5600039958953857, "logits/rejected": -2.4350457191467285, "logps/chosen": -319.65411376953125, "logps/rejected": -355.7321472167969, "loss": 0.011, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7288930416107178, "rewards/margins": 8.188405990600586, "rewards/rejected": -10.917299270629883, "step": 4650 }, { "epoch": 2.4385138670852955, "grad_norm": 1.4435258368690114, "learning_rate": 5.149573690246758e-08, "logits/chosen": -2.5671067237854004, "logits/rejected": -2.4775681495666504, "logps/chosen": -341.93096923828125, "logps/rejected": -381.197998046875, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -2.316157341003418, "rewards/margins": 9.097368240356445, "rewards/rejected": -11.41352653503418, "step": 4660 }, { "epoch": 2.443746729461015, "grad_norm": 1.4347004985829015, "learning_rate": 5.057397246165052e-08, "logits/chosen": -2.5920205116271973, "logits/rejected": -2.477240800857544, "logps/chosen": -387.96722412109375, "logps/rejected": -392.7718200683594, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -2.3120877742767334, "rewards/margins": 9.547895431518555, "rewards/rejected": -11.859980583190918, "step": 4670 }, { "epoch": 2.4489795918367347, "grad_norm": 2.6229197053857294, "learning_rate": 4.9659603208270173e-08, "logits/chosen": -2.6845996379852295, "logits/rejected": -2.446159839630127, "logps/chosen": -382.69000244140625, "logps/rejected": -356.66473388671875, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -2.651026725769043, "rewards/margins": 7.654710292816162, "rewards/rejected": -10.305737495422363, "step": 4680 }, { "epoch": 2.4542124542124544, "grad_norm": 3.8843985711978384, "learning_rate": 4.875266304929496e-08, "logits/chosen": -2.411546230316162, "logits/rejected": -2.312110424041748, "logps/chosen": -259.7157897949219, "logps/rejected": -315.6478271484375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -2.8747894763946533, "rewards/margins": 8.53257942199707, "rewards/rejected": -11.407368659973145, "step": 4690 }, { "epoch": 2.4594453165881736, "grad_norm": 5.646092509090637, "learning_rate": 4.785318561620511e-08, "logits/chosen": -2.464293956756592, "logits/rejected": -2.4637789726257324, "logps/chosen": -268.41290283203125, "logps/rejected": -385.8301696777344, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -3.5808568000793457, "rewards/margins": 9.485494613647461, "rewards/rejected": -13.066350936889648, "step": 4700 }, { "epoch": 2.464678178963893, "grad_norm": 2.1455296693383383, "learning_rate": 4.696120426374503e-08, "logits/chosen": -2.4258320331573486, "logits/rejected": -2.463759660720825, "logps/chosen": -279.572509765625, "logps/rejected": -390.50244140625, "loss": 0.0137, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.916973829269409, "rewards/margins": 9.4240083694458, "rewards/rejected": -12.340982437133789, "step": 4710 }, { "epoch": 2.469911041339613, "grad_norm": 36.72082911020149, "learning_rate": 4.607675206868705e-08, "logits/chosen": -2.6820778846740723, "logits/rejected": -2.558410882949829, "logps/chosen": -298.0652770996094, "logps/rejected": -333.7609558105469, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -2.282726764678955, "rewards/margins": 9.007135391235352, "rewards/rejected": -11.289862632751465, "step": 4720 }, { "epoch": 2.4751439037153324, "grad_norm": 2.745357634365693, "learning_rate": 4.519986182860452e-08, "logits/chosen": -2.567898988723755, "logits/rejected": -2.390460252761841, "logps/chosen": -315.73736572265625, "logps/rejected": -331.95635986328125, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -2.1501383781433105, "rewards/margins": 8.154587745666504, "rewards/rejected": -10.304725646972656, "step": 4730 }, { "epoch": 2.4803767660910516, "grad_norm": 5.338635929796716, "learning_rate": 4.433056606065552e-08, "logits/chosen": -2.5544776916503906, "logits/rejected": -2.521737575531006, "logps/chosen": -266.3556213378906, "logps/rejected": -366.9061279296875, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -2.6391263008117676, "rewards/margins": 8.17832088470459, "rewards/rejected": -10.817447662353516, "step": 4740 }, { "epoch": 2.4856096284667712, "grad_norm": 0.5914614001413402, "learning_rate": 4.3468897000377427e-08, "logits/chosen": -2.7590155601501465, "logits/rejected": -2.6252360343933105, "logps/chosen": -310.00030517578125, "logps/rejected": -352.8393249511719, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -1.7302494049072266, "rewards/margins": 8.767389297485352, "rewards/rejected": -10.497637748718262, "step": 4750 }, { "epoch": 2.490842490842491, "grad_norm": 5.805864346843536, "learning_rate": 4.2614886600491115e-08, "logits/chosen": -2.664672613143921, "logits/rejected": -2.5826516151428223, "logps/chosen": -305.0514831542969, "logps/rejected": -394.5857849121094, "loss": 0.0076, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7474026679992676, "rewards/margins": 8.99746322631836, "rewards/rejected": -11.744866371154785, "step": 4760 }, { "epoch": 2.4960753532182105, "grad_norm": 2.946932792892311, "learning_rate": 4.1768566529716415e-08, "logits/chosen": -2.53653883934021, "logits/rejected": -2.5299153327941895, "logps/chosen": -270.1798095703125, "logps/rejected": -344.4693603515625, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -2.985710620880127, "rewards/margins": 8.387160301208496, "rewards/rejected": -11.372869491577148, "step": 4770 }, { "epoch": 2.50130821559393, "grad_norm": 1.5669617016018436, "learning_rate": 4.0929968171597526e-08, "logits/chosen": -2.5235400199890137, "logits/rejected": -2.4382717609405518, "logps/chosen": -299.82257080078125, "logps/rejected": -312.45953369140625, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -2.107891082763672, "rewards/margins": 9.178747177124023, "rewards/rejected": -11.286638259887695, "step": 4780 }, { "epoch": 2.5065410779696493, "grad_norm": 1.5503193629564926, "learning_rate": 4.009912262333942e-08, "logits/chosen": -2.63818621635437, "logits/rejected": -2.524019479751587, "logps/chosen": -290.4059753417969, "logps/rejected": -368.35968017578125, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -3.5685508251190186, "rewards/margins": 9.202163696289062, "rewards/rejected": -12.770713806152344, "step": 4790 }, { "epoch": 2.511773940345369, "grad_norm": 2.4404410351397003, "learning_rate": 3.927606069465442e-08, "logits/chosen": -2.5241026878356934, "logits/rejected": -2.3206825256347656, "logps/chosen": -339.6543884277344, "logps/rejected": -364.41876220703125, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -3.0121209621429443, "rewards/margins": 9.030576705932617, "rewards/rejected": -12.042699813842773, "step": 4800 }, { "epoch": 2.5170068027210886, "grad_norm": 1.832113698737391, "learning_rate": 3.8460812906620037e-08, "logits/chosen": -2.6327362060546875, "logits/rejected": -2.5126953125, "logps/chosen": -330.55572509765625, "logps/rejected": -386.8505554199219, "loss": 0.0064, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.357273578643799, "rewards/margins": 8.54776668548584, "rewards/rejected": -10.905040740966797, "step": 4810 }, { "epoch": 2.5222396650968077, "grad_norm": 3.2729938935364786, "learning_rate": 3.765340949054696e-08, "logits/chosen": -2.5752532482147217, "logits/rejected": -2.4276397228240967, "logps/chosen": -348.06451416015625, "logps/rejected": -348.2574462890625, "loss": 0.0121, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5180726051330566, "rewards/margins": 8.617886543273926, "rewards/rejected": -11.135960578918457, "step": 4820 }, { "epoch": 2.5274725274725274, "grad_norm": 2.5047863029948156, "learning_rate": 3.685388038685811e-08, "logits/chosen": -2.6267170906066895, "logits/rejected": -2.539578914642334, "logps/chosen": -384.8499755859375, "logps/rejected": -444.0081481933594, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -2.9967598915100098, "rewards/margins": 9.045783042907715, "rewards/rejected": -12.042542457580566, "step": 4830 }, { "epoch": 2.532705389848247, "grad_norm": 3.2326566350857786, "learning_rate": 3.60622552439783e-08, "logits/chosen": -2.4952991008758545, "logits/rejected": -2.404175043106079, "logps/chosen": -300.92340087890625, "logps/rejected": -385.161865234375, "loss": 0.0187, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1940178871154785, "rewards/margins": 10.167230606079102, "rewards/rejected": -12.361248016357422, "step": 4840 }, { "epoch": 2.5379382522239666, "grad_norm": 6.726664707588461, "learning_rate": 3.527856341723479e-08, "logits/chosen": -2.4801971912384033, "logits/rejected": -2.4623141288757324, "logps/chosen": -258.33795166015625, "logps/rejected": -395.36358642578125, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -4.240489482879639, "rewards/margins": 9.309966087341309, "rewards/rejected": -13.550456047058105, "step": 4850 }, { "epoch": 2.5431711145996863, "grad_norm": 6.788349753702106, "learning_rate": 3.4502833967768816e-08, "logits/chosen": -2.5217742919921875, "logits/rejected": -2.5007529258728027, "logps/chosen": -360.753662109375, "logps/rejected": -389.7902526855469, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -3.387969970703125, "rewards/margins": 8.528416633605957, "rewards/rejected": -11.916386604309082, "step": 4860 }, { "epoch": 2.5484039769754054, "grad_norm": 0.5141841992567054, "learning_rate": 3.373509566145793e-08, "logits/chosen": -2.5920393466949463, "logits/rejected": -2.4291844367980957, "logps/chosen": -409.9930725097656, "logps/rejected": -404.51812744140625, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -3.0096020698547363, "rewards/margins": 9.102025985717773, "rewards/rejected": -12.111627578735352, "step": 4870 }, { "epoch": 2.553636839351125, "grad_norm": 1.9901574241674767, "learning_rate": 3.2975376967849104e-08, "logits/chosen": -2.604759931564331, "logits/rejected": -2.465873956680298, "logps/chosen": -292.3291015625, "logps/rejected": -366.34002685546875, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -2.3565335273742676, "rewards/margins": 8.622732162475586, "rewards/rejected": -10.979265213012695, "step": 4880 }, { "epoch": 2.5588697017268447, "grad_norm": 2.635976519171494, "learning_rate": 3.222370605910332e-08, "logits/chosen": -2.5512473583221436, "logits/rejected": -2.478119134902954, "logps/chosen": -325.674072265625, "logps/rejected": -369.12762451171875, "loss": 0.0117, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.2583580017089844, "rewards/margins": 9.271775245666504, "rewards/rejected": -11.530134201049805, "step": 4890 }, { "epoch": 2.564102564102564, "grad_norm": 9.016425567948197, "learning_rate": 3.1480110808950746e-08, "logits/chosen": -2.397136926651001, "logits/rejected": -2.472404956817627, "logps/chosen": -286.8371276855469, "logps/rejected": -421.8182067871094, "loss": 0.0138, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3170464038848877, "rewards/margins": 9.02189826965332, "rewards/rejected": -12.338946342468262, "step": 4900 }, { "epoch": 2.5693354264782835, "grad_norm": 0.6713458458157762, "learning_rate": 3.07446187916568e-08, "logits/chosen": -2.6027634143829346, "logits/rejected": -2.5397934913635254, "logps/chosen": -318.99664306640625, "logps/rejected": -387.5476989746094, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -2.911219596862793, "rewards/margins": 8.30928897857666, "rewards/rejected": -11.220507621765137, "step": 4910 }, { "epoch": 2.574568288854003, "grad_norm": 1.1125319314771425, "learning_rate": 3.001725728100021e-08, "logits/chosen": -2.6207592487335205, "logits/rejected": -2.4813547134399414, "logps/chosen": -344.18505859375, "logps/rejected": -352.73004150390625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -3.0130362510681152, "rewards/margins": 9.275091171264648, "rewards/rejected": -12.288125991821289, "step": 4920 }, { "epoch": 2.5798011512297228, "grad_norm": 5.299981453787648, "learning_rate": 2.9298053249261238e-08, "logits/chosen": -2.5177340507507324, "logits/rejected": -2.5581235885620117, "logps/chosen": -242.15701293945312, "logps/rejected": -332.94146728515625, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -3.712165117263794, "rewards/margins": 8.18096923828125, "rewards/rejected": -11.893133163452148, "step": 4930 }, { "epoch": 2.5850340136054424, "grad_norm": 4.723758865427596, "learning_rate": 2.8587033366221534e-08, "logits/chosen": -2.496917963027954, "logits/rejected": -2.480437994003296, "logps/chosen": -273.2696533203125, "logps/rejected": -365.79193115234375, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": -3.484445095062256, "rewards/margins": 8.450800895690918, "rewards/rejected": -11.935247421264648, "step": 4940 }, { "epoch": 2.5902668759811616, "grad_norm": 2.0339103072849096, "learning_rate": 2.7884223998175248e-08, "logits/chosen": -2.616367816925049, "logits/rejected": -2.547131061553955, "logps/chosen": -285.77667236328125, "logps/rejected": -393.36920166015625, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -3.509310245513916, "rewards/margins": 8.665257453918457, "rewards/rejected": -12.174566268920898, "step": 4950 }, { "epoch": 2.595499738356881, "grad_norm": 2.5244815062612562, "learning_rate": 2.718965120695141e-08, "logits/chosen": -2.6186976432800293, "logits/rejected": -2.637716054916382, "logps/chosen": -319.0390625, "logps/rejected": -411.00177001953125, "loss": 0.0179, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.704867362976074, "rewards/margins": 8.904386520385742, "rewards/rejected": -11.6092529296875, "step": 4960 }, { "epoch": 2.600732600732601, "grad_norm": 2.9436991950391382, "learning_rate": 2.6503340748947083e-08, "logits/chosen": -2.6223256587982178, "logits/rejected": -2.651494026184082, "logps/chosen": -322.9923095703125, "logps/rejected": -477.2806701660156, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -2.5072782039642334, "rewards/margins": 9.954888343811035, "rewards/rejected": -12.462165832519531, "step": 4970 }, { "epoch": 2.60596546310832, "grad_norm": 1.6420492628107515, "learning_rate": 2.5825318074172763e-08, "logits/chosen": -2.7298004627227783, "logits/rejected": -2.5805752277374268, "logps/chosen": -316.26580810546875, "logps/rejected": -383.5366516113281, "loss": 0.012, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.4945201873779297, "rewards/margins": 9.040359497070312, "rewards/rejected": -11.534879684448242, "step": 4980 }, { "epoch": 2.6111983254840396, "grad_norm": 2.9897115138402066, "learning_rate": 2.5155608325308358e-08, "logits/chosen": -2.6544556617736816, "logits/rejected": -2.496344804763794, "logps/chosen": -357.4461975097656, "logps/rejected": -405.1456298828125, "loss": 0.0087, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.524782180786133, "rewards/margins": 9.047494888305664, "rewards/rejected": -11.572277069091797, "step": 4990 }, { "epoch": 2.6164311878597593, "grad_norm": 1.2733440450810523, "learning_rate": 2.4494236336770695e-08, "logits/chosen": -2.658801555633545, "logits/rejected": -2.6000139713287354, "logps/chosen": -301.155029296875, "logps/rejected": -394.56976318359375, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -2.3924107551574707, "rewards/margins": 8.720831871032715, "rewards/rejected": -11.113243103027344, "step": 5000 }, { "epoch": 2.621664050235479, "grad_norm": 8.752405545719595, "learning_rate": 2.3841226633792983e-08, "logits/chosen": -2.5913498401641846, "logits/rejected": -2.4253532886505127, "logps/chosen": -346.916259765625, "logps/rejected": -367.08001708984375, "loss": 0.0064, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.553851842880249, "rewards/margins": 7.920147895812988, "rewards/rejected": -10.473998069763184, "step": 5010 }, { "epoch": 2.6268969126111985, "grad_norm": 5.937834540038537, "learning_rate": 2.319660343151511e-08, "logits/chosen": -2.606656551361084, "logits/rejected": -2.5195579528808594, "logps/chosen": -300.65838623046875, "logps/rejected": -324.0450439453125, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -2.727539539337158, "rewards/margins": 8.226611137390137, "rewards/rejected": -10.954151153564453, "step": 5020 }, { "epoch": 2.6321297749869177, "grad_norm": 4.19168135519844, "learning_rate": 2.2560390634085715e-08, "logits/chosen": -2.38765549659729, "logits/rejected": -2.407247543334961, "logps/chosen": -271.3846740722656, "logps/rejected": -430.45233154296875, "loss": 0.016, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0975801944732666, "rewards/margins": 9.703760147094727, "rewards/rejected": -12.801342964172363, "step": 5030 }, { "epoch": 2.6373626373626373, "grad_norm": 4.438339211076942, "learning_rate": 2.1932611833775843e-08, "logits/chosen": -2.5761990547180176, "logits/rejected": -2.4454965591430664, "logps/chosen": -284.6046447753906, "logps/rejected": -365.8029479980469, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -2.3015778064727783, "rewards/margins": 10.762308120727539, "rewards/rejected": -13.063886642456055, "step": 5040 }, { "epoch": 2.642595499738357, "grad_norm": 9.842355468779951, "learning_rate": 2.1313290310103897e-08, "logits/chosen": -2.562697410583496, "logits/rejected": -2.459880828857422, "logps/chosen": -263.71429443359375, "logps/rejected": -355.89349365234375, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -3.0439720153808594, "rewards/margins": 8.155313491821289, "rewards/rejected": -11.199285507202148, "step": 5050 }, { "epoch": 2.647828362114076, "grad_norm": 1.3521199827350725, "learning_rate": 2.0702449028972696e-08, "logits/chosen": -2.510012626647949, "logits/rejected": -2.523059368133545, "logps/chosen": -317.8623962402344, "logps/rejected": -404.1944885253906, "loss": 0.0143, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8070850372314453, "rewards/margins": 8.632943153381348, "rewards/rejected": -11.440028190612793, "step": 5060 }, { "epoch": 2.6530612244897958, "grad_norm": 1.463322874368762, "learning_rate": 2.0100110641817547e-08, "logits/chosen": -2.5922181606292725, "logits/rejected": -2.414249897003174, "logps/chosen": -339.21002197265625, "logps/rejected": -382.40435791015625, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -3.406454563140869, "rewards/margins": 10.262566566467285, "rewards/rejected": -13.66901969909668, "step": 5070 }, { "epoch": 2.6582940868655154, "grad_norm": 2.1574773610595863, "learning_rate": 1.9506297484766427e-08, "logits/chosen": -2.666602611541748, "logits/rejected": -2.5114593505859375, "logps/chosen": -420.3592834472656, "logps/rejected": -346.78863525390625, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -2.242121458053589, "rewards/margins": 9.135951042175293, "rewards/rejected": -11.378072738647461, "step": 5080 }, { "epoch": 2.663526949241235, "grad_norm": 36.010723824965844, "learning_rate": 1.8921031577811692e-08, "logits/chosen": -2.354315996170044, "logits/rejected": -2.2910566329956055, "logps/chosen": -299.4645080566406, "logps/rejected": -375.19195556640625, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -3.713761568069458, "rewards/margins": 8.878849983215332, "rewards/rejected": -12.592611312866211, "step": 5090 }, { "epoch": 2.6687598116169546, "grad_norm": 1.0986132222203888, "learning_rate": 1.834433462399351e-08, "logits/chosen": -2.6104464530944824, "logits/rejected": -2.4752371311187744, "logps/chosen": -317.61346435546875, "logps/rejected": -378.275390625, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -2.577064037322998, "rewards/margins": 8.46171760559082, "rewards/rejected": -11.03878116607666, "step": 5100 }, { "epoch": 2.6739926739926743, "grad_norm": 7.0005850905584275, "learning_rate": 1.7776228008594962e-08, "logits/chosen": -2.6077401638031006, "logits/rejected": -2.5730576515197754, "logps/chosen": -302.76171875, "logps/rejected": -442.8404846191406, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -2.5833659172058105, "rewards/margins": 9.618488311767578, "rewards/rejected": -12.201854705810547, "step": 5110 }, { "epoch": 2.6792255363683934, "grad_norm": 3.361896665859258, "learning_rate": 1.721673279834926e-08, "logits/chosen": -2.560586929321289, "logits/rejected": -2.4337735176086426, "logps/chosen": -304.4768371582031, "logps/rejected": -360.99249267578125, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -3.9418601989746094, "rewards/margins": 8.619203567504883, "rewards/rejected": -12.561063766479492, "step": 5120 }, { "epoch": 2.684458398744113, "grad_norm": 2.8782909461152175, "learning_rate": 1.666586974065831e-08, "logits/chosen": -2.588582754135132, "logits/rejected": -2.5598676204681396, "logps/chosen": -331.9123229980469, "logps/rejected": -444.1719665527344, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -2.416478395462036, "rewards/margins": 9.728887557983398, "rewards/rejected": -12.145366668701172, "step": 5130 }, { "epoch": 2.6896912611198327, "grad_norm": 3.06657213255932, "learning_rate": 1.6123659262823497e-08, "logits/chosen": -2.5465493202209473, "logits/rejected": -2.467817544937134, "logps/chosen": -324.09197998046875, "logps/rejected": -339.67120361328125, "loss": 0.0121, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.714784622192383, "rewards/margins": 8.506978988647461, "rewards/rejected": -11.221763610839844, "step": 5140 }, { "epoch": 2.694924123495552, "grad_norm": 10.956318628505112, "learning_rate": 1.5590121471288104e-08, "logits/chosen": -2.489410877227783, "logits/rejected": -2.5203540325164795, "logps/chosen": -240.01138305664062, "logps/rejected": -359.5928039550781, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -2.6659531593322754, "rewards/margins": 10.252822875976562, "rewards/rejected": -12.918774604797363, "step": 5150 }, { "epoch": 2.7001569858712715, "grad_norm": 4.341827826318645, "learning_rate": 1.5065276150891787e-08, "logits/chosen": -2.4828386306762695, "logits/rejected": -2.436096429824829, "logps/chosen": -274.3287658691406, "logps/rejected": -380.4012145996094, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -2.955925941467285, "rewards/margins": 9.629919052124023, "rewards/rejected": -12.585844993591309, "step": 5160 }, { "epoch": 2.705389848246991, "grad_norm": 2.168504147683603, "learning_rate": 1.4549142764136768e-08, "logits/chosen": -2.496258020401001, "logits/rejected": -2.3795084953308105, "logps/chosen": -290.3370056152344, "logps/rejected": -382.50799560546875, "loss": 0.0144, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.412745952606201, "rewards/margins": 9.241251945495605, "rewards/rejected": -12.653998374938965, "step": 5170 }, { "epoch": 2.7106227106227108, "grad_norm": 3.1888282613852583, "learning_rate": 1.4041740450466383e-08, "logits/chosen": -2.5063326358795166, "logits/rejected": -2.498027801513672, "logps/chosen": -305.488037109375, "logps/rejected": -390.25543212890625, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -3.42686128616333, "rewards/margins": 8.476984977722168, "rewards/rejected": -11.90384578704834, "step": 5180 }, { "epoch": 2.7158555729984304, "grad_norm": 1.5952711132379955, "learning_rate": 1.3543088025555094e-08, "logits/chosen": -2.4821343421936035, "logits/rejected": -2.4512901306152344, "logps/chosen": -296.63531494140625, "logps/rejected": -329.72296142578125, "loss": 0.0129, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5127968788146973, "rewards/margins": 8.06564998626709, "rewards/rejected": -10.578447341918945, "step": 5190 }, { "epoch": 2.7210884353741496, "grad_norm": 1.4040294684460386, "learning_rate": 1.3053203980610744e-08, "logits/chosen": -2.442251682281494, "logits/rejected": -2.433336019515991, "logps/chosen": -353.4270935058594, "logps/rejected": -437.695556640625, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -2.559645175933838, "rewards/margins": 10.471541404724121, "rewards/rejected": -13.031187057495117, "step": 5200 }, { "epoch": 2.726321297749869, "grad_norm": 5.531095570318318, "learning_rate": 1.2572106481689243e-08, "logits/chosen": -2.5265042781829834, "logits/rejected": -2.406580686569214, "logps/chosen": -278.25384521484375, "logps/rejected": -334.8019714355469, "loss": 0.0164, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6399970054626465, "rewards/margins": 8.102693557739258, "rewards/rejected": -11.742691040039062, "step": 5210 }, { "epoch": 2.731554160125589, "grad_norm": 0.7569273137668125, "learning_rate": 1.2099813369020467e-08, "logits/chosen": -2.6435656547546387, "logits/rejected": -2.569716215133667, "logps/chosen": -326.84368896484375, "logps/rejected": -419.45440673828125, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -2.801252603530884, "rewards/margins": 8.470110893249512, "rewards/rejected": -11.271363258361816, "step": 5220 }, { "epoch": 2.736787022501308, "grad_norm": 1.015615968536147, "learning_rate": 1.1636342156346846e-08, "logits/chosen": -2.6142373085021973, "logits/rejected": -2.4517297744750977, "logps/chosen": -296.00433349609375, "logps/rejected": -368.3468017578125, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -3.2934257984161377, "rewards/margins": 8.836563110351562, "rewards/rejected": -12.129989624023438, "step": 5230 }, { "epoch": 2.7420198848770276, "grad_norm": 5.469101579410441, "learning_rate": 1.1181710030274043e-08, "logits/chosen": -2.3535382747650146, "logits/rejected": -2.2188241481781006, "logps/chosen": -249.70327758789062, "logps/rejected": -329.28094482421875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -2.7749602794647217, "rewards/margins": 10.26407241821289, "rewards/rejected": -13.039031982421875, "step": 5240 }, { "epoch": 2.7472527472527473, "grad_norm": 1.8575488304333363, "learning_rate": 1.0735933849633561e-08, "logits/chosen": -2.6143882274627686, "logits/rejected": -2.464627504348755, "logps/chosen": -348.214111328125, "logps/rejected": -357.53399658203125, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -2.9914422035217285, "rewards/margins": 9.351872444152832, "rewards/rejected": -12.343315124511719, "step": 5250 }, { "epoch": 2.752485609628467, "grad_norm": 0.8727476983735399, "learning_rate": 1.0299030144857445e-08, "logits/chosen": -2.4906797409057617, "logits/rejected": -2.5234460830688477, "logps/chosen": -261.9267883300781, "logps/rejected": -377.07049560546875, "loss": 0.0109, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1616902351379395, "rewards/margins": 10.171551704406738, "rewards/rejected": -13.333239555358887, "step": 5260 }, { "epoch": 2.7577184720041865, "grad_norm": 1.4211558713802048, "learning_rate": 9.871015117365516e-09, "logits/chosen": -2.536323070526123, "logits/rejected": -2.4908180236816406, "logps/chosen": -265.8121643066406, "logps/rejected": -334.70745849609375, "loss": 0.0102, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.533576488494873, "rewards/margins": 8.419042587280273, "rewards/rejected": -11.952618598937988, "step": 5270 }, { "epoch": 2.7629513343799057, "grad_norm": 1.6323427663620214, "learning_rate": 9.451904638964447e-09, "logits/chosen": -2.5910801887512207, "logits/rejected": -2.4406516551971436, "logps/chosen": -348.68878173828125, "logps/rejected": -361.7004089355469, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -2.7282471656799316, "rewards/margins": 8.851235389709473, "rewards/rejected": -11.579483032226562, "step": 5280 }, { "epoch": 2.7681841967556253, "grad_norm": 14.086043155752852, "learning_rate": 9.041714251259214e-09, "logits/chosen": -2.4262852668762207, "logits/rejected": -2.2455453872680664, "logps/chosen": -325.5466613769531, "logps/rejected": -377.5086364746094, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -3.007089376449585, "rewards/margins": 9.666694641113281, "rewards/rejected": -12.673784255981445, "step": 5290 }, { "epoch": 2.773417059131345, "grad_norm": 35.01863706023192, "learning_rate": 8.640459165076857e-09, "logits/chosen": -2.4602913856506348, "logits/rejected": -2.5747923851013184, "logps/chosen": -250.23727416992188, "logps/rejected": -391.7991638183594, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -3.3855373859405518, "rewards/margins": 9.337056159973145, "rewards/rejected": -12.722593307495117, "step": 5300 }, { "epoch": 2.778649921507064, "grad_norm": 6.363328775308636, "learning_rate": 8.248154259902246e-09, "logits/chosen": -2.5952470302581787, "logits/rejected": -2.371581554412842, "logps/chosen": -329.0868835449219, "logps/rejected": -326.04925537109375, "loss": 0.011, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.090206623077393, "rewards/margins": 7.8073248863220215, "rewards/rejected": -11.897531509399414, "step": 5310 }, { "epoch": 2.7838827838827838, "grad_norm": 0.9592032356713021, "learning_rate": 7.86481408332651e-09, "logits/chosen": -2.581714630126953, "logits/rejected": -2.4266419410705566, "logps/chosen": -266.65960693359375, "logps/rejected": -347.1474914550781, "loss": 0.0101, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.535461902618408, "rewards/margins": 7.9888787269592285, "rewards/rejected": -11.524340629577637, "step": 5320 }, { "epoch": 2.7891156462585034, "grad_norm": 1.8356349413284074, "learning_rate": 7.490452850507506e-09, "logits/chosen": -2.5398435592651367, "logits/rejected": -2.4736721515655518, "logps/chosen": -305.2430114746094, "logps/rejected": -341.65008544921875, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -3.192692995071411, "rewards/margins": 8.169621467590332, "rewards/rejected": -11.362314224243164, "step": 5330 }, { "epoch": 2.794348508634223, "grad_norm": 2.3463398326078164, "learning_rate": 7.1250844436426535e-09, "logits/chosen": -2.4499154090881348, "logits/rejected": -2.3645966053009033, "logps/chosen": -262.638671875, "logps/rejected": -363.5102233886719, "loss": 0.0125, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.4711711406707764, "rewards/margins": 10.573376655578613, "rewards/rejected": -14.044549942016602, "step": 5340 }, { "epoch": 2.7995813710099426, "grad_norm": 6.784060648739115, "learning_rate": 6.768722411454153e-09, "logits/chosen": -2.477108955383301, "logits/rejected": -2.440368890762329, "logps/chosen": -284.68536376953125, "logps/rejected": -355.6117858886719, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -3.4868836402893066, "rewards/margins": 8.629325866699219, "rewards/rejected": -12.116209030151367, "step": 5350 }, { "epoch": 2.804814233385662, "grad_norm": 2.5718272328645564, "learning_rate": 6.421379968686663e-09, "logits/chosen": -2.6738717555999756, "logits/rejected": -2.4878640174865723, "logps/chosen": -408.24224853515625, "logps/rejected": -405.9677429199219, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -2.7108867168426514, "rewards/margins": 7.706369876861572, "rewards/rejected": -10.417256355285645, "step": 5360 }, { "epoch": 2.8100470957613815, "grad_norm": 0.5429034331939131, "learning_rate": 6.083069995617113e-09, "logits/chosen": -2.4864702224731445, "logits/rejected": -2.316953659057617, "logps/chosen": -293.7408142089844, "logps/rejected": -360.5414123535156, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -3.3920187950134277, "rewards/margins": 8.823633193969727, "rewards/rejected": -12.215652465820312, "step": 5370 }, { "epoch": 2.815279958137101, "grad_norm": 3.2205880935954916, "learning_rate": 5.753805037577192e-09, "logits/chosen": -2.3979969024658203, "logits/rejected": -2.4518043994903564, "logps/chosen": -286.2723083496094, "logps/rejected": -358.1659851074219, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -2.3278305530548096, "rewards/margins": 9.405172348022461, "rewards/rejected": -11.733002662658691, "step": 5380 }, { "epoch": 2.8205128205128203, "grad_norm": 4.313345968600854, "learning_rate": 5.433597304488113e-09, "logits/chosen": -2.5451204776763916, "logits/rejected": -2.3840582370758057, "logps/chosen": -340.4458923339844, "logps/rejected": -439.3768615722656, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": -2.964724063873291, "rewards/margins": 9.207846641540527, "rewards/rejected": -12.172571182250977, "step": 5390 }, { "epoch": 2.82574568288854, "grad_norm": 5.264269877259486, "learning_rate": 5.122458670407836e-09, "logits/chosen": -2.6159005165100098, "logits/rejected": -2.4100537300109863, "logps/chosen": -292.0515441894531, "logps/rejected": -290.3874206542969, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -3.151519775390625, "rewards/margins": 6.9995832443237305, "rewards/rejected": -10.151103973388672, "step": 5400 }, { "epoch": 2.8309785452642595, "grad_norm": 4.515185713671941, "learning_rate": 4.820400673090669e-09, "logits/chosen": -2.5118215084075928, "logits/rejected": -2.586270570755005, "logps/chosen": -360.7964782714844, "logps/rejected": -450.97003173828125, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -3.430558443069458, "rewards/margins": 10.006916999816895, "rewards/rejected": -13.437475204467773, "step": 5410 }, { "epoch": 2.836211407639979, "grad_norm": 3.1818990142647396, "learning_rate": 4.5274345135595525e-09, "logits/chosen": -2.603188991546631, "logits/rejected": -2.5455174446105957, "logps/chosen": -384.5435485839844, "logps/rejected": -441.33270263671875, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -2.382079601287842, "rewards/margins": 10.478212356567383, "rewards/rejected": -12.860292434692383, "step": 5420 }, { "epoch": 2.8414442700156988, "grad_norm": 2.7363660749645393, "learning_rate": 4.243571055690648e-09, "logits/chosen": -2.677584409713745, "logits/rejected": -2.6243412494659424, "logps/chosen": -386.9283752441406, "logps/rejected": -446.9391174316406, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -2.8604724407196045, "rewards/margins": 9.12932014465332, "rewards/rejected": -11.98979377746582, "step": 5430 }, { "epoch": 2.846677132391418, "grad_norm": 30.216614020048112, "learning_rate": 3.968820825810431e-09, "logits/chosen": -2.334294319152832, "logits/rejected": -2.228971242904663, "logps/chosen": -290.4869079589844, "logps/rejected": -327.9725646972656, "loss": 0.0152, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.2857768535614014, "rewards/margins": 8.18476676940918, "rewards/rejected": -11.47054386138916, "step": 5440 }, { "epoch": 2.8519099947671376, "grad_norm": 10.307838000491031, "learning_rate": 3.7031940123053997e-09, "logits/chosen": -2.4512031078338623, "logits/rejected": -2.3894190788269043, "logps/chosen": -277.6960754394531, "logps/rejected": -375.0935363769531, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -3.572457790374756, "rewards/margins": 8.654683113098145, "rewards/rejected": -12.227140426635742, "step": 5450 }, { "epoch": 2.857142857142857, "grad_norm": 1.1118530816981196, "learning_rate": 3.4467004652442842e-09, "logits/chosen": -2.4142355918884277, "logits/rejected": -2.3406777381896973, "logps/chosen": -250.6748504638672, "logps/rejected": -334.2477111816406, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -3.2360546588897705, "rewards/margins": 8.070561408996582, "rewards/rejected": -11.306615829467773, "step": 5460 }, { "epoch": 2.8623757195185764, "grad_norm": 7.408246277035073, "learning_rate": 3.1993496960127653e-09, "logits/chosen": -2.5402913093566895, "logits/rejected": -2.471750020980835, "logps/chosen": -267.28509521484375, "logps/rejected": -328.6303405761719, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -3.3718109130859375, "rewards/margins": 8.653961181640625, "rewards/rejected": -12.025772094726562, "step": 5470 }, { "epoch": 2.867608581894296, "grad_norm": 2.2244454243376888, "learning_rate": 2.9611508769606663e-09, "logits/chosen": -2.6168313026428223, "logits/rejected": -2.6207103729248047, "logps/chosen": -335.3636169433594, "logps/rejected": -391.04510498046875, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -3.4004509449005127, "rewards/margins": 7.857171535491943, "rewards/rejected": -11.257621765136719, "step": 5480 }, { "epoch": 2.8728414442700156, "grad_norm": 0.8061031545246838, "learning_rate": 2.7321128410620344e-09, "logits/chosen": -2.4226486682891846, "logits/rejected": -2.2207863330841064, "logps/chosen": -266.69989013671875, "logps/rejected": -311.1686096191406, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -3.3387820720672607, "rewards/margins": 7.805467128753662, "rewards/rejected": -11.144248008728027, "step": 5490 }, { "epoch": 2.8780743066457353, "grad_norm": 1.237817233976751, "learning_rate": 2.5122440815873724e-09, "logits/chosen": -2.54459547996521, "logits/rejected": -2.35339617729187, "logps/chosen": -372.35467529296875, "logps/rejected": -341.9369201660156, "loss": 0.0096, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0608162879943848, "rewards/margins": 8.385961532592773, "rewards/rejected": -11.44677734375, "step": 5500 }, { "epoch": 2.883307169021455, "grad_norm": 1.4592021216320497, "learning_rate": 2.301552751788838e-09, "logits/chosen": -2.401576519012451, "logits/rejected": -2.4742298126220703, "logps/chosen": -301.2481689453125, "logps/rejected": -432.27459716796875, "loss": 0.0166, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.694408416748047, "rewards/margins": 9.245023727416992, "rewards/rejected": -11.939432144165039, "step": 5510 }, { "epoch": 2.8885400313971745, "grad_norm": 6.7253892662738854, "learning_rate": 2.1000466645978433e-09, "logits/chosen": -2.612203598022461, "logits/rejected": -2.5522124767303467, "logps/chosen": -252.40243530273438, "logps/rejected": -316.52178955078125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -3.106694221496582, "rewards/margins": 7.8931169509887695, "rewards/rejected": -10.999811172485352, "step": 5520 }, { "epoch": 2.8937728937728937, "grad_norm": 2.521063730398925, "learning_rate": 1.9077332923353728e-09, "logits/chosen": -2.5552637577056885, "logits/rejected": -2.4974677562713623, "logps/chosen": -348.4877014160156, "logps/rejected": -410.57403564453125, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -2.973598003387451, "rewards/margins": 8.762843132019043, "rewards/rejected": -11.736442565917969, "step": 5530 }, { "epoch": 2.8990057561486133, "grad_norm": 1.8208189324306472, "learning_rate": 1.7246197664347872e-09, "logits/chosen": -2.682417392730713, "logits/rejected": -2.5886688232421875, "logps/chosen": -334.28973388671875, "logps/rejected": -496.99957275390625, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -2.5573782920837402, "rewards/margins": 9.289700508117676, "rewards/rejected": -11.847077369689941, "step": 5540 }, { "epoch": 2.904238618524333, "grad_norm": 1.8289670826499012, "learning_rate": 1.5507128771775346e-09, "logits/chosen": -2.4755916595458984, "logits/rejected": -2.3878397941589355, "logps/chosen": -306.183837890625, "logps/rejected": -390.5394287109375, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -3.636819362640381, "rewards/margins": 8.681589126586914, "rewards/rejected": -12.318408012390137, "step": 5550 }, { "epoch": 2.909471480900052, "grad_norm": 1.3092391514959099, "learning_rate": 1.3860190734411858e-09, "logits/chosen": -2.5638322830200195, "logits/rejected": -2.4192492961883545, "logps/chosen": -346.40765380859375, "logps/rejected": -422.3858947753906, "loss": 0.0161, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.551558256149292, "rewards/margins": 9.023459434509277, "rewards/rejected": -11.575017929077148, "step": 5560 }, { "epoch": 2.9147043432757718, "grad_norm": 6.938403304922241, "learning_rate": 1.2305444624604034e-09, "logits/chosen": -2.6479740142822266, "logits/rejected": -2.628225088119507, "logps/chosen": -338.06195068359375, "logps/rejected": -417.61456298828125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -2.0014538764953613, "rewards/margins": 10.071534156799316, "rewards/rejected": -12.07298755645752, "step": 5570 }, { "epoch": 2.9199372056514914, "grad_norm": 1.5146419848534598, "learning_rate": 1.0842948096004835e-09, "logits/chosen": -2.5002403259277344, "logits/rejected": -2.4548568725585938, "logps/chosen": -284.13372802734375, "logps/rejected": -385.5391845703125, "loss": 0.0174, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2518792152404785, "rewards/margins": 8.641477584838867, "rewards/rejected": -11.893355369567871, "step": 5580 }, { "epoch": 2.925170068027211, "grad_norm": 7.445325284727672, "learning_rate": 9.472755381434161e-10, "logits/chosen": -2.4903171062469482, "logits/rejected": -2.287623167037964, "logps/chosen": -323.30670166015625, "logps/rejected": -314.6568603515625, "loss": 0.0135, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.513911008834839, "rewards/margins": 9.13040542602539, "rewards/rejected": -11.644315719604492, "step": 5590 }, { "epoch": 2.9304029304029307, "grad_norm": 2.602340314549884, "learning_rate": 8.194917290869907e-10, "logits/chosen": -2.5437402725219727, "logits/rejected": -2.448638439178467, "logps/chosen": -337.36602783203125, "logps/rejected": -394.43756103515625, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -2.3356971740722656, "rewards/margins": 10.068342208862305, "rewards/rejected": -12.404041290283203, "step": 5600 }, { "epoch": 2.93563579277865, "grad_norm": 1.1068213174152017, "learning_rate": 7.009481209561685e-10, "logits/chosen": -2.5758707523345947, "logits/rejected": -2.5155177116394043, "logps/chosen": -259.2216491699219, "logps/rejected": -384.0183410644531, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -3.294078826904297, "rewards/margins": 9.293085098266602, "rewards/rejected": -12.587165832519531, "step": 5610 }, { "epoch": 2.9408686551543695, "grad_norm": 1.5286464770057508, "learning_rate": 5.916491096275845e-10, "logits/chosen": -2.636427402496338, "logits/rejected": -2.594538450241089, "logps/chosen": -328.97796630859375, "logps/rejected": -415.40582275390625, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -3.2140936851501465, "rewards/margins": 9.737319946289062, "rewards/rejected": -12.951414108276367, "step": 5620 }, { "epoch": 2.946101517530089, "grad_norm": 0.9575300257762934, "learning_rate": 4.915987481662887e-10, "logits/chosen": -2.4386096000671387, "logits/rejected": -2.3729395866394043, "logps/chosen": -261.83331298828125, "logps/rejected": -344.73443603515625, "loss": 0.0088, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.9984965324401855, "rewards/margins": 8.80710506439209, "rewards/rejected": -11.805601119995117, "step": 5630 }, { "epoch": 2.9513343799058083, "grad_norm": 0.7442669698182585, "learning_rate": 4.0080074667570017e-10, "logits/chosen": -2.5603861808776855, "logits/rejected": -2.483668327331543, "logps/chosen": -278.2349853515625, "logps/rejected": -418.85760498046875, "loss": 0.0118, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.313835859298706, "rewards/margins": 8.679365158081055, "rewards/rejected": -11.993200302124023, "step": 5640 }, { "epoch": 2.956567242281528, "grad_norm": 1.114513621058661, "learning_rate": 3.1925847215980017e-10, "logits/chosen": -2.604092836380005, "logits/rejected": -2.4918534755706787, "logps/chosen": -304.39678955078125, "logps/rejected": -384.1284484863281, "loss": 0.0147, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.602552652359009, "rewards/margins": 8.72242546081543, "rewards/rejected": -11.324975967407227, "step": 5650 }, { "epoch": 2.9618001046572475, "grad_norm": 10.222264124984395, "learning_rate": 2.469749483985095e-10, "logits/chosen": -2.5285439491271973, "logits/rejected": -2.419832944869995, "logps/chosen": -299.08013916015625, "logps/rejected": -385.5743408203125, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -2.964160442352295, "rewards/margins": 9.242414474487305, "rewards/rejected": -12.206574440002441, "step": 5660 }, { "epoch": 2.967032967032967, "grad_norm": 8.039843420640908, "learning_rate": 1.8395285583530652e-10, "logits/chosen": -2.542475700378418, "logits/rejected": -2.422661066055298, "logps/chosen": -323.48297119140625, "logps/rejected": -365.52923583984375, "loss": 0.0133, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.63987398147583, "rewards/margins": 9.397886276245117, "rewards/rejected": -12.037759780883789, "step": 5670 }, { "epoch": 2.9722658294086868, "grad_norm": 1.4255702175275826, "learning_rate": 1.3019453147805614e-10, "logits/chosen": -2.5691847801208496, "logits/rejected": -2.424009084701538, "logps/chosen": -312.40118408203125, "logps/rejected": -402.6890563964844, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -3.699326992034912, "rewards/margins": 9.890931129455566, "rewards/rejected": -13.59025764465332, "step": 5680 }, { "epoch": 2.977498691784406, "grad_norm": 2.8176122633799983, "learning_rate": 8.570196881216297e-11, "logits/chosen": -2.3424875736236572, "logits/rejected": -2.354027271270752, "logps/chosen": -256.85028076171875, "logps/rejected": -372.85394287109375, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -2.546224355697632, "rewards/margins": 9.345178604125977, "rewards/rejected": -11.891403198242188, "step": 5690 }, { "epoch": 2.9827315541601256, "grad_norm": 2.0227865741921054, "learning_rate": 5.0476817726852194e-11, "logits/chosen": -2.490391254425049, "logits/rejected": -2.5428659915924072, "logps/chosen": -343.39263916015625, "logps/rejected": -448.39495849609375, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": -3.258328914642334, "rewards/margins": 9.148773193359375, "rewards/rejected": -12.407099723815918, "step": 5700 }, { "epoch": 2.987964416535845, "grad_norm": 0.8541881834134185, "learning_rate": 2.4520384453746712e-11, "logits/chosen": -2.4259555339813232, "logits/rejected": -2.4047558307647705, "logps/chosen": -341.0238037109375, "logps/rejected": -441.8011779785156, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -3.361252546310425, "rewards/margins": 9.091890335083008, "rewards/rejected": -12.453143119812012, "step": 5710 }, { "epoch": 2.9931972789115644, "grad_norm": 1.8692261524092926, "learning_rate": 7.833631518627815e-12, "logits/chosen": -2.3938581943511963, "logits/rejected": -2.3685402870178223, "logps/chosen": -296.4942626953125, "logps/rejected": -387.60223388671875, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -3.046889305114746, "rewards/margins": 9.50677490234375, "rewards/rejected": -12.553665161132812, "step": 5720 }, { "epoch": 2.998430141287284, "grad_norm": 5.334919186361271, "learning_rate": 4.1717770565830033e-13, "logits/chosen": -2.628361940383911, "logits/rejected": -2.5328991413116455, "logps/chosen": -305.0509033203125, "logps/rejected": -323.2393493652344, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -2.800204277038574, "rewards/margins": 7.784017086029053, "rewards/rejected": -10.584222793579102, "step": 5730 }, { "epoch": 3.0, "step": 5733, "total_flos": 0.0, "train_loss": 0.22836191894055405, "train_runtime": 34822.6853, "train_samples_per_second": 5.267, "train_steps_per_second": 0.165 } ], "logging_steps": 10, "max_steps": 5733, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }