{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 5.235269989993057, "learning_rate": 2.6737967914438506e-08, "logits/chosen": -0.07364606857299805, "logits/rejected": 0.1362065076828003, "logps/chosen": -1.715688705444336, "logps/rejected": -1.8891627788543701, "loss": 0.8421, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.715688705444336, "rewards/margins": 0.17347395420074463, "rewards/rejected": -1.8891627788543701, "sft_loss": 1.4681576490402222, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 9.85463554243418, "learning_rate": 5.347593582887701e-08, "logits/chosen": -0.007629724685102701, "logits/rejected": 0.11350151151418686, "logps/chosen": -1.80299973487854, "logps/rejected": -1.8473634719848633, "loss": 0.9279, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.80299973487854, "rewards/margins": 0.04436371102929115, "rewards/rejected": -1.8473634719848633, "sft_loss": 1.508274793624878, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 11.515063279361502, "learning_rate": 8.021390374331551e-08, "logits/chosen": -0.050660718232393265, "logits/rejected": 0.04635730758309364, "logps/chosen": -1.6353628635406494, "logps/rejected": -1.7657358646392822, "loss": 0.9022, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6353628635406494, "rewards/margins": 0.13037298619747162, "rewards/rejected": -1.7657358646392822, "sft_loss": 1.5009872913360596, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 5.258926310151922, "learning_rate": 1.0695187165775402e-07, "logits/chosen": -0.06183786317706108, "logits/rejected": 0.023231148719787598, "logps/chosen": -1.7252362966537476, "logps/rejected": -1.8057111501693726, "loss": 0.9261, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.7252362966537476, "rewards/margins": 0.08047479391098022, "rewards/rejected": -1.8057111501693726, "sft_loss": 1.5002777576446533, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 16.53074244289573, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.06942118704319, "logits/rejected": 0.015559596940875053, "logps/chosen": -1.8668476343154907, "logps/rejected": -1.7766830921173096, "loss": 1.0363, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -1.8668476343154907, "rewards/margins": -0.09016446769237518, "rewards/rejected": -1.7766830921173096, "sft_loss": 1.5445544719696045, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 9.275972420130646, "learning_rate": 1.6042780748663102e-07, "logits/chosen": -0.09946934133768082, "logits/rejected": -0.002832284662872553, "logps/chosen": -1.9050594568252563, "logps/rejected": -1.8290824890136719, "loss": 0.9982, "rewards/accuracies": 0.4375, "rewards/chosen": -1.9050594568252563, "rewards/margins": -0.07597692310810089, "rewards/rejected": -1.8290824890136719, "sft_loss": 1.644878625869751, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 10.27616549684039, "learning_rate": 1.8716577540106952e-07, "logits/chosen": -0.05647529289126396, "logits/rejected": 0.10369857400655746, "logps/chosen": -1.8399736881256104, "logps/rejected": -1.989367127418518, "loss": 0.9535, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.8399736881256104, "rewards/margins": 0.1493932157754898, "rewards/rejected": -1.989367127418518, "sft_loss": 1.559540867805481, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 9.47278456909826, "learning_rate": 2.1390374331550805e-07, "logits/chosen": 0.03216688707470894, "logits/rejected": 0.20794229209423065, "logps/chosen": -1.8674113750457764, "logps/rejected": -1.731610655784607, "loss": 1.0019, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.8674113750457764, "rewards/margins": -0.13580094277858734, "rewards/rejected": -1.731610655784607, "sft_loss": 1.5152695178985596, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 15.430823640809635, "learning_rate": 2.4064171122994655e-07, "logits/chosen": 0.007911334745585918, "logits/rejected": 0.2036576271057129, "logps/chosen": -1.8118816614151, "logps/rejected": -1.8485628366470337, "loss": 0.9646, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.8118816614151, "rewards/margins": 0.036681193858385086, "rewards/rejected": -1.8485628366470337, "sft_loss": 1.5256637334823608, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 11.882954872936795, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.049845270812511444, "logits/rejected": 0.10112782567739487, "logps/chosen": -1.8521621227264404, "logps/rejected": -1.7432807683944702, "loss": 1.0101, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.8521621227264404, "rewards/margins": -0.10888160765171051, "rewards/rejected": -1.7432807683944702, "sft_loss": 1.5658760070800781, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 7.9737766908357575, "learning_rate": 2.9411764705882356e-07, "logits/chosen": -0.10056047141551971, "logits/rejected": 0.12481649219989777, "logps/chosen": -1.7834703922271729, "logps/rejected": -1.822126030921936, "loss": 0.9768, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.7834703922271729, "rewards/margins": 0.03865557536482811, "rewards/rejected": -1.822126030921936, "sft_loss": 1.5616546869277954, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 7.225835353625987, "learning_rate": 3.2085561497326203e-07, "logits/chosen": -0.08353086560964584, "logits/rejected": 0.10854407399892807, "logps/chosen": -1.7223520278930664, "logps/rejected": -1.8239339590072632, "loss": 0.9, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7223520278930664, "rewards/margins": 0.10158195346593857, "rewards/rejected": -1.8239339590072632, "sft_loss": 1.5241292715072632, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 5.431638210110103, "learning_rate": 3.475935828877005e-07, "logits/chosen": -0.029154837131500244, "logits/rejected": 0.11996859312057495, "logps/chosen": -1.5424885749816895, "logps/rejected": -1.6564009189605713, "loss": 0.8524, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.5424885749816895, "rewards/margins": 0.1139124184846878, "rewards/rejected": -1.6564009189605713, "sft_loss": 1.4341975450515747, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 11.820639617057315, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.07852429151535034, "logits/rejected": 0.07054585218429565, "logps/chosen": -1.6553970575332642, "logps/rejected": -1.6938140392303467, "loss": 0.9501, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -1.6553970575332642, "rewards/margins": 0.03841722011566162, "rewards/rejected": -1.6938140392303467, "sft_loss": 1.567211389541626, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 12.009311638296472, "learning_rate": 4.0106951871657757e-07, "logits/chosen": -0.07746043056249619, "logits/rejected": 0.10344438254833221, "logps/chosen": -1.619264006614685, "logps/rejected": -1.859872579574585, "loss": 0.8358, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.619264006614685, "rewards/margins": 0.24060864746570587, "rewards/rejected": -1.859872579574585, "sft_loss": 1.5058908462524414, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 7.635140975991649, "learning_rate": 4.278074866310161e-07, "logits/chosen": -0.004299764521420002, "logits/rejected": 0.10244850069284439, "logps/chosen": -1.5427652597427368, "logps/rejected": -1.5827980041503906, "loss": 0.887, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5427652597427368, "rewards/margins": 0.04003264755010605, "rewards/rejected": -1.5827980041503906, "sft_loss": 1.4448814392089844, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 6.061565243102024, "learning_rate": 4.5454545454545457e-07, "logits/chosen": -0.18546968698501587, "logits/rejected": 0.0526907853782177, "logps/chosen": -1.587787389755249, "logps/rejected": -1.7247536182403564, "loss": 0.8549, "rewards/accuracies": 0.5, "rewards/chosen": -1.587787389755249, "rewards/margins": 0.13696610927581787, "rewards/rejected": -1.7247536182403564, "sft_loss": 1.424576759338379, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 13.366616211715524, "learning_rate": 4.812834224598931e-07, "logits/chosen": 0.05517455190420151, "logits/rejected": 0.015907561406493187, "logps/chosen": -1.5135414600372314, "logps/rejected": -1.592010498046875, "loss": 0.8748, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.5135414600372314, "rewards/margins": 0.07846912741661072, "rewards/rejected": -1.592010498046875, "sft_loss": 1.3922526836395264, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 4.65872642986279, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.10109523683786392, "logits/rejected": 0.04699419066309929, "logps/chosen": -1.4085562229156494, "logps/rejected": -1.604548454284668, "loss": 0.8126, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4085562229156494, "rewards/margins": 0.19599218666553497, "rewards/rejected": -1.604548454284668, "sft_loss": 1.385668396949768, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 5.225165770190776, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.08418619632720947, "logits/rejected": -0.020208898931741714, "logps/chosen": -1.3913484811782837, "logps/rejected": -1.4861876964569092, "loss": 0.8452, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3913484811782837, "rewards/margins": 0.09483925253152847, "rewards/rejected": -1.4861876964569092, "sft_loss": 1.3743575811386108, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 4.179439042444936, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.023166431114077568, "logits/rejected": 0.0029010644648224115, "logps/chosen": -1.3319205045700073, "logps/rejected": -1.506927251815796, "loss": 0.8006, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3319205045700073, "rewards/margins": 0.17500664293766022, "rewards/rejected": -1.506927251815796, "sft_loss": 1.3089672327041626, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 7.120176078055153, "learning_rate": 5.882352941176471e-07, "logits/chosen": -0.07467556744813919, "logits/rejected": 0.018729006871581078, "logps/chosen": -1.2951467037200928, "logps/rejected": -1.3635486364364624, "loss": 0.8459, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2951467037200928, "rewards/margins": 0.06840179860591888, "rewards/rejected": -1.3635486364364624, "sft_loss": 1.297489881515503, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 6.7501369743432615, "learning_rate": 6.149732620320855e-07, "logits/chosen": -0.05421806126832962, "logits/rejected": 0.13666459918022156, "logps/chosen": -1.3326265811920166, "logps/rejected": -1.5337562561035156, "loss": 0.8006, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3326265811920166, "rewards/margins": 0.2011294811964035, "rewards/rejected": -1.5337562561035156, "sft_loss": 1.3840888738632202, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 5.809627442148684, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.16003762185573578, "logits/rejected": -0.007819685153663158, "logps/chosen": -1.3354389667510986, "logps/rejected": -1.4167028665542603, "loss": 0.8465, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3354389667510986, "rewards/margins": 0.08126381039619446, "rewards/rejected": -1.4167028665542603, "sft_loss": 1.3464367389678955, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 4.5054154445634556, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.16295680403709412, "logits/rejected": -0.038801293820142746, "logps/chosen": -1.3474639654159546, "logps/rejected": -1.3577622175216675, "loss": 0.8773, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3474639654159546, "rewards/margins": 0.010298268869519234, "rewards/rejected": -1.3577622175216675, "sft_loss": 1.4015753269195557, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 5.519825401039347, "learning_rate": 6.95187165775401e-07, "logits/chosen": -0.03140866383910179, "logits/rejected": 0.09492513537406921, "logps/chosen": -1.382698655128479, "logps/rejected": -1.499145746231079, "loss": 0.8341, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.382698655128479, "rewards/margins": 0.11644729226827621, "rewards/rejected": -1.499145746231079, "sft_loss": 1.4506919384002686, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 7.089184282498258, "learning_rate": 7.219251336898395e-07, "logits/chosen": -0.11524300277233124, "logits/rejected": -0.008447563275694847, "logps/chosen": -1.4253406524658203, "logps/rejected": -1.5112718343734741, "loss": 0.8615, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4253406524658203, "rewards/margins": 0.08593113720417023, "rewards/rejected": -1.5112718343734741, "sft_loss": 1.373755693435669, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 7.494836982244487, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.09514065831899643, "logits/rejected": 0.05069103091955185, "logps/chosen": -1.4329020977020264, "logps/rejected": -1.4857370853424072, "loss": 0.8719, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.4329020977020264, "rewards/margins": 0.05283506587147713, "rewards/rejected": -1.4857370853424072, "sft_loss": 1.4442007541656494, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 8.249196871188497, "learning_rate": 7.754010695187166e-07, "logits/chosen": -0.08438152074813843, "logits/rejected": 0.05761206895112991, "logps/chosen": -1.3430602550506592, "logps/rejected": -1.4420830011367798, "loss": 0.8477, "rewards/accuracies": 0.5, "rewards/chosen": -1.3430602550506592, "rewards/margins": 0.09902272373437881, "rewards/rejected": -1.4420830011367798, "sft_loss": 1.4123352766036987, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 8.009989628674303, "learning_rate": 8.021390374331551e-07, "logits/chosen": -0.12797826528549194, "logits/rejected": 0.01398629229515791, "logps/chosen": -1.283252239227295, "logps/rejected": -1.3031189441680908, "loss": 0.8647, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.283252239227295, "rewards/margins": 0.01986684277653694, "rewards/rejected": -1.3031189441680908, "sft_loss": 1.2768657207489014, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 5.021232147599984, "learning_rate": 8.288770053475937e-07, "logits/chosen": -0.12843099236488342, "logits/rejected": -0.0827844962477684, "logps/chosen": -1.298471212387085, "logps/rejected": -1.4227102994918823, "loss": 0.825, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.298471212387085, "rewards/margins": 0.12423906475305557, "rewards/rejected": -1.4227102994918823, "sft_loss": 1.3338967561721802, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 5.036413154312052, "learning_rate": 8.556149732620322e-07, "logits/chosen": -0.19850830733776093, "logits/rejected": -0.06920308619737625, "logps/chosen": -1.3978339433670044, "logps/rejected": -1.3811728954315186, "loss": 0.9003, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.3978339433670044, "rewards/margins": -0.016660813242197037, "rewards/rejected": -1.3811728954315186, "sft_loss": 1.3823540210723877, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 8.480938331155045, "learning_rate": 8.823529411764706e-07, "logits/chosen": -0.07313909381628036, "logits/rejected": 0.09209281206130981, "logps/chosen": -1.3041568994522095, "logps/rejected": -1.3962047100067139, "loss": 0.8479, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3041568994522095, "rewards/margins": 0.09204794466495514, "rewards/rejected": -1.3962047100067139, "sft_loss": 1.2982127666473389, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 9.899406966050948, "learning_rate": 9.090909090909091e-07, "logits/chosen": -0.12866242229938507, "logits/rejected": -0.07856561988592148, "logps/chosen": -1.4354071617126465, "logps/rejected": -1.5156786441802979, "loss": 0.8656, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.4354071617126465, "rewards/margins": 0.08027160167694092, "rewards/rejected": -1.5156786441802979, "sft_loss": 1.4268556833267212, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 7.339696230235005, "learning_rate": 9.358288770053477e-07, "logits/chosen": 0.010467484593391418, "logits/rejected": 0.01038367860019207, "logps/chosen": -1.3264710903167725, "logps/rejected": -1.4309437274932861, "loss": 0.8403, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3264710903167725, "rewards/margins": 0.10447286069393158, "rewards/rejected": -1.4309437274932861, "sft_loss": 1.357232928276062, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 6.350510580581484, "learning_rate": 9.625668449197862e-07, "logits/chosen": -0.0297098346054554, "logits/rejected": -0.030283737927675247, "logps/chosen": -1.3521928787231445, "logps/rejected": -1.5571117401123047, "loss": 0.8384, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3521928787231445, "rewards/margins": 0.2049189805984497, "rewards/rejected": -1.5571117401123047, "sft_loss": 1.3775081634521484, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 6.445053054684298, "learning_rate": 9.893048128342246e-07, "logits/chosen": -0.14473792910575867, "logits/rejected": -0.05903983116149902, "logps/chosen": -1.3446182012557983, "logps/rejected": -1.4061791896820068, "loss": 0.8627, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3446182012557983, "rewards/margins": 0.06156102567911148, "rewards/rejected": -1.4061791896820068, "sft_loss": 1.3568413257598877, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 7.0135353443302595, "learning_rate": 1.016042780748663e-06, "logits/chosen": -0.07751432806253433, "logits/rejected": 0.035579387098550797, "logps/chosen": -1.2608205080032349, "logps/rejected": -1.405822515487671, "loss": 0.809, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2608205080032349, "rewards/margins": 0.14500199258327484, "rewards/rejected": -1.405822515487671, "sft_loss": 1.288597583770752, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 4.414701651670081, "learning_rate": 1.0427807486631017e-06, "logits/chosen": 0.0497257336974144, "logits/rejected": 0.20516404509544373, "logps/chosen": -1.2470285892486572, "logps/rejected": -1.4114071130752563, "loss": 0.7964, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2470285892486572, "rewards/margins": 0.16437852382659912, "rewards/rejected": -1.4114071130752563, "sft_loss": 1.2925219535827637, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 14.857571461686588, "learning_rate": 1.0695187165775401e-06, "logits/chosen": -0.0697740688920021, "logits/rejected": 0.06182453781366348, "logps/chosen": -1.3620140552520752, "logps/rejected": -1.414411187171936, "loss": 0.8614, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3620140552520752, "rewards/margins": 0.05239716172218323, "rewards/rejected": -1.414411187171936, "sft_loss": 1.389324426651001, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 8.014046884380164, "learning_rate": 1.0962566844919785e-06, "logits/chosen": -0.04965759068727493, "logits/rejected": 0.08998225629329681, "logps/chosen": -1.2621620893478394, "logps/rejected": -1.3641141653060913, "loss": 0.8279, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2621620893478394, "rewards/margins": 0.10195207595825195, "rewards/rejected": -1.3641141653060913, "sft_loss": 1.2772701978683472, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 7.114198373387648, "learning_rate": 1.1229946524064172e-06, "logits/chosen": -0.13316112756729126, "logits/rejected": 0.047932691872119904, "logps/chosen": -1.3507158756256104, "logps/rejected": -1.4856475591659546, "loss": 0.8328, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3507158756256104, "rewards/margins": 0.13493159413337708, "rewards/rejected": -1.4856475591659546, "sft_loss": 1.339905023574829, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 6.438116798051675, "learning_rate": 1.1497326203208556e-06, "logits/chosen": -0.1692911684513092, "logits/rejected": 0.06463338434696198, "logps/chosen": -1.388016939163208, "logps/rejected": -1.4628031253814697, "loss": 0.8388, "rewards/accuracies": 0.5625, "rewards/chosen": -1.388016939163208, "rewards/margins": 0.07478625327348709, "rewards/rejected": -1.4628031253814697, "sft_loss": 1.367095947265625, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 13.122564944459192, "learning_rate": 1.1764705882352942e-06, "logits/chosen": 0.05476145073771477, "logits/rejected": 0.14929446578025818, "logps/chosen": -1.3221927881240845, "logps/rejected": -1.4936447143554688, "loss": 0.8145, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3221927881240845, "rewards/margins": 0.17145201563835144, "rewards/rejected": -1.4936447143554688, "sft_loss": 1.3450855016708374, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 4.810046075776873, "learning_rate": 1.2032085561497326e-06, "logits/chosen": -0.08290469646453857, "logits/rejected": 0.08052632957696915, "logps/chosen": -1.34523606300354, "logps/rejected": -1.4930176734924316, "loss": 0.8049, "rewards/accuracies": 0.5625, "rewards/chosen": -1.34523606300354, "rewards/margins": 0.1477815806865692, "rewards/rejected": -1.4930176734924316, "sft_loss": 1.3378939628601074, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 5.88697736522405, "learning_rate": 1.229946524064171e-06, "logits/chosen": 0.010171364061534405, "logits/rejected": 0.08339252322912216, "logps/chosen": -1.3613780736923218, "logps/rejected": -1.5585384368896484, "loss": 0.7995, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3613780736923218, "rewards/margins": 0.19716022908687592, "rewards/rejected": -1.5585384368896484, "sft_loss": 1.309051752090454, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 9.153755459677267, "learning_rate": 1.2566844919786097e-06, "logits/chosen": 0.040754929184913635, "logits/rejected": 0.17237837612628937, "logps/chosen": -1.3430492877960205, "logps/rejected": -1.5196940898895264, "loss": 0.7983, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3430492877960205, "rewards/margins": 0.17664454877376556, "rewards/rejected": -1.5196940898895264, "sft_loss": 1.3170509338378906, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 4.17884202482278, "learning_rate": 1.2834224598930481e-06, "logits/chosen": 0.014685697853565216, "logits/rejected": 0.13835473358631134, "logps/chosen": -1.3332364559173584, "logps/rejected": -1.5462366342544556, "loss": 0.8008, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3332364559173584, "rewards/margins": 0.21300017833709717, "rewards/rejected": -1.5462366342544556, "sft_loss": 1.3689817190170288, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 7.242285914484373, "learning_rate": 1.3101604278074866e-06, "logits/chosen": 0.03203669935464859, "logits/rejected": 0.1445348560810089, "logps/chosen": -1.4501844644546509, "logps/rejected": -1.5245463848114014, "loss": 0.8616, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4501844644546509, "rewards/margins": 0.0743618980050087, "rewards/rejected": -1.5245463848114014, "sft_loss": 1.4526886940002441, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 8.208482870825872, "learning_rate": 1.3368983957219252e-06, "logits/chosen": -0.05866866558790207, "logits/rejected": 0.09590274095535278, "logps/chosen": -1.342435359954834, "logps/rejected": -1.4371484518051147, "loss": 0.854, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.342435359954834, "rewards/margins": 0.0947132557630539, "rewards/rejected": -1.4371484518051147, "sft_loss": 1.3427950143814087, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 7.922536852476926, "learning_rate": 1.3636363636363636e-06, "logits/chosen": -0.007726128213107586, "logits/rejected": 0.1304907500743866, "logps/chosen": -1.3153059482574463, "logps/rejected": -1.4615005254745483, "loss": 0.8035, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3153059482574463, "rewards/margins": 0.14619456231594086, "rewards/rejected": -1.4615005254745483, "sft_loss": 1.2732080221176147, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 4.556168947130132, "learning_rate": 1.390374331550802e-06, "logits/chosen": -0.21302637457847595, "logits/rejected": -0.10784848779439926, "logps/chosen": -1.401681661605835, "logps/rejected": -1.5898513793945312, "loss": 0.7987, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.401681661605835, "rewards/margins": 0.18816980719566345, "rewards/rejected": -1.5898513793945312, "sft_loss": 1.4099761247634888, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 6.962383074856081, "learning_rate": 1.4171122994652407e-06, "logits/chosen": -0.06974764168262482, "logits/rejected": 0.011307650245726109, "logps/chosen": -1.3614174127578735, "logps/rejected": -1.5814176797866821, "loss": 0.8024, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3614174127578735, "rewards/margins": 0.2200002670288086, "rewards/rejected": -1.5814176797866821, "sft_loss": 1.4184319972991943, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 4.12622878270456, "learning_rate": 1.443850267379679e-06, "logits/chosen": -0.057353176176548004, "logits/rejected": 0.06626446545124054, "logps/chosen": -1.3444592952728271, "logps/rejected": -1.4615471363067627, "loss": 0.821, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3444592952728271, "rewards/margins": 0.11708767712116241, "rewards/rejected": -1.4615471363067627, "sft_loss": 1.3590993881225586, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 4.861276699079042, "learning_rate": 1.4705882352941175e-06, "logits/chosen": -0.013329749926924706, "logits/rejected": 0.08376727253198624, "logps/chosen": -1.285291075706482, "logps/rejected": -1.5019479990005493, "loss": 0.7831, "rewards/accuracies": 0.625, "rewards/chosen": -1.285291075706482, "rewards/margins": 0.21665680408477783, "rewards/rejected": -1.5019479990005493, "sft_loss": 1.275146245956421, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 8.273378567734328, "learning_rate": 1.4973262032085562e-06, "logits/chosen": -0.086963951587677, "logits/rejected": 0.061237942427396774, "logps/chosen": -1.3376895189285278, "logps/rejected": -1.4818319082260132, "loss": 0.8121, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3376895189285278, "rewards/margins": 0.14414247870445251, "rewards/rejected": -1.4818319082260132, "sft_loss": 1.3308953046798706, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 6.401594954449087, "learning_rate": 1.5240641711229948e-06, "logits/chosen": -0.03835974261164665, "logits/rejected": 0.10481540858745575, "logps/chosen": -1.3693543672561646, "logps/rejected": -1.4908616542816162, "loss": 0.8414, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3693543672561646, "rewards/margins": 0.12150740623474121, "rewards/rejected": -1.4908616542816162, "sft_loss": 1.4108846187591553, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 6.691312802522371, "learning_rate": 1.5508021390374332e-06, "logits/chosen": -0.10891927778720856, "logits/rejected": 0.17821714282035828, "logps/chosen": -1.3916670083999634, "logps/rejected": -1.5492541790008545, "loss": 0.8147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3916670083999634, "rewards/margins": 0.15758727490901947, "rewards/rejected": -1.5492541790008545, "sft_loss": 1.3846977949142456, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 6.145719512127213, "learning_rate": 1.5775401069518718e-06, "logits/chosen": -0.04946039989590645, "logits/rejected": 0.005695834755897522, "logps/chosen": -1.2853538990020752, "logps/rejected": -1.4636025428771973, "loss": 0.7909, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2853538990020752, "rewards/margins": 0.17824865877628326, "rewards/rejected": -1.4636025428771973, "sft_loss": 1.2956326007843018, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 6.108065113429084, "learning_rate": 1.6042780748663103e-06, "logits/chosen": -0.06966052949428558, "logits/rejected": 0.09140712022781372, "logps/chosen": -1.3252753019332886, "logps/rejected": -1.4613568782806396, "loss": 0.8185, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3252753019332886, "rewards/margins": 0.13608156144618988, "rewards/rejected": -1.4613568782806396, "sft_loss": 1.38861882686615, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 4.628431738835329, "learning_rate": 1.6310160427807487e-06, "logits/chosen": -0.016732195392251015, "logits/rejected": 0.05728600174188614, "logps/chosen": -1.4442397356033325, "logps/rejected": -1.4708423614501953, "loss": 0.8895, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4442397356033325, "rewards/margins": 0.026602596044540405, "rewards/rejected": -1.4708423614501953, "sft_loss": 1.4393965005874634, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 6.572080956668939, "learning_rate": 1.6577540106951873e-06, "logits/chosen": -0.20191077888011932, "logits/rejected": -0.11043348163366318, "logps/chosen": -1.400591492652893, "logps/rejected": -1.5339974164962769, "loss": 0.8448, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.400591492652893, "rewards/margins": 0.13340599834918976, "rewards/rejected": -1.5339974164962769, "sft_loss": 1.3988441228866577, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 7.112907967871754, "learning_rate": 1.6844919786096258e-06, "logits/chosen": -0.012886536307632923, "logits/rejected": 0.13783465325832367, "logps/chosen": -1.4094035625457764, "logps/rejected": -1.6090682744979858, "loss": 0.829, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4094035625457764, "rewards/margins": 0.1996646225452423, "rewards/rejected": -1.6090682744979858, "sft_loss": 1.4167879819869995, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 5.460820198591911, "learning_rate": 1.7112299465240644e-06, "logits/chosen": -0.07375101000070572, "logits/rejected": 0.057667456567287445, "logps/chosen": -1.3837597370147705, "logps/rejected": -1.4656795263290405, "loss": 0.8424, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3837597370147705, "rewards/margins": 0.08191985636949539, "rewards/rejected": -1.4656795263290405, "sft_loss": 1.3913055658340454, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 6.045630781542912, "learning_rate": 1.7379679144385028e-06, "logits/chosen": -0.142831951379776, "logits/rejected": -0.027666250243782997, "logps/chosen": -1.392391324043274, "logps/rejected": -1.7545061111450195, "loss": 0.7685, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.392391324043274, "rewards/margins": 0.36211466789245605, "rewards/rejected": -1.7545061111450195, "sft_loss": 1.4733895063400269, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 6.701291122816339, "learning_rate": 1.7647058823529412e-06, "logits/chosen": -0.015426212921738625, "logits/rejected": 0.12235834449529648, "logps/chosen": -1.4288122653961182, "logps/rejected": -1.6852651834487915, "loss": 0.7874, "rewards/accuracies": 0.625, "rewards/chosen": -1.4288122653961182, "rewards/margins": 0.2564530670642853, "rewards/rejected": -1.6852651834487915, "sft_loss": 1.4210307598114014, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 9.467335957114459, "learning_rate": 1.7914438502673799e-06, "logits/chosen": 0.03193669766187668, "logits/rejected": 0.13159868121147156, "logps/chosen": -1.4515048265457153, "logps/rejected": -1.5521752834320068, "loss": 0.8385, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.4515048265457153, "rewards/margins": 0.10067038238048553, "rewards/rejected": -1.5521752834320068, "sft_loss": 1.4200032949447632, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 18.183367879876617, "learning_rate": 1.8181818181818183e-06, "logits/chosen": -0.04067561402916908, "logits/rejected": 0.09822919219732285, "logps/chosen": -1.5396358966827393, "logps/rejected": -1.7132318019866943, "loss": 0.8609, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5396358966827393, "rewards/margins": 0.17359602451324463, "rewards/rejected": -1.7132318019866943, "sft_loss": 1.483871579170227, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 4.864162372810918, "learning_rate": 1.8449197860962567e-06, "logits/chosen": 0.03292379528284073, "logits/rejected": 0.0665588527917862, "logps/chosen": -1.3988286256790161, "logps/rejected": -1.6309274435043335, "loss": 0.7955, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3988286256790161, "rewards/margins": 0.23209872841835022, "rewards/rejected": -1.6309274435043335, "sft_loss": 1.4277395009994507, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 5.878593972427354, "learning_rate": 1.8716577540106954e-06, "logits/chosen": -0.009993275627493858, "logits/rejected": 0.07647766172885895, "logps/chosen": -1.339165210723877, "logps/rejected": -1.51967191696167, "loss": 0.818, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.339165210723877, "rewards/margins": 0.18050655722618103, "rewards/rejected": -1.51967191696167, "sft_loss": 1.3909133672714233, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 6.8717869858531975, "learning_rate": 1.8983957219251338e-06, "logits/chosen": -0.0906287282705307, "logits/rejected": 0.11716796457767487, "logps/chosen": -1.4287830591201782, "logps/rejected": -1.5410370826721191, "loss": 0.8524, "rewards/accuracies": 0.53125, "rewards/chosen": -1.4287830591201782, "rewards/margins": 0.11225400120019913, "rewards/rejected": -1.5410370826721191, "sft_loss": 1.4500958919525146, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 5.933343645442397, "learning_rate": 1.9251336898395724e-06, "logits/chosen": -0.050111234188079834, "logits/rejected": 0.031887516379356384, "logps/chosen": -1.3715035915374756, "logps/rejected": -1.5597730875015259, "loss": 0.8122, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3715035915374756, "rewards/margins": 0.1882694959640503, "rewards/rejected": -1.5597730875015259, "sft_loss": 1.3476603031158447, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 7.354723918458283, "learning_rate": 1.951871657754011e-06, "logits/chosen": 0.03143525868654251, "logits/rejected": 0.10601303726434708, "logps/chosen": -1.358642339706421, "logps/rejected": -1.5123530626296997, "loss": 0.8255, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.358642339706421, "rewards/margins": 0.15371054410934448, "rewards/rejected": -1.5123530626296997, "sft_loss": 1.3351366519927979, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 5.571755992115051, "learning_rate": 1.9786096256684493e-06, "logits/chosen": 0.012722733430564404, "logits/rejected": 0.10783351957798004, "logps/chosen": -1.3447506427764893, "logps/rejected": -1.456059217453003, "loss": 0.8237, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3447506427764893, "rewards/margins": 0.11130844056606293, "rewards/rejected": -1.456059217453003, "sft_loss": 1.3272932767868042, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 6.290620308688981, "learning_rate": 2.0053475935828877e-06, "logits/chosen": -0.052033692598342896, "logits/rejected": 0.10267876088619232, "logps/chosen": -1.3179277181625366, "logps/rejected": -1.5607457160949707, "loss": 0.7856, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3179277181625366, "rewards/margins": 0.24281811714172363, "rewards/rejected": -1.5607457160949707, "sft_loss": 1.3819175958633423, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 7.164507205084144, "learning_rate": 2.032085561497326e-06, "logits/chosen": -0.017917213961482048, "logits/rejected": 0.06469281017780304, "logps/chosen": -1.3857579231262207, "logps/rejected": -1.5951461791992188, "loss": 0.7983, "rewards/accuracies": 0.625, "rewards/chosen": -1.3857579231262207, "rewards/margins": 0.20938809216022491, "rewards/rejected": -1.5951461791992188, "sft_loss": 1.389345407485962, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 4.805975521600569, "learning_rate": 2.058823529411765e-06, "logits/chosen": 0.04141540080308914, "logits/rejected": 0.1269879788160324, "logps/chosen": -1.487316608428955, "logps/rejected": -1.549757719039917, "loss": 0.8833, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.487316608428955, "rewards/margins": 0.062441110610961914, "rewards/rejected": -1.549757719039917, "sft_loss": 1.4863076210021973, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 7.329243789675164, "learning_rate": 2.0855614973262034e-06, "logits/chosen": 0.08908367156982422, "logits/rejected": 0.2541596293449402, "logps/chosen": -1.49787437915802, "logps/rejected": -1.6316359043121338, "loss": 0.8545, "rewards/accuracies": 0.5, "rewards/chosen": -1.49787437915802, "rewards/margins": 0.1337617188692093, "rewards/rejected": -1.6316359043121338, "sft_loss": 1.4867982864379883, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 6.757557326800351, "learning_rate": 2.112299465240642e-06, "logits/chosen": -0.006264881696552038, "logits/rejected": 0.16049674153327942, "logps/chosen": -1.4501184225082397, "logps/rejected": -1.5559766292572021, "loss": 0.8322, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4501184225082397, "rewards/margins": 0.10585812479257584, "rewards/rejected": -1.5559766292572021, "sft_loss": 1.4444706439971924, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 7.791777236601081, "learning_rate": 2.1390374331550802e-06, "logits/chosen": 0.08647340536117554, "logits/rejected": 0.17804650962352753, "logps/chosen": -1.4405090808868408, "logps/rejected": -1.6294504404067993, "loss": 0.7978, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4405090808868408, "rewards/margins": 0.18894155323505402, "rewards/rejected": -1.6294504404067993, "sft_loss": 1.4118216037750244, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.2666130065917969, "eval_logits/rejected": 0.3544902503490448, "eval_logps/chosen": -1.4694241285324097, "eval_logps/rejected": -1.7231982946395874, "eval_loss": 0.7942562699317932, "eval_rewards/accuracies": 0.6045994162559509, "eval_rewards/chosen": -1.4694241285324097, "eval_rewards/margins": 0.25377434492111206, "eval_rewards/rejected": -1.7231982946395874, "eval_runtime": 45.6824, "eval_samples_per_second": 29.442, "eval_sft_loss": 1.4613449573516846, "eval_steps_per_second": 7.377, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 7.623090780501526, "learning_rate": 2.1657754010695186e-06, "logits/chosen": -0.006093514151871204, "logits/rejected": 0.08722630143165588, "logps/chosen": -1.4930610656738281, "logps/rejected": -1.7053945064544678, "loss": 0.8255, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4930610656738281, "rewards/margins": 0.2123335301876068, "rewards/rejected": -1.7053945064544678, "sft_loss": 1.4521852731704712, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 6.970243240124782, "learning_rate": 2.192513368983957e-06, "logits/chosen": 0.0605601966381073, "logits/rejected": 0.1913018524646759, "logps/chosen": -1.4390863180160522, "logps/rejected": -1.6598100662231445, "loss": 0.8031, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4390863180160522, "rewards/margins": 0.2207239419221878, "rewards/rejected": -1.6598100662231445, "sft_loss": 1.4593582153320312, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 5.0092673817044595, "learning_rate": 2.219251336898396e-06, "logits/chosen": 0.055453162640333176, "logits/rejected": 0.09989786893129349, "logps/chosen": -1.469822883605957, "logps/rejected": -1.7281382083892822, "loss": 0.7949, "rewards/accuracies": 0.5625, "rewards/chosen": -1.469822883605957, "rewards/margins": 0.2583152651786804, "rewards/rejected": -1.7281382083892822, "sft_loss": 1.4433677196502686, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 6.690456611021986, "learning_rate": 2.2459893048128343e-06, "logits/chosen": 0.027899065986275673, "logits/rejected": 0.23041871190071106, "logps/chosen": -1.4033445119857788, "logps/rejected": -1.6399847269058228, "loss": 0.8002, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4033445119857788, "rewards/margins": 0.23664002120494843, "rewards/rejected": -1.6399847269058228, "sft_loss": 1.4191946983337402, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 6.857911690897056, "learning_rate": 2.2727272727272728e-06, "logits/chosen": 0.0018939822912216187, "logits/rejected": 0.20424532890319824, "logps/chosen": -1.4903066158294678, "logps/rejected": -1.8016868829727173, "loss": 0.785, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4903066158294678, "rewards/margins": 0.31138020753860474, "rewards/rejected": -1.8016868829727173, "sft_loss": 1.5297497510910034, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 7.467088529413112, "learning_rate": 2.299465240641711e-06, "logits/chosen": -0.03759707883000374, "logits/rejected": 0.16821780800819397, "logps/chosen": -1.475658655166626, "logps/rejected": -1.8194904327392578, "loss": 0.782, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.475658655166626, "rewards/margins": 0.34383195638656616, "rewards/rejected": -1.8194904327392578, "sft_loss": 1.495807409286499, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 8.391376519531686, "learning_rate": 2.3262032085561496e-06, "logits/chosen": 0.03897671774029732, "logits/rejected": 0.1309877336025238, "logps/chosen": -1.4078749418258667, "logps/rejected": -1.6813606023788452, "loss": 0.7738, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4078749418258667, "rewards/margins": 0.27348554134368896, "rewards/rejected": -1.6813606023788452, "sft_loss": 1.4570095539093018, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 6.652392161153766, "learning_rate": 2.3529411764705885e-06, "logits/chosen": 0.03420887142419815, "logits/rejected": 0.13482454419136047, "logps/chosen": -1.5327328443527222, "logps/rejected": -1.7989847660064697, "loss": 0.8044, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5327328443527222, "rewards/margins": 0.26625189185142517, "rewards/rejected": -1.7989847660064697, "sft_loss": 1.502314805984497, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 13.064384890919003, "learning_rate": 2.379679144385027e-06, "logits/chosen": -0.004420773591846228, "logits/rejected": 0.11586644500494003, "logps/chosen": -1.6088615655899048, "logps/rejected": -2.0403130054473877, "loss": 0.7994, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6088615655899048, "rewards/margins": 0.43145138025283813, "rewards/rejected": -2.0403130054473877, "sft_loss": 1.5593528747558594, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 10.216812585799813, "learning_rate": 2.4064171122994653e-06, "logits/chosen": 0.03817467391490936, "logits/rejected": 0.16011175513267517, "logps/chosen": -1.5901763439178467, "logps/rejected": -1.9021117687225342, "loss": 0.7844, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5901763439178467, "rewards/margins": 0.31193557381629944, "rewards/rejected": -1.9021117687225342, "sft_loss": 1.5255615711212158, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 7.038546807134959, "learning_rate": 2.4331550802139037e-06, "logits/chosen": -0.0005314975860528648, "logits/rejected": 0.08741128444671631, "logps/chosen": -1.4183223247528076, "logps/rejected": -1.8327878713607788, "loss": 0.7654, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4183223247528076, "rewards/margins": 0.4144655764102936, "rewards/rejected": -1.8327878713607788, "sft_loss": 1.4563426971435547, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 5.275609189536791, "learning_rate": 2.459893048128342e-06, "logits/chosen": -0.1312110722064972, "logits/rejected": -0.017421646043658257, "logps/chosen": -1.5381498336791992, "logps/rejected": -1.7059634923934937, "loss": 0.8502, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5381498336791992, "rewards/margins": 0.16781362891197205, "rewards/rejected": -1.7059634923934937, "sft_loss": 1.5450729131698608, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 6.041385703241205, "learning_rate": 2.4866310160427806e-06, "logits/chosen": 0.10603974759578705, "logits/rejected": 0.12657591700553894, "logps/chosen": -1.4376887083053589, "logps/rejected": -1.6697343587875366, "loss": 0.8092, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4376887083053589, "rewards/margins": 0.2320457249879837, "rewards/rejected": -1.6697343587875366, "sft_loss": 1.4443788528442383, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 6.11710821245787, "learning_rate": 2.5133689839572194e-06, "logits/chosen": 0.14308522641658783, "logits/rejected": 0.0967307984828949, "logps/chosen": -1.394923448562622, "logps/rejected": -1.6780554056167603, "loss": 0.7755, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.394923448562622, "rewards/margins": 0.2831319272518158, "rewards/rejected": -1.6780554056167603, "sft_loss": 1.418574571609497, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 6.198722703692081, "learning_rate": 2.540106951871658e-06, "logits/chosen": -0.09760783612728119, "logits/rejected": 0.01914280094206333, "logps/chosen": -1.4242517948150635, "logps/rejected": -1.8115177154541016, "loss": 0.7662, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4242517948150635, "rewards/margins": 0.3872661590576172, "rewards/rejected": -1.8115177154541016, "sft_loss": 1.4599872827529907, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 6.672155844526494, "learning_rate": 2.5668449197860963e-06, "logits/chosen": -0.088392473757267, "logits/rejected": 0.1002928838133812, "logps/chosen": -1.3997070789337158, "logps/rejected": -1.6192430257797241, "loss": 0.799, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3997070789337158, "rewards/margins": 0.21953590214252472, "rewards/rejected": -1.6192430257797241, "sft_loss": 1.4144244194030762, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 7.2195539216686075, "learning_rate": 2.5935828877005347e-06, "logits/chosen": -0.06334365159273148, "logits/rejected": -0.02100563421845436, "logps/chosen": -1.516610860824585, "logps/rejected": -1.7579715251922607, "loss": 0.8013, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.516610860824585, "rewards/margins": 0.24136073887348175, "rewards/rejected": -1.7579715251922607, "sft_loss": 1.4901747703552246, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 12.734882055380117, "learning_rate": 2.620320855614973e-06, "logits/chosen": -0.03537796065211296, "logits/rejected": 0.03429726883769035, "logps/chosen": -1.5109494924545288, "logps/rejected": -1.7095212936401367, "loss": 0.8352, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5109494924545288, "rewards/margins": 0.19857171177864075, "rewards/rejected": -1.7095212936401367, "sft_loss": 1.469315528869629, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 7.778351147647691, "learning_rate": 2.647058823529412e-06, "logits/chosen": -0.10972901433706284, "logits/rejected": -0.08436641842126846, "logps/chosen": -1.5129988193511963, "logps/rejected": -1.6879619359970093, "loss": 0.8408, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5129988193511963, "rewards/margins": 0.17496302723884583, "rewards/rejected": -1.6879619359970093, "sft_loss": 1.5606034994125366, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 5.42051394350465, "learning_rate": 2.6737967914438504e-06, "logits/chosen": -0.1078430786728859, "logits/rejected": -0.01456520240753889, "logps/chosen": -1.383042812347412, "logps/rejected": -1.6418497562408447, "loss": 0.7912, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.383042812347412, "rewards/margins": 0.25880688428878784, "rewards/rejected": -1.6418497562408447, "sft_loss": 1.4060033559799194, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 7.468246318755707, "learning_rate": 2.700534759358289e-06, "logits/chosen": -0.10797281563282013, "logits/rejected": 0.03077404573559761, "logps/chosen": -1.4886077642440796, "logps/rejected": -1.6428436040878296, "loss": 0.8322, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4886077642440796, "rewards/margins": 0.15423579514026642, "rewards/rejected": -1.6428436040878296, "sft_loss": 1.4929144382476807, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 5.792355588006178, "learning_rate": 2.7272727272727272e-06, "logits/chosen": 0.043688975274562836, "logits/rejected": 0.10481522977352142, "logps/chosen": -1.439705491065979, "logps/rejected": -1.7611348628997803, "loss": 0.7658, "rewards/accuracies": 0.625, "rewards/chosen": -1.439705491065979, "rewards/margins": 0.32142946124076843, "rewards/rejected": -1.7611348628997803, "sft_loss": 1.4002330303192139, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 5.017281691975952, "learning_rate": 2.7540106951871656e-06, "logits/chosen": 0.0008904725546017289, "logits/rejected": 0.09226072579622269, "logps/chosen": -1.3720366954803467, "logps/rejected": -1.6341902017593384, "loss": 0.7902, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3720366954803467, "rewards/margins": 0.262153297662735, "rewards/rejected": -1.6341902017593384, "sft_loss": 1.401508092880249, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 5.805813893255752, "learning_rate": 2.780748663101604e-06, "logits/chosen": -0.13670828938484192, "logits/rejected": -0.0036086924374103546, "logps/chosen": -1.4570739269256592, "logps/rejected": -1.7094089984893799, "loss": 0.8105, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4570739269256592, "rewards/margins": 0.2523351311683655, "rewards/rejected": -1.7094089984893799, "sft_loss": 1.5445858240127563, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 13.47963552520056, "learning_rate": 2.807486631016043e-06, "logits/chosen": 0.04953201487660408, "logits/rejected": 0.11939598619937897, "logps/chosen": -1.4688103199005127, "logps/rejected": -1.7682476043701172, "loss": 0.8014, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4688103199005127, "rewards/margins": 0.29943740367889404, "rewards/rejected": -1.7682476043701172, "sft_loss": 1.5212879180908203, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 6.1329588367259005, "learning_rate": 2.8342245989304813e-06, "logits/chosen": -0.01726909913122654, "logits/rejected": 0.06347990781068802, "logps/chosen": -1.4241316318511963, "logps/rejected": -1.7015587091445923, "loss": 0.7945, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4241316318511963, "rewards/margins": 0.2774270474910736, "rewards/rejected": -1.7015587091445923, "sft_loss": 1.3766034841537476, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 5.483486136121046, "learning_rate": 2.8609625668449198e-06, "logits/chosen": -0.18864202499389648, "logits/rejected": 0.040623150765895844, "logps/chosen": -1.4134786128997803, "logps/rejected": -1.638918161392212, "loss": 0.7908, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4134786128997803, "rewards/margins": 0.22543945908546448, "rewards/rejected": -1.638918161392212, "sft_loss": 1.3824502229690552, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 8.233006873244952, "learning_rate": 2.887700534759358e-06, "logits/chosen": -0.08203691244125366, "logits/rejected": -0.01693258062005043, "logps/chosen": -1.5680696964263916, "logps/rejected": -1.7938575744628906, "loss": 0.8159, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5680696964263916, "rewards/margins": 0.22578780353069305, "rewards/rejected": -1.7938575744628906, "sft_loss": 1.5657321214675903, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 5.350671086008654, "learning_rate": 2.9144385026737966e-06, "logits/chosen": -0.19635380804538727, "logits/rejected": -0.014688762836158276, "logps/chosen": -1.4838629961013794, "logps/rejected": -1.8379631042480469, "loss": 0.7691, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4838629961013794, "rewards/margins": 0.35410040616989136, "rewards/rejected": -1.8379631042480469, "sft_loss": 1.482094407081604, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 5.3327033465197236, "learning_rate": 2.941176470588235e-06, "logits/chosen": -0.07292584329843521, "logits/rejected": -0.01656881347298622, "logps/chosen": -1.5360199213027954, "logps/rejected": -1.8137409687042236, "loss": 0.7828, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5360199213027954, "rewards/margins": 0.2777208983898163, "rewards/rejected": -1.8137409687042236, "sft_loss": 1.479871153831482, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 8.691037453690216, "learning_rate": 2.967914438502674e-06, "logits/chosen": -0.18928229808807373, "logits/rejected": -0.08067715167999268, "logps/chosen": -1.572627067565918, "logps/rejected": -1.7626476287841797, "loss": 0.8448, "rewards/accuracies": 0.5625, "rewards/chosen": -1.572627067565918, "rewards/margins": 0.19002054631710052, "rewards/rejected": -1.7626476287841797, "sft_loss": 1.5602760314941406, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 4.949138893882736, "learning_rate": 2.9946524064171123e-06, "logits/chosen": -0.03637278825044632, "logits/rejected": -0.019715752452611923, "logps/chosen": -1.4272973537445068, "logps/rejected": -1.723838210105896, "loss": 0.7948, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4272973537445068, "rewards/margins": 0.29654109477996826, "rewards/rejected": -1.723838210105896, "sft_loss": 1.5645406246185303, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 4.689383987788954, "learning_rate": 2.999995343036539e-06, "logits/chosen": -0.038937196135520935, "logits/rejected": 0.014615943655371666, "logps/chosen": -1.4650460481643677, "logps/rejected": -1.75836181640625, "loss": 0.7855, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4650460481643677, "rewards/margins": 0.29331594705581665, "rewards/rejected": -1.75836181640625, "sft_loss": 1.5022021532058716, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 8.336253133816138, "learning_rate": 2.9999764241720397e-06, "logits/chosen": -0.14228703081607819, "logits/rejected": 0.06473597139120102, "logps/chosen": -1.4488310813903809, "logps/rejected": -1.7192370891571045, "loss": 0.7972, "rewards/accuracies": 0.625, "rewards/chosen": -1.4488310813903809, "rewards/margins": 0.27040624618530273, "rewards/rejected": -1.7192370891571045, "sft_loss": 1.5263842344284058, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 5.718887405179139, "learning_rate": 2.9999429525296936e-06, "logits/chosen": -0.12334390729665756, "logits/rejected": -0.06649667024612427, "logps/chosen": -1.4215309619903564, "logps/rejected": -1.7182035446166992, "loss": 0.7879, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4215309619903564, "rewards/margins": 0.2966725826263428, "rewards/rejected": -1.7182035446166992, "sft_loss": 1.4421032667160034, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 5.482762190871826, "learning_rate": 2.9998949284342434e-06, "logits/chosen": -0.12731342017650604, "logits/rejected": 0.018348468467593193, "logps/chosen": -1.4773890972137451, "logps/rejected": -1.9029382467269897, "loss": 0.7485, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4773890972137451, "rewards/margins": 0.4255490303039551, "rewards/rejected": -1.9029382467269897, "sft_loss": 1.5035072565078735, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 7.504671278923474, "learning_rate": 2.99983235235162e-06, "logits/chosen": -0.19935433566570282, "logits/rejected": -0.0978468805551529, "logps/chosen": -1.7026008367538452, "logps/rejected": -1.903778314590454, "loss": 0.8722, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.7026008367538452, "rewards/margins": 0.20117728412151337, "rewards/rejected": -1.903778314590454, "sft_loss": 1.6554231643676758, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 8.419389774679347, "learning_rate": 2.999755224888935e-06, "logits/chosen": -0.15665586292743683, "logits/rejected": -0.043528031557798386, "logps/chosen": -1.57762610912323, "logps/rejected": -1.7399566173553467, "loss": 0.8459, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.57762610912323, "rewards/margins": 0.16233067214488983, "rewards/rejected": -1.7399566173553467, "sft_loss": 1.5726512670516968, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 6.543602385073035, "learning_rate": 2.9996635467944813e-06, "logits/chosen": -0.06690023839473724, "logits/rejected": 0.0520065613090992, "logps/chosen": -1.449690818786621, "logps/rejected": -1.726157784461975, "loss": 0.7931, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.449690818786621, "rewards/margins": 0.2764667570590973, "rewards/rejected": -1.726157784461975, "sft_loss": 1.4725532531738281, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 5.818392123470136, "learning_rate": 2.999557318957719e-06, "logits/chosen": -0.13861538469791412, "logits/rejected": 0.00308010121807456, "logps/chosen": -1.4782884120941162, "logps/rejected": -1.6739435195922852, "loss": 0.8127, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4782884120941162, "rewards/margins": 0.19565510749816895, "rewards/rejected": -1.6739435195922852, "sft_loss": 1.4908336400985718, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 8.237093368806667, "learning_rate": 2.9994365424092717e-06, "logits/chosen": -0.1953982412815094, "logits/rejected": -0.11716214567422867, "logps/chosen": -1.5879408121109009, "logps/rejected": -1.8976964950561523, "loss": 0.8077, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5879408121109009, "rewards/margins": 0.3097555637359619, "rewards/rejected": -1.8976964950561523, "sft_loss": 1.5898962020874023, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 12.66322925163889, "learning_rate": 2.9993012183209135e-06, "logits/chosen": -0.07549289613962173, "logits/rejected": 0.07459872961044312, "logps/chosen": -1.5181553363800049, "logps/rejected": -1.8104722499847412, "loss": 0.8062, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5181553363800049, "rewards/margins": 0.2923170328140259, "rewards/rejected": -1.8104722499847412, "sft_loss": 1.50444757938385, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 6.40661361783137, "learning_rate": 2.9991513480055592e-06, "logits/chosen": -0.16410446166992188, "logits/rejected": -0.05960635468363762, "logps/chosen": -1.53480863571167, "logps/rejected": -1.9423625469207764, "loss": 0.7633, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.53480863571167, "rewards/margins": 0.4075539708137512, "rewards/rejected": -1.9423625469207764, "sft_loss": 1.5325727462768555, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 5.863570720455386, "learning_rate": 2.998986932917252e-06, "logits/chosen": -0.04310298711061478, "logits/rejected": 0.019075483083724976, "logps/chosen": -1.607175588607788, "logps/rejected": -1.897870659828186, "loss": 0.808, "rewards/accuracies": 0.625, "rewards/chosen": -1.607175588607788, "rewards/margins": 0.2906948924064636, "rewards/rejected": -1.897870659828186, "sft_loss": 1.605529546737671, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 7.59931372209281, "learning_rate": 2.998807974651147e-06, "logits/chosen": -0.0006595879676751792, "logits/rejected": 0.11075425148010254, "logps/chosen": -1.487177848815918, "logps/rejected": -1.888089895248413, "loss": 0.7655, "rewards/accuracies": 0.625, "rewards/chosen": -1.487177848815918, "rewards/margins": 0.4009120464324951, "rewards/rejected": -1.888089895248413, "sft_loss": 1.5196160078048706, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 10.94150119403967, "learning_rate": 2.9986144749434987e-06, "logits/chosen": -0.06883852183818817, "logits/rejected": 0.04102737456560135, "logps/chosen": -1.5510679483413696, "logps/rejected": -1.9344244003295898, "loss": 0.7503, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5510679483413696, "rewards/margins": 0.38335639238357544, "rewards/rejected": -1.9344244003295898, "sft_loss": 1.5082122087478638, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 5.977494774665963, "learning_rate": 2.9984064356716413e-06, "logits/chosen": -0.05886172130703926, "logits/rejected": 0.18332821130752563, "logps/chosen": -1.6331151723861694, "logps/rejected": -1.9419275522232056, "loss": 0.8171, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6331151723861694, "rewards/margins": 0.30881232023239136, "rewards/rejected": -1.9419275522232056, "sft_loss": 1.6244218349456787, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 15.23682265863437, "learning_rate": 2.998183858853974e-06, "logits/chosen": -0.16301007568836212, "logits/rejected": 0.0349007211625576, "logps/chosen": -1.6005462408065796, "logps/rejected": -1.9187949895858765, "loss": 0.8161, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.6005462408065796, "rewards/margins": 0.31824877858161926, "rewards/rejected": -1.9187949895858765, "sft_loss": 1.648459792137146, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 6.423514529899254, "learning_rate": 2.997946746649937e-06, "logits/chosen": -0.12605305016040802, "logits/rejected": -0.04022118076682091, "logps/chosen": -1.4993903636932373, "logps/rejected": -1.9236873388290405, "loss": 0.749, "rewards/accuracies": 0.625, "rewards/chosen": -1.4993903636932373, "rewards/margins": 0.4242970943450928, "rewards/rejected": -1.9236873388290405, "sft_loss": 1.4816380739212036, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 8.148623275430047, "learning_rate": 2.997695101359994e-06, "logits/chosen": -0.10559117794036865, "logits/rejected": 0.036014266312122345, "logps/chosen": -1.6362364292144775, "logps/rejected": -1.9686295986175537, "loss": 0.7973, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6362364292144775, "rewards/margins": 0.3323933184146881, "rewards/rejected": -1.9686295986175537, "sft_loss": 1.6498661041259766, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 12.37187154833635, "learning_rate": 2.997428925425609e-06, "logits/chosen": -0.03357526287436485, "logits/rejected": -0.016637753695249557, "logps/chosen": -1.5325089693069458, "logps/rejected": -1.891472578048706, "loss": 0.8019, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5325089693069458, "rewards/margins": 0.3589635193347931, "rewards/rejected": -1.891472578048706, "sft_loss": 1.5624765157699585, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 7.119224348231651, "learning_rate": 2.997148221429223e-06, "logits/chosen": -0.044113095849752426, "logits/rejected": 0.07033444941043854, "logps/chosen": -1.4734444618225098, "logps/rejected": -1.6820869445800781, "loss": 0.8191, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4734444618225098, "rewards/margins": 0.20864248275756836, "rewards/rejected": -1.6820869445800781, "sft_loss": 1.510671854019165, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 7.052877336685435, "learning_rate": 2.996852992094225e-06, "logits/chosen": -0.07160480320453644, "logits/rejected": 0.055411409586668015, "logps/chosen": -1.4131146669387817, "logps/rejected": -1.682700514793396, "loss": 0.7936, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4131146669387817, "rewards/margins": 0.2695859670639038, "rewards/rejected": -1.682700514793396, "sft_loss": 1.4746235609054565, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 4.749428938217011, "learning_rate": 2.9965432402849336e-06, "logits/chosen": -0.09310106933116913, "logits/rejected": 0.09594430029392242, "logps/chosen": -1.4437072277069092, "logps/rejected": -1.6759834289550781, "loss": 0.7985, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4437072277069092, "rewards/margins": 0.23227599263191223, "rewards/rejected": -1.6759834289550781, "sft_loss": 1.5473780632019043, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 5.946458004286902, "learning_rate": 2.9962189690065614e-06, "logits/chosen": -0.0892430767416954, "logits/rejected": -0.020457569509744644, "logps/chosen": -1.4541094303131104, "logps/rejected": -1.8192046880722046, "loss": 0.7613, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4541094303131104, "rewards/margins": 0.3650952875614166, "rewards/rejected": -1.8192046880722046, "sft_loss": 1.5087134838104248, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 5.708891942201608, "learning_rate": 2.99588018140519e-06, "logits/chosen": -0.013011714443564415, "logits/rejected": 0.1486460268497467, "logps/chosen": -1.5294251441955566, "logps/rejected": -1.7828242778778076, "loss": 0.855, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.5294251441955566, "rewards/margins": 0.2533993124961853, "rewards/rejected": -1.7828242778778076, "sft_loss": 1.5098159313201904, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 10.640059246416705, "learning_rate": 2.995526880767737e-06, "logits/chosen": -0.04731638729572296, "logits/rejected": 0.10229668766260147, "logps/chosen": -1.5006544589996338, "logps/rejected": -1.7863868474960327, "loss": 0.8041, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5006544589996338, "rewards/margins": 0.28573232889175415, "rewards/rejected": -1.7863868474960327, "sft_loss": 1.506460428237915, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 7.265291049700372, "learning_rate": 2.9951590705219287e-06, "logits/chosen": -0.07539691030979156, "logits/rejected": -0.03621528297662735, "logps/chosen": -1.5116404294967651, "logps/rejected": -1.7689307928085327, "loss": 0.8229, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5116404294967651, "rewards/margins": 0.257290244102478, "rewards/rejected": -1.7689307928085327, "sft_loss": 1.5742204189300537, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 7.374285279419172, "learning_rate": 2.99477675423626e-06, "logits/chosen": -0.10970363765954971, "logits/rejected": -0.021756969392299652, "logps/chosen": -1.4470633268356323, "logps/rejected": -1.776341199874878, "loss": 0.7694, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4470633268356323, "rewards/margins": 0.3292779326438904, "rewards/rejected": -1.776341199874878, "sft_loss": 1.4733977317810059, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 8.942277187244555, "learning_rate": 2.994379935619966e-06, "logits/chosen": -0.21716026961803436, "logits/rejected": -0.07959534227848053, "logps/chosen": -1.6371448040008545, "logps/rejected": -1.8145755529403687, "loss": 0.8325, "rewards/accuracies": 0.625, "rewards/chosen": -1.6371448040008545, "rewards/margins": 0.17743055522441864, "rewards/rejected": -1.8145755529403687, "sft_loss": 1.5957849025726318, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 5.15267596889546, "learning_rate": 2.9939686185229826e-06, "logits/chosen": -0.19370415806770325, "logits/rejected": -0.0161186084151268, "logps/chosen": -1.5457772016525269, "logps/rejected": -1.9858157634735107, "loss": 0.7568, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5457772016525269, "rewards/margins": 0.4400387704372406, "rewards/rejected": -1.9858157634735107, "sft_loss": 1.551759958267212, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 8.31494704459637, "learning_rate": 2.9935428069359103e-06, "logits/chosen": -0.06963808834552765, "logits/rejected": 0.02257571741938591, "logps/chosen": -1.538611888885498, "logps/rejected": -1.8600261211395264, "loss": 0.7655, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.538611888885498, "rewards/margins": 0.32141461968421936, "rewards/rejected": -1.8600261211395264, "sft_loss": 1.5300943851470947, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 12.46625655578267, "learning_rate": 2.9931025049899744e-06, "logits/chosen": -0.16380861401557922, "logits/rejected": 0.0008494898793287575, "logps/chosen": -1.6448299884796143, "logps/rejected": -1.9350553750991821, "loss": 0.791, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6448299884796143, "rewards/margins": 0.2902253568172455, "rewards/rejected": -1.9350553750991821, "sft_loss": 1.5562912225723267, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 7.382525036999301, "learning_rate": 2.9926477169569865e-06, "logits/chosen": -0.07919908314943314, "logits/rejected": 0.1075511947274208, "logps/chosen": -1.721343994140625, "logps/rejected": -2.0451881885528564, "loss": 0.8448, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.721343994140625, "rewards/margins": 0.32384395599365234, "rewards/rejected": -2.0451881885528564, "sft_loss": 1.6712074279785156, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 6.674161020309135, "learning_rate": 2.9921784472493023e-06, "logits/chosen": -0.16972795128822327, "logits/rejected": -0.03377728909254074, "logps/chosen": -1.4200472831726074, "logps/rejected": -1.7975457906723022, "loss": 0.7429, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4200472831726074, "rewards/margins": 0.3774986267089844, "rewards/rejected": -1.7975457906723022, "sft_loss": 1.4895946979522705, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 6.0055759116105785, "learning_rate": 2.9916947004197784e-06, "logits/chosen": -0.25825509428977966, "logits/rejected": -0.10662020742893219, "logps/chosen": -1.5190376043319702, "logps/rejected": -1.762810468673706, "loss": 0.8038, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5190376043319702, "rewards/margins": 0.2437729835510254, "rewards/rejected": -1.762810468673706, "sft_loss": 1.5322901010513306, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 5.420287019764337, "learning_rate": 2.9911964811617288e-06, "logits/chosen": -0.22347505390644073, "logits/rejected": -0.127300426363945, "logps/chosen": -1.530833125114441, "logps/rejected": -1.767128586769104, "loss": 0.8071, "rewards/accuracies": 0.625, "rewards/chosen": -1.530833125114441, "rewards/margins": 0.23629550635814667, "rewards/rejected": -1.767128586769104, "sft_loss": 1.5706127882003784, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 9.893102035807134, "learning_rate": 2.990683794308879e-06, "logits/chosen": -0.20974227786064148, "logits/rejected": -0.048324812203645706, "logps/chosen": -1.6057713031768799, "logps/rejected": -1.8644367456436157, "loss": 0.813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6057713031768799, "rewards/margins": 0.25866541266441345, "rewards/rejected": -1.8644367456436157, "sft_loss": 1.5996134281158447, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 5.124644195602197, "learning_rate": 2.990156644835318e-06, "logits/chosen": -0.10994444787502289, "logits/rejected": -0.04099906235933304, "logps/chosen": -1.5882176160812378, "logps/rejected": -1.9525806903839111, "loss": 0.7969, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5882176160812378, "rewards/margins": 0.36436301469802856, "rewards/rejected": -1.9525806903839111, "sft_loss": 1.5708587169647217, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 4.277626498532557, "learning_rate": 2.989615037855454e-06, "logits/chosen": -0.20711994171142578, "logits/rejected": -0.0582943931221962, "logps/chosen": -1.5332410335540771, "logps/rejected": -1.9215202331542969, "loss": 0.7673, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5332410335540771, "rewards/margins": 0.3882790207862854, "rewards/rejected": -1.9215202331542969, "sft_loss": 1.548135757446289, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 6.281198041488248, "learning_rate": 2.98905897862396e-06, "logits/chosen": -0.13693243265151978, "logits/rejected": -0.014755794778466225, "logps/chosen": -1.5604966878890991, "logps/rejected": -1.7795257568359375, "loss": 0.8295, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5604966878890991, "rewards/margins": 0.21902918815612793, "rewards/rejected": -1.7795257568359375, "sft_loss": 1.5735958814620972, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 6.312433351409002, "learning_rate": 2.9884884725357237e-06, "logits/chosen": -0.2737407088279724, "logits/rejected": -0.20347031950950623, "logps/chosen": -1.5391143560409546, "logps/rejected": -1.8454147577285767, "loss": 0.7853, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5391143560409546, "rewards/margins": 0.3063003718852997, "rewards/rejected": -1.8454147577285767, "sft_loss": 1.5820717811584473, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 5.678304869448927, "learning_rate": 2.9879035251257994e-06, "logits/chosen": -0.2150966376066208, "logits/rejected": -0.129004567861557, "logps/chosen": -1.5198668241500854, "logps/rejected": -1.7554266452789307, "loss": 0.7937, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5198668241500854, "rewards/margins": 0.23555977642536163, "rewards/rejected": -1.7554266452789307, "sft_loss": 1.5115150213241577, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 6.440836345300536, "learning_rate": 2.9873041420693485e-06, "logits/chosen": -0.1084686741232872, "logits/rejected": 0.018399396911263466, "logps/chosen": -1.4943346977233887, "logps/rejected": -1.9200446605682373, "loss": 0.7576, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4943346977233887, "rewards/margins": 0.42570990324020386, "rewards/rejected": -1.9200446605682373, "sft_loss": 1.489121913909912, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 5.5172545468887035, "learning_rate": 2.9866903291815874e-06, "logits/chosen": -0.2743126451969147, "logits/rejected": -0.1022576093673706, "logps/chosen": -1.542785406112671, "logps/rejected": -1.8750007152557373, "loss": 0.792, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.542785406112671, "rewards/margins": 0.3322153389453888, "rewards/rejected": -1.8750007152557373, "sft_loss": 1.4891480207443237, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 4.461063714871936, "learning_rate": 2.986062092417733e-06, "logits/chosen": -0.34281599521636963, "logits/rejected": -0.2016495019197464, "logps/chosen": -1.4909117221832275, "logps/rejected": -1.7951453924179077, "loss": 0.7799, "rewards/accuracies": 0.625, "rewards/chosen": -1.4909117221832275, "rewards/margins": 0.30423372983932495, "rewards/rejected": -1.7951453924179077, "sft_loss": 1.5366796255111694, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 6.228539620529092, "learning_rate": 2.9854194378729402e-06, "logits/chosen": -0.2008173018693924, "logits/rejected": -0.06987674534320831, "logps/chosen": -1.5338191986083984, "logps/rejected": -1.9270120859146118, "loss": 0.7556, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5338191986083984, "rewards/margins": 0.39319270849227905, "rewards/rejected": -1.9270120859146118, "sft_loss": 1.5258783102035522, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 5.831894821473878, "learning_rate": 2.984762371782246e-06, "logits/chosen": -0.2618991732597351, "logits/rejected": -0.1274309605360031, "logps/chosen": -1.5179414749145508, "logps/rejected": -1.9443788528442383, "loss": 0.7495, "rewards/accuracies": 0.625, "rewards/chosen": -1.5179414749145508, "rewards/margins": 0.4264375567436218, "rewards/rejected": -1.9443788528442383, "sft_loss": 1.5014102458953857, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 10.386180837746535, "learning_rate": 2.9840909005205093e-06, "logits/chosen": -0.28757327795028687, "logits/rejected": -0.09849077463150024, "logps/chosen": -1.5623376369476318, "logps/rejected": -2.1427266597747803, "loss": 0.7518, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5623376369476318, "rewards/margins": 0.5803892016410828, "rewards/rejected": -2.1427266597747803, "sft_loss": 1.5638837814331055, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 6.533690446407395, "learning_rate": 2.9834050306023467e-06, "logits/chosen": -0.22992396354675293, "logits/rejected": -0.14586150646209717, "logps/chosen": -1.5596520900726318, "logps/rejected": -1.93422532081604, "loss": 0.7634, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5596520900726318, "rewards/margins": 0.374573290348053, "rewards/rejected": -1.93422532081604, "sft_loss": 1.5329387187957764, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.12956379354000092, "eval_logits/rejected": 0.2222956120967865, "eval_logps/chosen": -1.643437385559082, "eval_logps/rejected": -2.1177093982696533, "eval_loss": 0.7630000114440918, "eval_rewards/accuracies": 0.6461424231529236, "eval_rewards/chosen": -1.643437385559082, "eval_rewards/margins": 0.47427213191986084, "eval_rewards/rejected": -2.1177093982696533, "eval_runtime": 44.8355, "eval_samples_per_second": 29.999, "eval_sft_loss": 1.60084867477417, "eval_steps_per_second": 7.516, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 9.609958453098086, "learning_rate": 2.9827047686820714e-06, "logits/chosen": -0.254824697971344, "logits/rejected": -0.08545961230993271, "logps/chosen": -1.6291358470916748, "logps/rejected": -2.1718828678131104, "loss": 0.7416, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6291358470916748, "rewards/margins": 0.5427471995353699, "rewards/rejected": -2.1718828678131104, "sft_loss": 1.6229009628295898, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 9.759688116503279, "learning_rate": 2.981990121553627e-06, "logits/chosen": -0.14894555509090424, "logits/rejected": -0.07027033716440201, "logps/chosen": -1.6724026203155518, "logps/rejected": -2.1126809120178223, "loss": 0.773, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6724026203155518, "rewards/margins": 0.44027847051620483, "rewards/rejected": -2.1126809120178223, "sft_loss": 1.6621917486190796, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 9.224170221489683, "learning_rate": 2.9812610961505237e-06, "logits/chosen": -0.1752864271402359, "logits/rejected": -0.04700814187526703, "logps/chosen": -1.613806128501892, "logps/rejected": -2.180758237838745, "loss": 0.7705, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.613806128501892, "rewards/margins": 0.5669519305229187, "rewards/rejected": -2.180758237838745, "sft_loss": 1.6244399547576904, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 7.027893732458123, "learning_rate": 2.980517699545769e-06, "logits/chosen": -0.1193653792142868, "logits/rejected": -0.07351159304380417, "logps/chosen": -1.5781536102294922, "logps/rejected": -1.9520412683486938, "loss": 0.79, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5781536102294922, "rewards/margins": 0.37388795614242554, "rewards/rejected": -1.9520412683486938, "sft_loss": 1.575122356414795, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 6.4472721493338225, "learning_rate": 2.9797599389518003e-06, "logits/chosen": -0.15606620907783508, "logits/rejected": -0.03302518650889397, "logps/chosen": -1.4220941066741943, "logps/rejected": -1.8449456691741943, "loss": 0.7544, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4220941066741943, "rewards/margins": 0.42285171151161194, "rewards/rejected": -1.8449456691741943, "sft_loss": 1.5320099592208862, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 5.574111179052546, "learning_rate": 2.9789878217204138e-06, "logits/chosen": -0.07594827562570572, "logits/rejected": 0.08974309265613556, "logps/chosen": -1.534173607826233, "logps/rejected": -1.8344669342041016, "loss": 0.7843, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.534173607826233, "rewards/margins": 0.30029329657554626, "rewards/rejected": -1.8344669342041016, "sft_loss": 1.5193984508514404, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 6.863528532745195, "learning_rate": 2.9782013553426944e-06, "logits/chosen": -0.12380240112543106, "logits/rejected": 0.006033450365066528, "logps/chosen": -1.5036627054214478, "logps/rejected": -1.811418890953064, "loss": 0.798, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5036627054214478, "rewards/margins": 0.3077562153339386, "rewards/rejected": -1.811418890953064, "sft_loss": 1.5856822729110718, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 6.072297600231805, "learning_rate": 2.977400547448942e-06, "logits/chosen": -0.13113507628440857, "logits/rejected": 0.032567743211984634, "logps/chosen": -1.581300973892212, "logps/rejected": -1.9286623001098633, "loss": 0.8065, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.581300973892212, "rewards/margins": 0.347361296415329, "rewards/rejected": -1.9286623001098633, "sft_loss": 1.6395957469940186, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 4.453440470763169, "learning_rate": 2.976585405808599e-06, "logits/chosen": -0.08194790780544281, "logits/rejected": -0.013257568702101707, "logps/chosen": -1.526556134223938, "logps/rejected": -1.7669597864151, "loss": 0.833, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.526556134223938, "rewards/margins": 0.24040360748767853, "rewards/rejected": -1.7669597864151, "sft_loss": 1.6016641855239868, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 7.4831447992490405, "learning_rate": 2.9757559383301726e-06, "logits/chosen": -0.1410936713218689, "logits/rejected": -0.061996787786483765, "logps/chosen": -1.5362951755523682, "logps/rejected": -1.80612313747406, "loss": 0.775, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5362951755523682, "rewards/margins": 0.2698282301425934, "rewards/rejected": -1.80612313747406, "sft_loss": 1.5365614891052246, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 12.230990745949088, "learning_rate": 2.9749121530611605e-06, "logits/chosen": -0.16171444952487946, "logits/rejected": 0.006842072121798992, "logps/chosen": -1.5803025960922241, "logps/rejected": -2.0116724967956543, "loss": 0.7973, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5803025960922241, "rewards/margins": 0.43136996030807495, "rewards/rejected": -2.0116724967956543, "sft_loss": 1.563303828239441, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 6.27963698166502, "learning_rate": 2.97405405818797e-06, "logits/chosen": -0.2611856758594513, "logits/rejected": -0.09050299972295761, "logps/chosen": -1.6014102697372437, "logps/rejected": -2.0758275985717773, "loss": 0.7568, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6014102697372437, "rewards/margins": 0.4744173586368561, "rewards/rejected": -2.0758275985717773, "sft_loss": 1.601758360862732, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 14.12800084055978, "learning_rate": 2.9731816620358426e-06, "logits/chosen": -0.15563152730464935, "logits/rejected": -0.04933555796742439, "logps/chosen": -1.5415546894073486, "logps/rejected": -2.029212474822998, "loss": 0.7759, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5415546894073486, "rewards/margins": 0.4876578450202942, "rewards/rejected": -2.029212474822998, "sft_loss": 1.5229319334030151, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 5.107147918566259, "learning_rate": 2.9722949730687687e-06, "logits/chosen": -0.25618550181388855, "logits/rejected": 0.02996593713760376, "logps/chosen": -1.5668599605560303, "logps/rejected": -2.001129627227783, "loss": 0.7716, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5668599605560303, "rewards/margins": 0.43426984548568726, "rewards/rejected": -2.001129627227783, "sft_loss": 1.6111844778060913, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 7.689311016766697, "learning_rate": 2.9713939998894087e-06, "logits/chosen": -0.16601407527923584, "logits/rejected": -0.0825173631310463, "logps/chosen": -1.637319803237915, "logps/rejected": -1.9128596782684326, "loss": 0.8601, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.637319803237915, "rewards/margins": 0.2755400538444519, "rewards/rejected": -1.9128596782684326, "sft_loss": 1.6005403995513916, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 5.822451405492217, "learning_rate": 2.970478751239009e-06, "logits/chosen": -0.16296057403087616, "logits/rejected": 0.015431973151862621, "logps/chosen": -1.6332006454467773, "logps/rejected": -2.022491455078125, "loss": 0.7748, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6332006454467773, "rewards/margins": 0.38929063081741333, "rewards/rejected": -2.022491455078125, "sft_loss": 1.5428736209869385, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 7.1342182098162565, "learning_rate": 2.9695492359973153e-06, "logits/chosen": -0.23076090216636658, "logits/rejected": -0.1467093527317047, "logps/chosen": -1.5775783061981201, "logps/rejected": -1.9766054153442383, "loss": 0.7444, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5775783061981201, "rewards/margins": 0.39902693033218384, "rewards/rejected": -1.9766054153442383, "sft_loss": 1.5582143068313599, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 4.5065990404811584, "learning_rate": 2.9686054631824884e-06, "logits/chosen": -0.3198297619819641, "logits/rejected": -0.1856500804424286, "logps/chosen": -1.5775845050811768, "logps/rejected": -1.9523508548736572, "loss": 0.776, "rewards/accuracies": 0.625, "rewards/chosen": -1.5775845050811768, "rewards/margins": 0.3747663199901581, "rewards/rejected": -1.9523508548736572, "sft_loss": 1.6158215999603271, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 5.474880741824772, "learning_rate": 2.9676474419510175e-06, "logits/chosen": -0.08876131474971771, "logits/rejected": 0.02842838689684868, "logps/chosen": -1.472720980644226, "logps/rejected": -1.763304352760315, "loss": 0.78, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.472720980644226, "rewards/margins": 0.29058313369750977, "rewards/rejected": -1.763304352760315, "sft_loss": 1.5007354021072388, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 5.201428854120871, "learning_rate": 2.966675181597627e-06, "logits/chosen": -0.21010169386863708, "logits/rejected": -0.13299232721328735, "logps/chosen": -1.471938967704773, "logps/rejected": -1.901342749595642, "loss": 0.7557, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.471938967704773, "rewards/margins": 0.4294038712978363, "rewards/rejected": -1.901342749595642, "sft_loss": 1.4958593845367432, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 6.6367592511228155, "learning_rate": 2.965688691555193e-06, "logits/chosen": -0.12889441847801208, "logits/rejected": 0.07438662648200989, "logps/chosen": -1.6089328527450562, "logps/rejected": -2.049424886703491, "loss": 0.7874, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6089328527450562, "rewards/margins": 0.44049209356307983, "rewards/rejected": -2.049424886703491, "sft_loss": 1.6768449544906616, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 4.574768350267725, "learning_rate": 2.964687981394644e-06, "logits/chosen": -0.16920894384384155, "logits/rejected": -0.05619668960571289, "logps/chosen": -1.6071586608886719, "logps/rejected": -1.8837287425994873, "loss": 0.8244, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.6071586608886719, "rewards/margins": 0.2765699028968811, "rewards/rejected": -1.8837287425994873, "sft_loss": 1.5901567935943604, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 6.431528305861505, "learning_rate": 2.963673060824877e-06, "logits/chosen": -0.19680675864219666, "logits/rejected": 0.0035483776591718197, "logps/chosen": -1.5793983936309814, "logps/rejected": -1.9441983699798584, "loss": 0.7786, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5793983936309814, "rewards/margins": 0.364799827337265, "rewards/rejected": -1.9441983699798584, "sft_loss": 1.553386926651001, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 6.012695412310331, "learning_rate": 2.9626439396926536e-06, "logits/chosen": -0.04484427347779274, "logits/rejected": 0.11925461143255234, "logps/chosen": -1.4713695049285889, "logps/rejected": -1.9322984218597412, "loss": 0.767, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4713695049285889, "rewards/margins": 0.46092891693115234, "rewards/rejected": -1.9322984218597412, "sft_loss": 1.53690505027771, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 10.552874460492138, "learning_rate": 2.9616006279825125e-06, "logits/chosen": -0.18911555409431458, "logits/rejected": -0.0048009916208684444, "logps/chosen": -1.656640648841858, "logps/rejected": -2.062116861343384, "loss": 0.775, "rewards/accuracies": 0.625, "rewards/chosen": -1.656640648841858, "rewards/margins": 0.4054761826992035, "rewards/rejected": -2.062116861343384, "sft_loss": 1.6129440069198608, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 9.290842105366092, "learning_rate": 2.9605431358166687e-06, "logits/chosen": -0.19927628338336945, "logits/rejected": -0.07802639156579971, "logps/chosen": -1.581866979598999, "logps/rejected": -2.1346230506896973, "loss": 0.7585, "rewards/accuracies": 0.625, "rewards/chosen": -1.581866979598999, "rewards/margins": 0.5527556538581848, "rewards/rejected": -2.1346230506896973, "sft_loss": 1.5932109355926514, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 6.556638001860026, "learning_rate": 2.959471473454915e-06, "logits/chosen": -0.1524762213230133, "logits/rejected": -0.10123306512832642, "logps/chosen": -1.6552999019622803, "logps/rejected": -2.068732500076294, "loss": 0.7793, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6552999019622803, "rewards/margins": 0.41343265771865845, "rewards/rejected": -2.068732500076294, "sft_loss": 1.6616418361663818, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 7.961322356971732, "learning_rate": 2.9583856512945257e-06, "logits/chosen": -0.1716580092906952, "logits/rejected": -0.04362250119447708, "logps/chosen": -1.6354115009307861, "logps/rejected": -2.064568519592285, "loss": 0.7676, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6354115009307861, "rewards/margins": 0.4291567802429199, "rewards/rejected": -2.064568519592285, "sft_loss": 1.6227165460586548, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 9.101576014649456, "learning_rate": 2.957285679870151e-06, "logits/chosen": -0.20420575141906738, "logits/rejected": -0.04625245928764343, "logps/chosen": -1.6196963787078857, "logps/rejected": -2.151869297027588, "loss": 0.7236, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6196963787078857, "rewards/margins": 0.5321725010871887, "rewards/rejected": -2.151869297027588, "sft_loss": 1.5830497741699219, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 6.42046355799275, "learning_rate": 2.9561715698537184e-06, "logits/chosen": -0.18746407330036163, "logits/rejected": 0.010445961728692055, "logps/chosen": -1.6953752040863037, "logps/rejected": -2.0557973384857178, "loss": 0.8408, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.6953752040863037, "rewards/margins": 0.3604220747947693, "rewards/rejected": -2.0557973384857178, "sft_loss": 1.6522247791290283, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 6.449408194941958, "learning_rate": 2.955043332054329e-06, "logits/chosen": -0.09924498200416565, "logits/rejected": 0.15976925194263458, "logps/chosen": -1.6739656925201416, "logps/rejected": -2.0317275524139404, "loss": 0.8074, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6739656925201416, "rewards/margins": 0.3577619194984436, "rewards/rejected": -2.0317275524139404, "sft_loss": 1.6839134693145752, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 5.321255191283596, "learning_rate": 2.95390097741815e-06, "logits/chosen": -0.12195520102977753, "logits/rejected": 0.0469212606549263, "logps/chosen": -1.6282774209976196, "logps/rejected": -1.9903730154037476, "loss": 0.7827, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6282774209976196, "rewards/margins": 0.36209559440612793, "rewards/rejected": -1.9903730154037476, "sft_loss": 1.6381866931915283, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 6.7507291304657455, "learning_rate": 2.952744517028312e-06, "logits/chosen": -0.015586107969284058, "logits/rejected": -0.0066694943234324455, "logps/chosen": -1.693063497543335, "logps/rejected": -2.098609209060669, "loss": 0.7952, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.693063497543335, "rewards/margins": 0.4055456519126892, "rewards/rejected": -2.098609209060669, "sft_loss": 1.7138296365737915, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 5.7441300960801795, "learning_rate": 2.951573962104798e-06, "logits/chosen": -0.013513225130736828, "logits/rejected": -0.0030169696547091007, "logps/chosen": -1.5411711931228638, "logps/rejected": -1.8659536838531494, "loss": 0.788, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5411711931228638, "rewards/margins": 0.32478243112564087, "rewards/rejected": -1.8659536838531494, "sft_loss": 1.5452463626861572, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 6.264181623160948, "learning_rate": 2.950389324004337e-06, "logits/chosen": -0.13159170746803284, "logits/rejected": 0.10122290998697281, "logps/chosen": -1.6037967205047607, "logps/rejected": -1.958993673324585, "loss": 0.7625, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6037967205047607, "rewards/margins": 0.3551969528198242, "rewards/rejected": -1.958993673324585, "sft_loss": 1.6397666931152344, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 8.960837585492582, "learning_rate": 2.949190614220294e-06, "logits/chosen": -0.1174493283033371, "logits/rejected": 0.11396725475788116, "logps/chosen": -1.6792113780975342, "logps/rejected": -2.021131992340088, "loss": 0.8056, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6792113780975342, "rewards/margins": 0.3419206738471985, "rewards/rejected": -2.021131992340088, "sft_loss": 1.6655311584472656, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 7.038292814007257, "learning_rate": 2.9479778443825553e-06, "logits/chosen": 0.003808742854744196, "logits/rejected": 0.24790827929973602, "logps/chosen": -1.6043720245361328, "logps/rejected": -1.9450041055679321, "loss": 0.7922, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6043720245361328, "rewards/margins": 0.3406318724155426, "rewards/rejected": -1.9450041055679321, "sft_loss": 1.6884558200836182, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 7.6017791758830215, "learning_rate": 2.9467510262574204e-06, "logits/chosen": 0.10805141925811768, "logits/rejected": 0.13546641170978546, "logps/chosen": -1.4662773609161377, "logps/rejected": -1.9129825830459595, "loss": 0.7364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4662773609161377, "rewards/margins": 0.44670534133911133, "rewards/rejected": -1.9129825830459595, "sft_loss": 1.5612038373947144, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 6.2631760907636576, "learning_rate": 2.9455101717474834e-06, "logits/chosen": 0.095148466527462, "logits/rejected": 0.19982023537158966, "logps/chosen": -1.5556526184082031, "logps/rejected": -1.8492791652679443, "loss": 0.8283, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5556526184082031, "rewards/margins": 0.2936265468597412, "rewards/rejected": -1.8492791652679443, "sft_loss": 1.6193443536758423, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 6.357423162704157, "learning_rate": 2.9442552928915203e-06, "logits/chosen": 0.0740412250161171, "logits/rejected": 0.25112438201904297, "logps/chosen": -1.5824158191680908, "logps/rejected": -2.0103745460510254, "loss": 0.7808, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5824158191680908, "rewards/margins": 0.42795872688293457, "rewards/rejected": -2.0103745460510254, "sft_loss": 1.619419813156128, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 8.362648905453156, "learning_rate": 2.942986401864371e-06, "logits/chosen": 0.06260992586612701, "logits/rejected": 0.28578799962997437, "logps/chosen": -1.686570167541504, "logps/rejected": -2.054899215698242, "loss": 0.8101, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.686570167541504, "rewards/margins": 0.3683289885520935, "rewards/rejected": -2.054899215698242, "sft_loss": 1.7220935821533203, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 6.220464946890453, "learning_rate": 2.9417035109768225e-06, "logits/chosen": 0.0265726987272501, "logits/rejected": 0.27460747957229614, "logps/chosen": -1.4630142450332642, "logps/rejected": -1.9338045120239258, "loss": 0.7449, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4630142450332642, "rewards/margins": 0.470790296792984, "rewards/rejected": -1.9338045120239258, "sft_loss": 1.490442156791687, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 6.538918997856583, "learning_rate": 2.9404066326754874e-06, "logits/chosen": 0.04017296060919762, "logits/rejected": 0.27316930890083313, "logps/chosen": -1.5119431018829346, "logps/rejected": -1.8872439861297607, "loss": 0.7696, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5119431018829346, "rewards/margins": 0.3753008246421814, "rewards/rejected": -1.8872439861297607, "sft_loss": 1.5683563947677612, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 6.4969536058578345, "learning_rate": 2.9390957795426847e-06, "logits/chosen": 0.04273226857185364, "logits/rejected": 0.24941368401050568, "logps/chosen": -1.5459063053131104, "logps/rejected": -1.9651594161987305, "loss": 0.7461, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5459063053131104, "rewards/margins": 0.4192531108856201, "rewards/rejected": -1.9651594161987305, "sft_loss": 1.6078563928604126, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 7.022785391979763, "learning_rate": 2.9377709642963177e-06, "logits/chosen": 0.0010808638762682676, "logits/rejected": 0.1716040074825287, "logps/chosen": -1.5438032150268555, "logps/rejected": -2.0570969581604004, "loss": 0.738, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5438032150268555, "rewards/margins": 0.5132937431335449, "rewards/rejected": -2.0570969581604004, "sft_loss": 1.6060693264007568, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 9.420728979604682, "learning_rate": 2.9364321997897485e-06, "logits/chosen": 0.03571401163935661, "logits/rejected": 0.14158421754837036, "logps/chosen": -1.6367580890655518, "logps/rejected": -1.9894298315048218, "loss": 0.8179, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6367580890655518, "rewards/margins": 0.35267171263694763, "rewards/rejected": -1.9894298315048218, "sft_loss": 1.6678476333618164, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 6.60932904173704, "learning_rate": 2.935079499011677e-06, "logits/chosen": -0.013311957940459251, "logits/rejected": 0.1487562358379364, "logps/chosen": -1.6255792379379272, "logps/rejected": -1.8736343383789062, "loss": 0.816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6255792379379272, "rewards/margins": 0.2480551302433014, "rewards/rejected": -1.8736343383789062, "sft_loss": 1.628200888633728, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 8.164177080214404, "learning_rate": 2.9337128750860126e-06, "logits/chosen": 0.03370757773518562, "logits/rejected": 0.25221627950668335, "logps/chosen": -1.501709222793579, "logps/rejected": -1.8876798152923584, "loss": 0.7654, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.501709222793579, "rewards/margins": 0.38597044348716736, "rewards/rejected": -1.8876798152923584, "sft_loss": 1.5484182834625244, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 4.63992866038779, "learning_rate": 2.932332341271746e-06, "logits/chosen": -0.0409303642809391, "logits/rejected": 0.15158711373806, "logps/chosen": -1.5028371810913086, "logps/rejected": -1.949318289756775, "loss": 0.756, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5028371810913086, "rewards/margins": 0.4464810788631439, "rewards/rejected": -1.949318289756775, "sft_loss": 1.5938483476638794, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 6.014972415086379, "learning_rate": 2.930937910962822e-06, "logits/chosen": -0.07076011598110199, "logits/rejected": 0.06818946450948715, "logps/chosen": -1.6399303674697876, "logps/rejected": -2.096925973892212, "loss": 0.7769, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6399303674697876, "rewards/margins": 0.45699548721313477, "rewards/rejected": -2.096925973892212, "sft_loss": 1.6774276494979858, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 8.395683170770335, "learning_rate": 2.9295295976880107e-06, "logits/chosen": 0.025820106267929077, "logits/rejected": 0.11879418045282364, "logps/chosen": -1.6487575769424438, "logps/rejected": -2.074948787689209, "loss": 0.7623, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6487575769424438, "rewards/margins": 0.4261912405490875, "rewards/rejected": -2.074948787689209, "sft_loss": 1.6632426977157593, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 7.7402768083105125, "learning_rate": 2.9281074151107727e-06, "logits/chosen": 0.029402291402220726, "logits/rejected": 0.2545176148414612, "logps/chosen": -1.71028733253479, "logps/rejected": -2.077282428741455, "loss": 0.7812, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.71028733253479, "rewards/margins": 0.36699482798576355, "rewards/rejected": -2.077282428741455, "sft_loss": 1.6955442428588867, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 8.751835002398574, "learning_rate": 2.926671377029129e-06, "logits/chosen": 0.03397374227643013, "logits/rejected": 0.2006794512271881, "logps/chosen": -1.6462209224700928, "logps/rejected": -2.1480746269226074, "loss": 0.751, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6462209224700928, "rewards/margins": 0.5018535256385803, "rewards/rejected": -2.1480746269226074, "sft_loss": 1.752246618270874, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 5.540643425073099, "learning_rate": 2.9252214973755294e-06, "logits/chosen": -0.1207038015127182, "logits/rejected": 0.19626577198505402, "logps/chosen": -1.6974194049835205, "logps/rejected": -2.197737216949463, "loss": 0.7394, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6974194049835205, "rewards/margins": 0.5003179311752319, "rewards/rejected": -2.197737216949463, "sft_loss": 1.7291381359100342, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 5.055148253174504, "learning_rate": 2.923757790216711e-06, "logits/chosen": -0.024127524346113205, "logits/rejected": 0.15421162545681, "logps/chosen": -1.6369282007217407, "logps/rejected": -2.1902050971984863, "loss": 0.7588, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6369282007217407, "rewards/margins": 0.5532768368721008, "rewards/rejected": -2.1902050971984863, "sft_loss": 1.687150001525879, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 8.581801582300574, "learning_rate": 2.922280269753568e-06, "logits/chosen": -0.10929293930530548, "logits/rejected": 0.02730230614542961, "logps/chosen": -1.718034029006958, "logps/rejected": -2.0976719856262207, "loss": 0.8, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.718034029006958, "rewards/margins": 0.37963828444480896, "rewards/rejected": -2.0976719856262207, "sft_loss": 1.7732467651367188, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 8.258353844488562, "learning_rate": 2.9207889503210094e-06, "logits/chosen": 0.008437035605311394, "logits/rejected": 0.232884019613266, "logps/chosen": -1.6477196216583252, "logps/rejected": -1.899294137954712, "loss": 0.8345, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6477196216583252, "rewards/margins": 0.2515743374824524, "rewards/rejected": -1.899294137954712, "sft_loss": 1.666416883468628, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 5.123591274328121, "learning_rate": 2.9192838463878236e-06, "logits/chosen": -0.015348012559115887, "logits/rejected": 0.1270010769367218, "logps/chosen": -1.6096594333648682, "logps/rejected": -1.8980051279067993, "loss": 0.8205, "rewards/accuracies": 0.625, "rewards/chosen": -1.6096594333648682, "rewards/margins": 0.2883456349372864, "rewards/rejected": -1.8980051279067993, "sft_loss": 1.6213926076889038, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 6.556141747620839, "learning_rate": 2.917764972556535e-06, "logits/chosen": -0.17863938212394714, "logits/rejected": 0.011336810886859894, "logps/chosen": -1.5950143337249756, "logps/rejected": -2.0163919925689697, "loss": 0.7568, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5950143337249756, "rewards/margins": 0.421377569437027, "rewards/rejected": -2.0163919925689697, "sft_loss": 1.6503280401229858, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 6.749858086812364, "learning_rate": 2.9162323435632657e-06, "logits/chosen": -0.027099858969449997, "logits/rejected": 0.11431686580181122, "logps/chosen": -1.5036207437515259, "logps/rejected": -2.1065268516540527, "loss": 0.7322, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5036207437515259, "rewards/margins": 0.6029061079025269, "rewards/rejected": -2.1065268516540527, "sft_loss": 1.5539976358413696, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 7.768714321436251, "learning_rate": 2.914685974277587e-06, "logits/chosen": -0.11938049644231796, "logits/rejected": -0.018264098092913628, "logps/chosen": -1.6016819477081299, "logps/rejected": -1.9748432636260986, "loss": 0.7893, "rewards/accuracies": 0.625, "rewards/chosen": -1.6016819477081299, "rewards/margins": 0.3731613755226135, "rewards/rejected": -1.9748432636260986, "sft_loss": 1.5768858194351196, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 7.446064522717903, "learning_rate": 2.9131258797023814e-06, "logits/chosen": -0.03684517741203308, "logits/rejected": 0.1460929960012436, "logps/chosen": -1.5739840269088745, "logps/rejected": -1.9133260250091553, "loss": 0.7733, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5739840269088745, "rewards/margins": 0.3393420875072479, "rewards/rejected": -1.9133260250091553, "sft_loss": 1.6007308959960938, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 6.611446978075237, "learning_rate": 2.9115520749736934e-06, "logits/chosen": 0.03872992843389511, "logits/rejected": 0.23826126754283905, "logps/chosen": -1.5514007806777954, "logps/rejected": -2.116300106048584, "loss": 0.7284, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5514007806777954, "rewards/margins": 0.5648993253707886, "rewards/rejected": -2.116300106048584, "sft_loss": 1.5213690996170044, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 5.789445436806434, "learning_rate": 2.909964575360583e-06, "logits/chosen": -0.1557280272245407, "logits/rejected": -0.0005622118478640914, "logps/chosen": -1.5989017486572266, "logps/rejected": -2.1404976844787598, "loss": 0.7563, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5989017486572266, "rewards/margins": 0.541595995426178, "rewards/rejected": -2.1404976844787598, "sft_loss": 1.6561466455459595, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 8.153135086048819, "learning_rate": 2.9083633962649783e-06, "logits/chosen": -0.1161702498793602, "logits/rejected": 0.13406352698802948, "logps/chosen": -1.603650450706482, "logps/rejected": -2.1677873134613037, "loss": 0.7233, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.603650450706482, "rewards/margins": 0.5641368627548218, "rewards/rejected": -2.1677873134613037, "sft_loss": 1.594813585281372, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 8.061547650135914, "learning_rate": 2.906748553221527e-06, "logits/chosen": 0.09124667942523956, "logits/rejected": 0.18374478816986084, "logps/chosen": -1.6448787450790405, "logps/rejected": -2.11365008354187, "loss": 0.7742, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6448787450790405, "rewards/margins": 0.468771368265152, "rewards/rejected": -2.11365008354187, "sft_loss": 1.5670969486236572, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 6.680659188192423, "learning_rate": 2.9051200618974418e-06, "logits/chosen": -0.06118954345583916, "logits/rejected": 0.16990908980369568, "logps/chosen": -1.720273733139038, "logps/rejected": -2.3065876960754395, "loss": 0.7217, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.720273733139038, "rewards/margins": 0.5863139033317566, "rewards/rejected": -2.3065876960754395, "sft_loss": 1.604292869567871, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 7.164458316966575, "learning_rate": 2.903477938092354e-06, "logits/chosen": -0.04646755009889603, "logits/rejected": 0.010129129514098167, "logps/chosen": -1.670170545578003, "logps/rejected": -1.9611806869506836, "loss": 0.8287, "rewards/accuracies": 0.59375, "rewards/chosen": -1.670170545578003, "rewards/margins": 0.2910100519657135, "rewards/rejected": -1.9611806869506836, "sft_loss": 1.7245380878448486, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 6.094378907879845, "learning_rate": 2.901822197738155e-06, "logits/chosen": -0.21642693877220154, "logits/rejected": -0.050178252160549164, "logps/chosen": -1.6112639904022217, "logps/rejected": -2.1255059242248535, "loss": 0.7721, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6112639904022217, "rewards/margins": 0.5142418146133423, "rewards/rejected": -2.1255059242248535, "sft_loss": 1.666547179222107, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 7.323723960289789, "learning_rate": 2.9001528568988454e-06, "logits/chosen": -0.18746520578861237, "logits/rejected": -0.022803576663136482, "logps/chosen": -1.5549800395965576, "logps/rejected": -2.027247905731201, "loss": 0.7463, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5549800395965576, "rewards/margins": 0.4722679555416107, "rewards/rejected": -2.027247905731201, "sft_loss": 1.4974477291107178, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 6.302595786091669, "learning_rate": 2.898469931770378e-06, "logits/chosen": -0.06833770126104355, "logits/rejected": 0.04650373011827469, "logps/chosen": -1.5727870464324951, "logps/rejected": -1.8890917301177979, "loss": 0.7904, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5727870464324951, "rewards/margins": 0.3163047134876251, "rewards/rejected": -1.8890917301177979, "sft_loss": 1.5973747968673706, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 5.606944351727298, "learning_rate": 2.896773438680498e-06, "logits/chosen": -0.015208597294986248, "logits/rejected": 0.1008622795343399, "logps/chosen": -1.539618968963623, "logps/rejected": -2.0273754596710205, "loss": 0.7368, "rewards/accuracies": 0.71875, "rewards/chosen": -1.539618968963623, "rewards/margins": 0.4877566397190094, "rewards/rejected": -2.0273754596710205, "sft_loss": 1.613294243812561, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 7.726677796070765, "learning_rate": 2.8950633940885908e-06, "logits/chosen": -0.08709158003330231, "logits/rejected": 0.01412753202021122, "logps/chosen": -1.5183520317077637, "logps/rejected": -1.8736242055892944, "loss": 0.7847, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5183520317077637, "rewards/margins": 0.3552722632884979, "rewards/rejected": -1.8736242055892944, "sft_loss": 1.5644044876098633, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 5.49002962622439, "learning_rate": 2.893339814585516e-06, "logits/chosen": -0.123673215508461, "logits/rejected": 0.0587911494076252, "logps/chosen": -1.8219196796417236, "logps/rejected": -2.1566295623779297, "loss": 0.835, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.8219196796417236, "rewards/margins": 0.3347100615501404, "rewards/rejected": -2.1566295623779297, "sft_loss": 1.7723875045776367, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 5.3585714206015815, "learning_rate": 2.8916027168934483e-06, "logits/chosen": -0.13725906610488892, "logits/rejected": 0.07069502025842667, "logps/chosen": -1.5604981184005737, "logps/rejected": -1.9629894495010376, "loss": 0.7946, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5604981184005737, "rewards/margins": 0.40249133110046387, "rewards/rejected": -1.9629894495010376, "sft_loss": 1.6075115203857422, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 5.74098336279485, "learning_rate": 2.889852117865718e-06, "logits/chosen": -0.18351632356643677, "logits/rejected": -0.016420545056462288, "logps/chosen": -1.703129529953003, "logps/rejected": -2.1396589279174805, "loss": 0.7581, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.703129529953003, "rewards/margins": 0.4365292489528656, "rewards/rejected": -2.1396589279174805, "sft_loss": 1.6762657165527344, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 7.0558580015091845, "learning_rate": 2.888088034486645e-06, "logits/chosen": -0.09006930887699127, "logits/rejected": 0.07247094810009003, "logps/chosen": -1.6955833435058594, "logps/rejected": -2.146712303161621, "loss": 0.78, "rewards/accuracies": 0.625, "rewards/chosen": -1.6955833435058594, "rewards/margins": 0.4511287212371826, "rewards/rejected": -2.146712303161621, "sft_loss": 1.6365978717803955, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 8.2556377644557, "learning_rate": 2.886310483871373e-06, "logits/chosen": -0.18919400870800018, "logits/rejected": -0.01640934683382511, "logps/chosen": -1.6706031560897827, "logps/rejected": -2.201122522354126, "loss": 0.7321, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6706031560897827, "rewards/margins": 0.5305193662643433, "rewards/rejected": -2.201122522354126, "sft_loss": 1.6959350109100342, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 6.3300107901147245, "learning_rate": 2.8845194832657067e-06, "logits/chosen": -0.11349854618310928, "logits/rejected": 0.02640039101243019, "logps/chosen": -1.5338860750198364, "logps/rejected": -2.1164541244506836, "loss": 0.7103, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5338860750198364, "rewards/margins": 0.5825680494308472, "rewards/rejected": -2.1164541244506836, "sft_loss": 1.634878158569336, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 9.000964400043555, "learning_rate": 2.882715050045941e-06, "logits/chosen": -0.14477837085723877, "logits/rejected": -0.06517032533884048, "logps/chosen": -1.6217753887176514, "logps/rejected": -2.018050193786621, "loss": 0.7941, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6217753887176514, "rewards/margins": 0.3962748050689697, "rewards/rejected": -2.018050193786621, "sft_loss": 1.6124019622802734, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.30408504605293274, "eval_logits/rejected": 0.417248010635376, "eval_logps/chosen": -1.6028857231140137, "eval_logps/rejected": -2.1043779850006104, "eval_loss": 0.744583249092102, "eval_rewards/accuracies": 0.6750741600990295, "eval_rewards/chosen": -1.6028857231140137, "eval_rewards/margins": 0.5014922618865967, "eval_rewards/rejected": -2.1043779850006104, "eval_runtime": 45.0345, "eval_samples_per_second": 29.866, "eval_sft_loss": 1.6243925094604492, "eval_steps_per_second": 7.483, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 14.93244550145814, "learning_rate": 2.8808972017186957e-06, "logits/chosen": -0.23679903149604797, "logits/rejected": -0.015266534872353077, "logps/chosen": -1.5985362529754639, "logps/rejected": -2.0280582904815674, "loss": 0.7523, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5985362529754639, "rewards/margins": 0.4295217990875244, "rewards/rejected": -2.0280582904815674, "sft_loss": 1.6371889114379883, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 7.8160679964001405, "learning_rate": 2.8790659559207434e-06, "logits/chosen": -0.16882798075675964, "logits/rejected": 0.08145350217819214, "logps/chosen": -1.5531269311904907, "logps/rejected": -1.9778798818588257, "loss": 0.7507, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5531269311904907, "rewards/margins": 0.42475301027297974, "rewards/rejected": -1.9778798818588257, "sft_loss": 1.5700221061706543, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 8.18777345683683, "learning_rate": 2.877221330418838e-06, "logits/chosen": -0.2044234722852707, "logits/rejected": -0.053410958498716354, "logps/chosen": -1.627963662147522, "logps/rejected": -1.924864411354065, "loss": 0.811, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.627963662147522, "rewards/margins": 0.29690080881118774, "rewards/rejected": -1.924864411354065, "sft_loss": 1.6557378768920898, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 7.985816202450262, "learning_rate": 2.875363343109545e-06, "logits/chosen": 0.01616957038640976, "logits/rejected": 0.1678108274936676, "logps/chosen": -1.5112775564193726, "logps/rejected": -1.9317729473114014, "loss": 0.7606, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5112775564193726, "rewards/margins": 0.42049551010131836, "rewards/rejected": -1.9317729473114014, "sft_loss": 1.5209019184112549, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 7.040315791608339, "learning_rate": 2.8734920120190645e-06, "logits/chosen": -0.2711160480976105, "logits/rejected": -0.0027374387718737125, "logps/chosen": -1.6637003421783447, "logps/rejected": -2.088355302810669, "loss": 0.774, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6637003421783447, "rewards/margins": 0.42465487122535706, "rewards/rejected": -2.088355302810669, "sft_loss": 1.6742616891860962, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 7.348545963113847, "learning_rate": 2.8716073553030593e-06, "logits/chosen": -0.12881150841712952, "logits/rejected": 0.00364801287651062, "logps/chosen": -1.654126763343811, "logps/rejected": -2.066636562347412, "loss": 0.7908, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.654126763343811, "rewards/margins": 0.4125099182128906, "rewards/rejected": -2.066636562347412, "sft_loss": 1.6164060831069946, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 6.3291520509791335, "learning_rate": 2.8697093912464782e-06, "logits/chosen": -0.13783837854862213, "logits/rejected": 0.0442417673766613, "logps/chosen": -1.6532104015350342, "logps/rejected": -2.0197830200195312, "loss": 0.7792, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6532104015350342, "rewards/margins": 0.3665724992752075, "rewards/rejected": -2.0197830200195312, "sft_loss": 1.7437490224838257, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 7.43163637379486, "learning_rate": 2.8677981382633753e-06, "logits/chosen": -0.2456088811159134, "logits/rejected": -0.07947979867458344, "logps/chosen": -1.6115837097167969, "logps/rejected": -2.0740396976470947, "loss": 0.7513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6115837097167969, "rewards/margins": 0.4624561369419098, "rewards/rejected": -2.0740396976470947, "sft_loss": 1.6971622705459595, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 7.948237008152092, "learning_rate": 2.8658736148967366e-06, "logits/chosen": -0.1307731419801712, "logits/rejected": 0.09868714213371277, "logps/chosen": -1.650904655456543, "logps/rejected": -2.1201839447021484, "loss": 0.7719, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.650904655456543, "rewards/margins": 0.46927928924560547, "rewards/rejected": -2.1201839447021484, "sft_loss": 1.6839383840560913, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 9.092640065524614, "learning_rate": 2.8639358398182947e-06, "logits/chosen": -0.12447915971279144, "logits/rejected": 0.12984387576580048, "logps/chosen": -1.7692009210586548, "logps/rejected": -2.0938119888305664, "loss": 0.8577, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.7692009210586548, "rewards/margins": 0.324611097574234, "rewards/rejected": -2.0938119888305664, "sft_loss": 1.7437636852264404, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 7.49659909202242, "learning_rate": 2.8619848318283538e-06, "logits/chosen": -0.16396114230155945, "logits/rejected": -0.017033612355589867, "logps/chosen": -1.546359658241272, "logps/rejected": -1.9179470539093018, "loss": 0.7799, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.546359658241272, "rewards/margins": 0.37158721685409546, "rewards/rejected": -1.9179470539093018, "sft_loss": 1.6477253437042236, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 6.720611032104559, "learning_rate": 2.860020609855601e-06, "logits/chosen": -0.20362278819084167, "logits/rejected": -0.04654115065932274, "logps/chosen": -1.5199604034423828, "logps/rejected": -2.017423391342163, "loss": 0.7452, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5199604034423828, "rewards/margins": 0.4974629878997803, "rewards/rejected": -2.017423391342163, "sft_loss": 1.5844711065292358, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 5.9415895380295085, "learning_rate": 2.858043192956926e-06, "logits/chosen": -0.12424206733703613, "logits/rejected": 0.06748121976852417, "logps/chosen": -1.5377591848373413, "logps/rejected": -1.9634453058242798, "loss": 0.7536, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5377591848373413, "rewards/margins": 0.4256861209869385, "rewards/rejected": -1.9634453058242798, "sft_loss": 1.579796552658081, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 5.826261694860266, "learning_rate": 2.856052600317237e-06, "logits/chosen": -0.2191907912492752, "logits/rejected": -0.10827469825744629, "logps/chosen": -1.5648930072784424, "logps/rejected": -1.954262137413025, "loss": 0.7743, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5648930072784424, "rewards/margins": 0.38936907052993774, "rewards/rejected": -1.954262137413025, "sft_loss": 1.5946085453033447, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 6.307597170102964, "learning_rate": 2.8540488512492725e-06, "logits/chosen": -0.2155335247516632, "logits/rejected": -0.09447634220123291, "logps/chosen": -1.6340783834457397, "logps/rejected": -2.0027661323547363, "loss": 0.7815, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6340783834457397, "rewards/margins": 0.36868780851364136, "rewards/rejected": -2.0027661323547363, "sft_loss": 1.5978288650512695, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 8.27533099546194, "learning_rate": 2.8520319651934147e-06, "logits/chosen": -0.25640377402305603, "logits/rejected": -0.0609319731593132, "logps/chosen": -1.7309064865112305, "logps/rejected": -2.0606789588928223, "loss": 0.8249, "rewards/accuracies": 0.625, "rewards/chosen": -1.7309064865112305, "rewards/margins": 0.3297719657421112, "rewards/rejected": -2.0606789588928223, "sft_loss": 1.7498449087142944, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 10.994412069006197, "learning_rate": 2.8500019617175005e-06, "logits/chosen": -0.22293055057525635, "logits/rejected": -0.045440297573804855, "logps/chosen": -1.5989841222763062, "logps/rejected": -1.9689960479736328, "loss": 0.7614, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5989841222763062, "rewards/margins": 0.37001150846481323, "rewards/rejected": -1.9689960479736328, "sft_loss": 1.5846779346466064, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 7.880113373200392, "learning_rate": 2.847958860516633e-06, "logits/chosen": -0.32245585322380066, "logits/rejected": -0.14119119942188263, "logps/chosen": -1.7176250219345093, "logps/rejected": -2.05019474029541, "loss": 0.8125, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.7176250219345093, "rewards/margins": 0.3325696587562561, "rewards/rejected": -2.05019474029541, "sft_loss": 1.7187855243682861, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 5.070148969896155, "learning_rate": 2.8459026814129887e-06, "logits/chosen": -0.22318537533283234, "logits/rejected": -0.22121672332286835, "logps/chosen": -1.699710488319397, "logps/rejected": -2.2298336029052734, "loss": 0.7527, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.699710488319397, "rewards/margins": 0.5301231145858765, "rewards/rejected": -2.2298336029052734, "sft_loss": 1.6993358135223389, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 6.5213917284801415, "learning_rate": 2.8438334443556268e-06, "logits/chosen": -0.22244945168495178, "logits/rejected": 0.08072423934936523, "logps/chosen": -1.5851320028305054, "logps/rejected": -2.119144916534424, "loss": 0.7376, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5851320028305054, "rewards/margins": 0.5340126156806946, "rewards/rejected": -2.119144916534424, "sft_loss": 1.6274535655975342, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 5.763934363832557, "learning_rate": 2.8417511694202938e-06, "logits/chosen": -0.02451368421316147, "logits/rejected": 0.03239697217941284, "logps/chosen": -1.5781588554382324, "logps/rejected": -2.0349514484405518, "loss": 0.7633, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5781588554382324, "rewards/margins": 0.45679235458374023, "rewards/rejected": -2.0349514484405518, "sft_loss": 1.5774492025375366, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 9.017956492956781, "learning_rate": 2.83965587680923e-06, "logits/chosen": -0.04484400898218155, "logits/rejected": 0.052457988262176514, "logps/chosen": -1.602957010269165, "logps/rejected": -2.0960071086883545, "loss": 0.7555, "rewards/accuracies": 0.65625, "rewards/chosen": -1.602957010269165, "rewards/margins": 0.4930500090122223, "rewards/rejected": -2.0960071086883545, "sft_loss": 1.6403892040252686, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 6.8452646381985, "learning_rate": 2.837547586850974e-06, "logits/chosen": -0.16252757608890533, "logits/rejected": 0.05827758461236954, "logps/chosen": -1.597303867340088, "logps/rejected": -2.0770814418792725, "loss": 0.7452, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.597303867340088, "rewards/margins": 0.4797777235507965, "rewards/rejected": -2.0770814418792725, "sft_loss": 1.5819872617721558, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 6.745747568412131, "learning_rate": 2.8354263200001645e-06, "logits/chosen": -0.27575942873954773, "logits/rejected": -0.023251216858625412, "logps/chosen": -1.5655136108398438, "logps/rejected": -1.9516299962997437, "loss": 0.768, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5655136108398438, "rewards/margins": 0.3861163854598999, "rewards/rejected": -1.9516299962997437, "sft_loss": 1.6041730642318726, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 8.458459704309899, "learning_rate": 2.8332920968373414e-06, "logits/chosen": -0.05385139584541321, "logits/rejected": 0.08625955879688263, "logps/chosen": -1.6114717721939087, "logps/rejected": -1.9976829290390015, "loss": 0.818, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6114717721939087, "rewards/margins": 0.3862113058567047, "rewards/rejected": -1.9976829290390015, "sft_loss": 1.580669641494751, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 7.909363378162102, "learning_rate": 2.831144938068747e-06, "logits/chosen": -0.08213544636964798, "logits/rejected": 0.07000809907913208, "logps/chosen": -1.5530449151992798, "logps/rejected": -1.940569519996643, "loss": 0.772, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5530449151992798, "rewards/margins": 0.38752469420433044, "rewards/rejected": -1.940569519996643, "sft_loss": 1.5636192560195923, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 7.785721622620933, "learning_rate": 2.8289848645261253e-06, "logits/chosen": -0.07729779183864594, "logits/rejected": 0.04112401232123375, "logps/chosen": -1.6301319599151611, "logps/rejected": -2.0394530296325684, "loss": 0.7538, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6301319599151611, "rewards/margins": 0.40932124853134155, "rewards/rejected": -2.0394530296325684, "sft_loss": 1.6521657705307007, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 8.457543080999693, "learning_rate": 2.826811897166519e-06, "logits/chosen": -0.06580647081136703, "logits/rejected": -0.020910892635583878, "logps/chosen": -1.5988067388534546, "logps/rejected": -1.9672935009002686, "loss": 0.7746, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5988067388534546, "rewards/margins": 0.3684867322444916, "rewards/rejected": -1.9672935009002686, "sft_loss": 1.5970875024795532, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 6.277693217201755, "learning_rate": 2.8246260570720673e-06, "logits/chosen": -0.023764025419950485, "logits/rejected": 0.2361760437488556, "logps/chosen": -1.5750267505645752, "logps/rejected": -2.120636463165283, "loss": 0.7102, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5750267505645752, "rewards/margins": 0.5456094145774841, "rewards/rejected": -2.120636463165283, "sft_loss": 1.624132752418518, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 9.984181907521135, "learning_rate": 2.8224273654498007e-06, "logits/chosen": -0.03629612922668457, "logits/rejected": 0.047414567321538925, "logps/chosen": -1.7118265628814697, "logps/rejected": -1.9818274974822998, "loss": 0.8381, "rewards/accuracies": 0.625, "rewards/chosen": -1.7118265628814697, "rewards/margins": 0.2700011730194092, "rewards/rejected": -1.9818274974822998, "sft_loss": 1.7460143566131592, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 7.1133242656780125, "learning_rate": 2.8202158436314348e-06, "logits/chosen": -0.10063023865222931, "logits/rejected": 0.27981701493263245, "logps/chosen": -1.7411918640136719, "logps/rejected": -2.2518210411071777, "loss": 0.7536, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7411918640136719, "rewards/margins": 0.5106293559074402, "rewards/rejected": -2.2518210411071777, "sft_loss": 1.7656304836273193, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 6.833573511308134, "learning_rate": 2.817991513073163e-06, "logits/chosen": -0.18759292364120483, "logits/rejected": -0.015145483426749706, "logps/chosen": -1.769029974937439, "logps/rejected": -2.3505871295928955, "loss": 0.7549, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.769029974937439, "rewards/margins": 0.5815570950508118, "rewards/rejected": -2.3505871295928955, "sft_loss": 1.8363697528839111, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 4.852027119528442, "learning_rate": 2.8157543953554515e-06, "logits/chosen": -0.10282772779464722, "logits/rejected": 0.0718497559428215, "logps/chosen": -1.6821658611297607, "logps/rejected": -2.2472102642059326, "loss": 0.7334, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6821658611297607, "rewards/margins": 0.5650441646575928, "rewards/rejected": -2.2472102642059326, "sft_loss": 1.7046318054199219, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 18.696305660313005, "learning_rate": 2.813504512182825e-06, "logits/chosen": -0.07099226117134094, "logits/rejected": 0.09033173322677612, "logps/chosen": -1.7145551443099976, "logps/rejected": -2.405233383178711, "loss": 0.7138, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7145551443099976, "rewards/margins": 0.6906784772872925, "rewards/rejected": -2.405233383178711, "sft_loss": 1.7227901220321655, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 5.02265596060493, "learning_rate": 2.811241885383661e-06, "logits/chosen": -0.12921445071697235, "logits/rejected": 0.052244264632463455, "logps/chosen": -1.6103248596191406, "logps/rejected": -2.2286019325256348, "loss": 0.7146, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6103248596191406, "rewards/margins": 0.6182770133018494, "rewards/rejected": -2.2286019325256348, "sft_loss": 1.6762888431549072, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 5.4476182172147585, "learning_rate": 2.8089665369099737e-06, "logits/chosen": -0.14373573660850525, "logits/rejected": 0.03006540611386299, "logps/chosen": -1.6545798778533936, "logps/rejected": -2.0341227054595947, "loss": 0.8217, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6545798778533936, "rewards/margins": 0.37954264879226685, "rewards/rejected": -2.0341227054595947, "sft_loss": 1.6523634195327759, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 11.388814753067097, "learning_rate": 2.806678488837205e-06, "logits/chosen": -0.1133163794875145, "logits/rejected": 0.06256228685379028, "logps/chosen": -1.5948195457458496, "logps/rejected": -2.063190221786499, "loss": 0.7666, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5948195457458496, "rewards/margins": 0.4683706760406494, "rewards/rejected": -2.063190221786499, "sft_loss": 1.6374050378799438, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 6.862074073482828, "learning_rate": 2.804377763364006e-06, "logits/chosen": -0.016701694577932358, "logits/rejected": 0.14885933697223663, "logps/chosen": -1.7333139181137085, "logps/rejected": -2.1773841381073, "loss": 0.7851, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7333139181137085, "rewards/margins": 0.44407039880752563, "rewards/rejected": -2.1773841381073, "sft_loss": 1.7752745151519775, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 6.992114465819229, "learning_rate": 2.8020643828120263e-06, "logits/chosen": 0.005483886227011681, "logits/rejected": 0.1073252409696579, "logps/chosen": -1.7122163772583008, "logps/rejected": -2.009945869445801, "loss": 0.8039, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7122163772583008, "rewards/margins": 0.2977294325828552, "rewards/rejected": -2.009945869445801, "sft_loss": 1.707984209060669, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 6.161097410804302, "learning_rate": 2.799738369625694e-06, "logits/chosen": -0.2201133668422699, "logits/rejected": -0.025361087173223495, "logps/chosen": -1.7411420345306396, "logps/rejected": -2.049943447113037, "loss": 0.7949, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.7411420345306396, "rewards/margins": 0.30880117416381836, "rewards/rejected": -2.049943447113037, "sft_loss": 1.7520923614501953, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 5.8230421438631845, "learning_rate": 2.7973997463719993e-06, "logits/chosen": -0.07737503945827484, "logits/rejected": 0.17626357078552246, "logps/chosen": -1.4855806827545166, "logps/rejected": -1.9931854009628296, "loss": 0.7551, "rewards/accuracies": 0.625, "rewards/chosen": -1.4855806827545166, "rewards/margins": 0.507604718208313, "rewards/rejected": -1.9931854009628296, "sft_loss": 1.5371906757354736, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 7.449046474721441, "learning_rate": 2.7950485357402754e-06, "logits/chosen": -0.0965823158621788, "logits/rejected": 0.13526314496994019, "logps/chosen": -1.5863357782363892, "logps/rejected": -1.9766420125961304, "loss": 0.7637, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5863357782363892, "rewards/margins": 0.390306293964386, "rewards/rejected": -1.9766420125961304, "sft_loss": 1.6442428827285767, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 9.240029352233087, "learning_rate": 2.7926847605419776e-06, "logits/chosen": 0.014005353674292564, "logits/rejected": 0.19778305292129517, "logps/chosen": -1.5677052736282349, "logps/rejected": -1.7215681076049805, "loss": 0.8335, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5677052736282349, "rewards/margins": 0.15386290848255157, "rewards/rejected": -1.7215681076049805, "sft_loss": 1.5489507913589478, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 7.163399271299538, "learning_rate": 2.7903084437104633e-06, "logits/chosen": -0.013630482368171215, "logits/rejected": 0.17022158205509186, "logps/chosen": -1.5031455755233765, "logps/rejected": -2.0510964393615723, "loss": 0.7328, "rewards/accuracies": 0.625, "rewards/chosen": -1.5031455755233765, "rewards/margins": 0.547950804233551, "rewards/rejected": -2.0510964393615723, "sft_loss": 1.5734002590179443, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 6.1120135616231925, "learning_rate": 2.787919608300769e-06, "logits/chosen": -0.031419310718774796, "logits/rejected": 0.11188580840826035, "logps/chosen": -1.590998649597168, "logps/rejected": -2.08305287361145, "loss": 0.7482, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.590998649597168, "rewards/margins": 0.4920540750026703, "rewards/rejected": -2.08305287361145, "sft_loss": 1.6233997344970703, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 9.09816129229086, "learning_rate": 2.785518277489387e-06, "logits/chosen": -0.1941632479429245, "logits/rejected": -0.026421820744872093, "logps/chosen": -1.6917638778686523, "logps/rejected": -2.080397129058838, "loss": 0.7829, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6917638778686523, "rewards/margins": 0.3886331617832184, "rewards/rejected": -2.080397129058838, "sft_loss": 1.698979139328003, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 6.987506345238123, "learning_rate": 2.783104474574038e-06, "logits/chosen": -0.02514765039086342, "logits/rejected": 0.04142391309142113, "logps/chosen": -1.6556907892227173, "logps/rejected": -2.2444541454315186, "loss": 0.7402, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6556907892227173, "rewards/margins": 0.588763415813446, "rewards/rejected": -2.2444541454315186, "sft_loss": 1.7001619338989258, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 5.335480638813095, "learning_rate": 2.7806782229734495e-06, "logits/chosen": -0.1490539163351059, "logits/rejected": -0.008489152416586876, "logps/chosen": -1.6659870147705078, "logps/rejected": -2.0349597930908203, "loss": 0.7977, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6659870147705078, "rewards/margins": 0.3689727187156677, "rewards/rejected": -2.0349597930908203, "sft_loss": 1.715635061264038, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 10.068039319515478, "learning_rate": 2.7782395462271247e-06, "logits/chosen": -0.21546879410743713, "logits/rejected": 0.06337271630764008, "logps/chosen": -1.651076316833496, "logps/rejected": -2.0089659690856934, "loss": 0.7988, "rewards/accuracies": 0.625, "rewards/chosen": -1.651076316833496, "rewards/margins": 0.3578898012638092, "rewards/rejected": -2.0089659690856934, "sft_loss": 1.7310740947723389, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 7.27339121991531, "learning_rate": 2.7757884679951167e-06, "logits/chosen": -0.03317772597074509, "logits/rejected": 0.07568792998790741, "logps/chosen": -1.6018059253692627, "logps/rejected": -1.968252182006836, "loss": 0.8057, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6018059253692627, "rewards/margins": 0.36644667387008667, "rewards/rejected": -1.968252182006836, "sft_loss": 1.6354246139526367, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 6.348609737780041, "learning_rate": 2.7733250120577967e-06, "logits/chosen": -0.13050243258476257, "logits/rejected": 0.08431114256381989, "logps/chosen": -1.5470638275146484, "logps/rejected": -2.0073821544647217, "loss": 0.7476, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5470638275146484, "rewards/margins": 0.46031856536865234, "rewards/rejected": -2.0073821544647217, "sft_loss": 1.5971641540527344, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 5.868396696875582, "learning_rate": 2.770849202315625e-06, "logits/chosen": -0.11732141673564911, "logits/rejected": 0.11058878898620605, "logps/chosen": -1.561550498008728, "logps/rejected": -2.060333728790283, "loss": 0.7369, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.561550498008728, "rewards/margins": 0.4987828731536865, "rewards/rejected": -2.060333728790283, "sft_loss": 1.5517923831939697, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 6.244212935607668, "learning_rate": 2.768361062788919e-06, "logits/chosen": -0.06370851397514343, "logits/rejected": 0.10079216957092285, "logps/chosen": -1.6988623142242432, "logps/rejected": -2.111222743988037, "loss": 0.7808, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6988623142242432, "rewards/margins": 0.4123605191707611, "rewards/rejected": -2.111222743988037, "sft_loss": 1.7255029678344727, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 6.201364762208477, "learning_rate": 2.7658606176176186e-06, "logits/chosen": -0.12157303094863892, "logits/rejected": -0.08812706172466278, "logps/chosen": -1.6759192943572998, "logps/rejected": -2.098238706588745, "loss": 0.7934, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6759192943572998, "rewards/margins": 0.4223194122314453, "rewards/rejected": -2.098238706588745, "sft_loss": 1.7222354412078857, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 5.880804483864086, "learning_rate": 2.763347891061054e-06, "logits/chosen": -0.16590115427970886, "logits/rejected": 0.06696538627147675, "logps/chosen": -1.616180658340454, "logps/rejected": -2.0431764125823975, "loss": 0.7687, "rewards/accuracies": 0.625, "rewards/chosen": -1.616180658340454, "rewards/margins": 0.4269959032535553, "rewards/rejected": -2.0431764125823975, "sft_loss": 1.6643760204315186, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 5.8402211520482545, "learning_rate": 2.7608229074977103e-06, "logits/chosen": 0.031211012974381447, "logits/rejected": 0.17653919756412506, "logps/chosen": -1.5481231212615967, "logps/rejected": -2.1435084342956543, "loss": 0.7344, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5481231212615967, "rewards/margins": 0.5953856110572815, "rewards/rejected": -2.1435084342956543, "sft_loss": 1.5866037607192993, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 7.676572876305737, "learning_rate": 2.758285691424988e-06, "logits/chosen": -0.013167837634682655, "logits/rejected": 0.19646652042865753, "logps/chosen": -1.597267508506775, "logps/rejected": -2.110203266143799, "loss": 0.7468, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.597267508506775, "rewards/margins": 0.5129359364509583, "rewards/rejected": -2.110203266143799, "sft_loss": 1.5865503549575806, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 6.985150266196622, "learning_rate": 2.7557362674589687e-06, "logits/chosen": -0.09934534132480621, "logits/rejected": 0.03581539914011955, "logps/chosen": -1.5587165355682373, "logps/rejected": -2.0029618740081787, "loss": 0.7548, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5587165355682373, "rewards/margins": 0.44424518942832947, "rewards/rejected": -2.0029618740081787, "sft_loss": 1.5543994903564453, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 8.482719831775292, "learning_rate": 2.753174660334175e-06, "logits/chosen": -0.07365381717681885, "logits/rejected": 0.04320163652300835, "logps/chosen": -1.8012605905532837, "logps/rejected": -2.065605640411377, "loss": 0.8401, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.8012605905532837, "rewards/margins": 0.26434528827667236, "rewards/rejected": -2.065605640411377, "sft_loss": 1.815129280090332, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 9.343651805064166, "learning_rate": 2.750600894903331e-06, "logits/chosen": -0.1390191912651062, "logits/rejected": -0.007775820791721344, "logps/chosen": -1.685133695602417, "logps/rejected": -2.1161274909973145, "loss": 0.811, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.685133695602417, "rewards/margins": 0.4309937357902527, "rewards/rejected": -2.1161274909973145, "sft_loss": 1.7652587890625, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 6.14800348144563, "learning_rate": 2.7480149961371194e-06, "logits/chosen": -0.028079237788915634, "logits/rejected": 0.07078806310892105, "logps/chosen": -1.6217378377914429, "logps/rejected": -2.157701253890991, "loss": 0.7215, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6217378377914429, "rewards/margins": 0.5359634160995483, "rewards/rejected": -2.157701253890991, "sft_loss": 1.6147205829620361, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 4.329378205807125, "learning_rate": 2.745416989123942e-06, "logits/chosen": -0.07498262822628021, "logits/rejected": 0.3165915608406067, "logps/chosen": -1.6001355648040771, "logps/rejected": -2.1155784130096436, "loss": 0.7218, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6001355648040771, "rewards/margins": 0.5154424905776978, "rewards/rejected": -2.1155784130096436, "sft_loss": 1.60519278049469, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 6.007531073170756, "learning_rate": 2.7428068990696735e-06, "logits/chosen": 0.04040871933102608, "logits/rejected": 0.12215709686279297, "logps/chosen": -1.5383670330047607, "logps/rejected": -1.989495038986206, "loss": 0.7401, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5383670330047607, "rewards/margins": 0.45112818479537964, "rewards/rejected": -1.989495038986206, "sft_loss": 1.5585277080535889, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 6.64268477918211, "learning_rate": 2.7401847512974194e-06, "logits/chosen": 0.011279207654297352, "logits/rejected": 0.12318231165409088, "logps/chosen": -1.601244568824768, "logps/rejected": -2.068206310272217, "loss": 0.7514, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.601244568824768, "rewards/margins": 0.46696168184280396, "rewards/rejected": -2.068206310272217, "sft_loss": 1.7155154943466187, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 5.734341288848589, "learning_rate": 2.7375505712472695e-06, "logits/chosen": -0.02312772534787655, "logits/rejected": 0.23983144760131836, "logps/chosen": -1.6255518198013306, "logps/rejected": -2.0686745643615723, "loss": 0.7814, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6255518198013306, "rewards/margins": 0.4431230425834656, "rewards/rejected": -2.0686745643615723, "sft_loss": 1.6115245819091797, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 10.20412258403517, "learning_rate": 2.734904384476049e-06, "logits/chosen": -0.01249808631837368, "logits/rejected": 0.1528748869895935, "logps/chosen": -1.6915109157562256, "logps/rejected": -2.112687587738037, "loss": 0.7771, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.6915109157562256, "rewards/margins": 0.4211767613887787, "rewards/rejected": -2.112687587738037, "sft_loss": 1.6461522579193115, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 7.623823205688982, "learning_rate": 2.732246216657075e-06, "logits/chosen": -0.01207329798489809, "logits/rejected": 0.24696707725524902, "logps/chosen": -1.6187397241592407, "logps/rejected": -2.0573651790618896, "loss": 0.7527, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6187397241592407, "rewards/margins": 0.43862539529800415, "rewards/rejected": -2.0573651790618896, "sft_loss": 1.648451805114746, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 8.944893339121828, "learning_rate": 2.729576093579902e-06, "logits/chosen": 0.03348899632692337, "logits/rejected": 0.25046640634536743, "logps/chosen": -1.6148052215576172, "logps/rejected": -2.2734577655792236, "loss": 0.7025, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6148052215576172, "rewards/margins": 0.6586524248123169, "rewards/rejected": -2.2734577655792236, "sft_loss": 1.6397300958633423, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 6.307415239596535, "learning_rate": 2.726894041150077e-06, "logits/chosen": 0.03806239366531372, "logits/rejected": 0.25313812494277954, "logps/chosen": -1.619058609008789, "logps/rejected": -2.052539348602295, "loss": 0.7735, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.619058609008789, "rewards/margins": 0.4334811270236969, "rewards/rejected": -2.052539348602295, "sft_loss": 1.6786972284317017, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 7.4342369032944555, "learning_rate": 2.7242000853888833e-06, "logits/chosen": -0.17027874290943146, "logits/rejected": 0.17580825090408325, "logps/chosen": -1.7059392929077148, "logps/rejected": -2.3058700561523438, "loss": 0.7296, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7059392929077148, "rewards/margins": 0.5999307036399841, "rewards/rejected": -2.3058700561523438, "sft_loss": 1.739965796470642, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 7.867776592990231, "learning_rate": 2.7214942524330918e-06, "logits/chosen": -0.1580534279346466, "logits/rejected": 0.19921769201755524, "logps/chosen": -1.7531449794769287, "logps/rejected": -2.4730262756347656, "loss": 0.7485, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7531449794769287, "rewards/margins": 0.7198811769485474, "rewards/rejected": -2.4730262756347656, "sft_loss": 1.7198894023895264, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 8.211176392099416, "learning_rate": 2.7187765685347063e-06, "logits/chosen": -0.014283919706940651, "logits/rejected": 0.07657746970653534, "logps/chosen": -1.8897743225097656, "logps/rejected": -2.3584890365600586, "loss": 0.8054, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.8897743225097656, "rewards/margins": 0.46871480345726013, "rewards/rejected": -2.3584890365600586, "sft_loss": 1.9435539245605469, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 6.126682030455825, "learning_rate": 2.7160470600607076e-06, "logits/chosen": -0.01688101515173912, "logits/rejected": 0.08603396266698837, "logps/chosen": -1.7661564350128174, "logps/rejected": -2.2313408851623535, "loss": 0.7853, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7661564350128174, "rewards/margins": 0.46518421173095703, "rewards/rejected": -2.2313408851623535, "sft_loss": 1.8172565698623657, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 8.24892072870615, "learning_rate": 2.7133057534927986e-06, "logits/chosen": 0.03328476846218109, "logits/rejected": 0.078876793384552, "logps/chosen": -1.6054766178131104, "logps/rejected": -1.9629688262939453, "loss": 0.7886, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6054766178131104, "rewards/margins": 0.3574923872947693, "rewards/rejected": -1.9629688262939453, "sft_loss": 1.6771936416625977, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 5.792199888523707, "learning_rate": 2.710552675427148e-06, "logits/chosen": 0.02124195173382759, "logits/rejected": 0.15871360898017883, "logps/chosen": -1.5020825862884521, "logps/rejected": -1.7976001501083374, "loss": 0.7749, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5020825862884521, "rewards/margins": 0.2955175042152405, "rewards/rejected": -1.7976001501083374, "sft_loss": 1.5216575860977173, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 7.840732618913243, "learning_rate": 2.707787852574131e-06, "logits/chosen": 0.045780718326568604, "logits/rejected": 0.37935394048690796, "logps/chosen": -1.541059970855713, "logps/rejected": -1.887292504310608, "loss": 0.7606, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.541059970855713, "rewards/margins": 0.3462323546409607, "rewards/rejected": -1.887292504310608, "sft_loss": 1.5794686079025269, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 5.1328056249816685, "learning_rate": 2.7050113117580716e-06, "logits/chosen": 0.0036704824306070805, "logits/rejected": 0.23880800604820251, "logps/chosen": -1.5183892250061035, "logps/rejected": -1.9667367935180664, "loss": 0.7293, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5183892250061035, "rewards/margins": 0.44834762811660767, "rewards/rejected": -1.9667367935180664, "sft_loss": 1.5417373180389404, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 8.270899666456597, "learning_rate": 2.70222307991698e-06, "logits/chosen": -0.07949081808328629, "logits/rejected": 0.018253307789564133, "logps/chosen": -1.5519993305206299, "logps/rejected": -1.8914324045181274, "loss": 0.7772, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5519993305206299, "rewards/margins": 0.3394331634044647, "rewards/rejected": -1.8914324045181274, "sft_loss": 1.6139189004898071, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 8.178223146247955, "learning_rate": 2.6994231841022947e-06, "logits/chosen": 0.028050830587744713, "logits/rejected": 0.12468210607767105, "logps/chosen": -1.7248175144195557, "logps/rejected": -1.9740760326385498, "loss": 0.8437, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7248175144195557, "rewards/margins": 0.24925847351551056, "rewards/rejected": -1.9740760326385498, "sft_loss": 1.7152729034423828, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 5.537793636346068, "learning_rate": 2.6966116514786166e-06, "logits/chosen": -0.11753810942173004, "logits/rejected": 0.15751810371875763, "logps/chosen": -1.6346890926361084, "logps/rejected": -2.2291836738586426, "loss": 0.7152, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6346890926361084, "rewards/margins": 0.5944945216178894, "rewards/rejected": -2.2291836738586426, "sft_loss": 1.6563717126846313, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.46628129482269287, "eval_logits/rejected": 0.6004635691642761, "eval_logps/chosen": -1.6570273637771606, "eval_logps/rejected": -2.1579315662384033, "eval_loss": 0.7451155781745911, "eval_rewards/accuracies": 0.6795251965522766, "eval_rewards/chosen": -1.6570273637771606, "eval_rewards/margins": 0.5009041428565979, "eval_rewards/rejected": -2.1579315662384033, "eval_runtime": 44.5394, "eval_samples_per_second": 30.198, "eval_sft_loss": 1.6680430173873901, "eval_steps_per_second": 7.566, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 6.601498318061748, "learning_rate": 2.6937885093234477e-06, "logits/chosen": -0.11071095615625381, "logits/rejected": 0.22233088314533234, "logps/chosen": -1.6684538125991821, "logps/rejected": -2.2091896533966064, "loss": 0.7292, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6684538125991821, "rewards/margins": 0.5407360792160034, "rewards/rejected": -2.2091896533966064, "sft_loss": 1.6963183879852295, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 6.60084492891464, "learning_rate": 2.6909537850269256e-06, "logits/chosen": -0.10238151252269745, "logits/rejected": 0.16353382170200348, "logps/chosen": -1.6431251764297485, "logps/rejected": -2.2469396591186523, "loss": 0.7289, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6431251764297485, "rewards/margins": 0.6038146018981934, "rewards/rejected": -2.2469396591186523, "sft_loss": 1.7117202281951904, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 10.47007979184553, "learning_rate": 2.688107506091558e-06, "logits/chosen": 0.021958637982606888, "logits/rejected": 0.21035261452198029, "logps/chosen": -1.7854686975479126, "logps/rejected": -2.2965052127838135, "loss": 0.8082, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7854686975479126, "rewards/margins": 0.5110365152359009, "rewards/rejected": -2.2965052127838135, "sft_loss": 1.8058786392211914, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 6.514639401513543, "learning_rate": 2.6852497001319555e-06, "logits/chosen": 0.053987015038728714, "logits/rejected": 0.2950461804866791, "logps/chosen": -1.567812204360962, "logps/rejected": -2.0840001106262207, "loss": 0.7485, "rewards/accuracies": 0.65625, "rewards/chosen": -1.567812204360962, "rewards/margins": 0.5161879658699036, "rewards/rejected": -2.0840001106262207, "sft_loss": 1.611684799194336, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 8.313247489947376, "learning_rate": 2.682380394874564e-06, "logits/chosen": 0.1495714783668518, "logits/rejected": 0.21326008439064026, "logps/chosen": -1.6688334941864014, "logps/rejected": -1.993452787399292, "loss": 0.8008, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6688334941864014, "rewards/margins": 0.32461923360824585, "rewards/rejected": -1.993452787399292, "sft_loss": 1.6152689456939697, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 6.079190187498352, "learning_rate": 2.6794996181573953e-06, "logits/chosen": 0.08869186043739319, "logits/rejected": 0.32504457235336304, "logps/chosen": -1.5763978958129883, "logps/rejected": -1.9891843795776367, "loss": 0.7714, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5763978958129883, "rewards/margins": 0.41278642416000366, "rewards/rejected": -1.9891843795776367, "sft_loss": 1.5958096981048584, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 7.231629336566373, "learning_rate": 2.6766073979297584e-06, "logits/chosen": -0.017764057964086533, "logits/rejected": 0.1814512312412262, "logps/chosen": -1.5102531909942627, "logps/rejected": -2.041884422302246, "loss": 0.7378, "rewards/accuracies": 0.625, "rewards/chosen": -1.5102531909942627, "rewards/margins": 0.5316312909126282, "rewards/rejected": -2.041884422302246, "sft_loss": 1.5394649505615234, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 6.9019460717918895, "learning_rate": 2.6737037622519866e-06, "logits/chosen": 0.023942243307828903, "logits/rejected": 0.23042920231819153, "logps/chosen": -1.5446743965148926, "logps/rejected": -2.143510580062866, "loss": 0.7289, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5446743965148926, "rewards/margins": 0.5988361239433289, "rewards/rejected": -2.143510580062866, "sft_loss": 1.5816099643707275, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 7.135847127478541, "learning_rate": 2.670788739295166e-06, "logits/chosen": 0.11366844177246094, "logits/rejected": 0.2004258632659912, "logps/chosen": -1.556762456893921, "logps/rejected": -1.9616931676864624, "loss": 0.7568, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.556762456893921, "rewards/margins": 0.40493065118789673, "rewards/rejected": -1.9616931676864624, "sft_loss": 1.5798929929733276, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 8.48477994398459, "learning_rate": 2.6678623573408613e-06, "logits/chosen": 0.19071733951568604, "logits/rejected": 0.287414014339447, "logps/chosen": -1.592995524406433, "logps/rejected": -2.1026222705841064, "loss": 0.7272, "rewards/accuracies": 0.6875, "rewards/chosen": -1.592995524406433, "rewards/margins": 0.5096268057823181, "rewards/rejected": -2.1026222705841064, "sft_loss": 1.5550458431243896, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 7.666285597766726, "learning_rate": 2.664924644780844e-06, "logits/chosen": 0.09412811696529388, "logits/rejected": 0.25759047269821167, "logps/chosen": -1.665856957435608, "logps/rejected": -2.1644914150238037, "loss": 0.7627, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.665856957435608, "rewards/margins": 0.498634397983551, "rewards/rejected": -2.1644914150238037, "sft_loss": 1.6397167444229126, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 8.385920510432712, "learning_rate": 2.661975630116813e-06, "logits/chosen": 0.15583749115467072, "logits/rejected": 0.21677199006080627, "logps/chosen": -1.567179560661316, "logps/rejected": -2.111881971359253, "loss": 0.7222, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.567179560661316, "rewards/margins": 0.544702410697937, "rewards/rejected": -2.111881971359253, "sft_loss": 1.5149648189544678, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 7.318357315401352, "learning_rate": 2.6590153419601236e-06, "logits/chosen": 0.17675212025642395, "logits/rejected": 0.273038387298584, "logps/chosen": -1.7771793603897095, "logps/rejected": -2.1778042316436768, "loss": 0.8118, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7771793603897095, "rewards/margins": 0.40062466263771057, "rewards/rejected": -2.1778042316436768, "sft_loss": 1.7589333057403564, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 6.249906925635093, "learning_rate": 2.656043809031503e-06, "logits/chosen": 0.15232793986797333, "logits/rejected": 0.41597262024879456, "logps/chosen": -1.7666094303131104, "logps/rejected": -2.173813819885254, "loss": 0.8284, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.7666094303131104, "rewards/margins": 0.4072045683860779, "rewards/rejected": -2.173813819885254, "sft_loss": 1.6762611865997314, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 8.986403126194638, "learning_rate": 2.6530610601607764e-06, "logits/chosen": 0.1481233537197113, "logits/rejected": 0.4590158462524414, "logps/chosen": -1.7123301029205322, "logps/rejected": -2.2884089946746826, "loss": 0.7548, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7123301029205322, "rewards/margins": 0.5760786533355713, "rewards/rejected": -2.2884089946746826, "sft_loss": 1.7525371313095093, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 7.50493689044621, "learning_rate": 2.6500671242865877e-06, "logits/chosen": 0.04240292310714722, "logits/rejected": 0.22901475429534912, "logps/chosen": -1.7242082357406616, "logps/rejected": -2.1248300075531006, "loss": 0.7762, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.7242082357406616, "rewards/margins": 0.40062183141708374, "rewards/rejected": -2.1248300075531006, "sft_loss": 1.7412922382354736, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 7.410719554486841, "learning_rate": 2.6470620304561147e-06, "logits/chosen": 0.1224837675690651, "logits/rejected": 0.4860759675502777, "logps/chosen": -1.6327342987060547, "logps/rejected": -2.1257483959198, "loss": 0.7689, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6327342987060547, "rewards/margins": 0.4930140972137451, "rewards/rejected": -2.1257483959198, "sft_loss": 1.6574312448501587, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 7.020214850908995, "learning_rate": 2.6440458078247914e-06, "logits/chosen": 0.14286457002162933, "logits/rejected": 0.45216941833496094, "logps/chosen": -1.5427910089492798, "logps/rejected": -2.09596586227417, "loss": 0.7161, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5427910089492798, "rewards/margins": 0.553174614906311, "rewards/rejected": -2.09596586227417, "sft_loss": 1.6083276271820068, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 11.83409335374771, "learning_rate": 2.641018485656023e-06, "logits/chosen": 0.004470625426620245, "logits/rejected": 0.2434651106595993, "logps/chosen": -1.6732524633407593, "logps/rejected": -2.092607021331787, "loss": 0.7896, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6732524633407593, "rewards/margins": 0.41935428977012634, "rewards/rejected": -2.092607021331787, "sft_loss": 1.71701180934906, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 6.48895966929563, "learning_rate": 2.6379800933209028e-06, "logits/chosen": 0.2479771375656128, "logits/rejected": 0.1528288722038269, "logps/chosen": -1.6353261470794678, "logps/rejected": -1.9076725244522095, "loss": 0.8118, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6353261470794678, "rewards/margins": 0.2723463475704193, "rewards/rejected": -1.9076725244522095, "sft_loss": 1.6776100397109985, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 8.304131930398496, "learning_rate": 2.634930660297926e-06, "logits/chosen": 0.2298423945903778, "logits/rejected": 0.45017942786216736, "logps/chosen": -1.585171103477478, "logps/rejected": -2.002087116241455, "loss": 0.7519, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.585171103477478, "rewards/margins": 0.4169161915779114, "rewards/rejected": -2.002087116241455, "sft_loss": 1.6092761754989624, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 7.157798481884509, "learning_rate": 2.631870216172705e-06, "logits/chosen": 0.20391225814819336, "logits/rejected": 0.35752061009407043, "logps/chosen": -1.6702439785003662, "logps/rejected": -2.085132122039795, "loss": 0.7718, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6702439785003662, "rewards/margins": 0.4148883819580078, "rewards/rejected": -2.085132122039795, "sft_loss": 1.7260020971298218, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 13.800397931675189, "learning_rate": 2.6287987906376834e-06, "logits/chosen": 0.18047472834587097, "logits/rejected": 0.4845646023750305, "logps/chosen": -1.7837867736816406, "logps/rejected": -2.2137465476989746, "loss": 0.8152, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7837867736816406, "rewards/margins": 0.42995983362197876, "rewards/rejected": -2.2137465476989746, "sft_loss": 1.7706577777862549, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 7.729179220447571, "learning_rate": 2.6257164134918435e-06, "logits/chosen": 0.22672733664512634, "logits/rejected": 0.3549592196941376, "logps/chosen": -1.5702648162841797, "logps/rejected": -2.2024621963500977, "loss": 0.7135, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5702648162841797, "rewards/margins": 0.6321974992752075, "rewards/rejected": -2.2024621963500977, "sft_loss": 1.6143814325332642, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 4.679694118516009, "learning_rate": 2.622623114640423e-06, "logits/chosen": 0.28641635179519653, "logits/rejected": 0.4558026194572449, "logps/chosen": -1.6455614566802979, "logps/rejected": -2.2941269874572754, "loss": 0.7082, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6455614566802979, "rewards/margins": 0.6485655903816223, "rewards/rejected": -2.2941269874572754, "sft_loss": 1.7281744480133057, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 6.28941507290267, "learning_rate": 2.6195189240946205e-06, "logits/chosen": 0.32530251145362854, "logits/rejected": 0.41650062799453735, "logps/chosen": -1.6033023595809937, "logps/rejected": -1.9561445713043213, "loss": 0.7921, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6033023595809937, "rewards/margins": 0.35284224152565, "rewards/rejected": -1.9561445713043213, "sft_loss": 1.6567922830581665, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 6.87768035184655, "learning_rate": 2.6164038719713065e-06, "logits/chosen": 0.011040126904845238, "logits/rejected": 0.3350133001804352, "logps/chosen": -1.5925350189208984, "logps/rejected": -2.326432228088379, "loss": 0.7131, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5925350189208984, "rewards/margins": 0.73389732837677, "rewards/rejected": -2.326432228088379, "sft_loss": 1.5929383039474487, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 9.636323633968454, "learning_rate": 2.6132779884927303e-06, "logits/chosen": 0.07610784471035004, "logits/rejected": 0.3584909737110138, "logps/chosen": -1.6358665227890015, "logps/rejected": -2.1494953632354736, "loss": 0.7415, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6358665227890015, "rewards/margins": 0.5136286616325378, "rewards/rejected": -2.1494953632354736, "sft_loss": 1.5950847864151, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 7.027797013049016, "learning_rate": 2.6101413039862274e-06, "logits/chosen": 0.23933513462543488, "logits/rejected": 0.28316718339920044, "logps/chosen": -1.6864540576934814, "logps/rejected": -2.1790852546691895, "loss": 0.7679, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6864540576934814, "rewards/margins": 0.49263089895248413, "rewards/rejected": -2.1790852546691895, "sft_loss": 1.7424131631851196, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 12.721083258187122, "learning_rate": 2.606993848883924e-06, "logits/chosen": 0.09545941650867462, "logits/rejected": 0.19137129187583923, "logps/chosen": -1.8484070301055908, "logps/rejected": -2.3486435413360596, "loss": 0.8024, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.8484070301055908, "rewards/margins": 0.5002365708351135, "rewards/rejected": -2.3486435413360596, "sft_loss": 1.8460218906402588, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 8.609108934546553, "learning_rate": 2.6038356537224433e-06, "logits/chosen": 0.06818292289972305, "logits/rejected": 0.2434753179550171, "logps/chosen": -1.664947509765625, "logps/rejected": -2.1902339458465576, "loss": 0.7376, "rewards/accuracies": 0.71875, "rewards/chosen": -1.664947509765625, "rewards/margins": 0.5252864956855774, "rewards/rejected": -2.1902339458465576, "sft_loss": 1.6970077753067017, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 7.1753681171926935, "learning_rate": 2.6006667491426098e-06, "logits/chosen": 0.09856332838535309, "logits/rejected": 0.31549859046936035, "logps/chosen": -1.640303373336792, "logps/rejected": -2.1187996864318848, "loss": 0.7552, "rewards/accuracies": 0.65625, "rewards/chosen": -1.640303373336792, "rewards/margins": 0.47849640250205994, "rewards/rejected": -2.1187996864318848, "sft_loss": 1.704172134399414, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 9.577984645970636, "learning_rate": 2.5974871658891483e-06, "logits/chosen": 0.15941819548606873, "logits/rejected": 0.21040339767932892, "logps/chosen": -1.6682817935943604, "logps/rejected": -2.1734302043914795, "loss": 0.7621, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6682817935943604, "rewards/margins": 0.5051483511924744, "rewards/rejected": -2.1734302043914795, "sft_loss": 1.6816104650497437, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 7.484776619651517, "learning_rate": 2.59429693481039e-06, "logits/chosen": 0.1602596491575241, "logits/rejected": 0.38191017508506775, "logps/chosen": -1.6603100299835205, "logps/rejected": -2.0156192779541016, "loss": 0.787, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6603100299835205, "rewards/margins": 0.355309396982193, "rewards/rejected": -2.0156192779541016, "sft_loss": 1.7396914958953857, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 5.852626179114656, "learning_rate": 2.5910960868579707e-06, "logits/chosen": 0.08808886259794235, "logits/rejected": 0.24856379628181458, "logps/chosen": -1.573021411895752, "logps/rejected": -2.024200916290283, "loss": 0.748, "rewards/accuracies": 0.65625, "rewards/chosen": -1.573021411895752, "rewards/margins": 0.451179563999176, "rewards/rejected": -2.024200916290283, "sft_loss": 1.6066830158233643, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 5.772519203781361, "learning_rate": 2.5878846530865316e-06, "logits/chosen": 0.04618818685412407, "logits/rejected": 0.22156529128551483, "logps/chosen": -1.5426143407821655, "logps/rejected": -1.9907680749893188, "loss": 0.7462, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5426143407821655, "rewards/margins": 0.4481539726257324, "rewards/rejected": -1.9907680749893188, "sft_loss": 1.5462530851364136, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 6.2206325351188125, "learning_rate": 2.584662664653417e-06, "logits/chosen": 0.1688314825296402, "logits/rejected": 0.2749001383781433, "logps/chosen": -1.5096161365509033, "logps/rejected": -1.8245433568954468, "loss": 0.7684, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5096161365509033, "rewards/margins": 0.31492722034454346, "rewards/rejected": -1.8245433568954468, "sft_loss": 1.5106897354125977, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 6.730515249486428, "learning_rate": 2.5814301528183724e-06, "logits/chosen": 0.2601849436759949, "logits/rejected": 0.36312350630760193, "logps/chosen": -1.6283996105194092, "logps/rejected": -1.9735686779022217, "loss": 0.7775, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6283996105194092, "rewards/margins": 0.3451687693595886, "rewards/rejected": -1.9735686779022217, "sft_loss": 1.647060751914978, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 7.436456784173762, "learning_rate": 2.5781871489432425e-06, "logits/chosen": 0.04569276422262192, "logits/rejected": 0.25311240553855896, "logps/chosen": -1.5339930057525635, "logps/rejected": -2.056906223297119, "loss": 0.7323, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5339930057525635, "rewards/margins": 0.5229132175445557, "rewards/rejected": -2.056906223297119, "sft_loss": 1.5986318588256836, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 6.760066420168773, "learning_rate": 2.5749336844916644e-06, "logits/chosen": 0.04725943133234978, "logits/rejected": 0.14323343336582184, "logps/chosen": -1.6129106283187866, "logps/rejected": -2.0528836250305176, "loss": 0.7555, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6129106283187866, "rewards/margins": 0.43997272849082947, "rewards/rejected": -2.0528836250305176, "sft_loss": 1.6782392263412476, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 9.43713410579148, "learning_rate": 2.5716697910287653e-06, "logits/chosen": -0.09104229509830475, "logits/rejected": 0.10284809023141861, "logps/chosen": -1.605081558227539, "logps/rejected": -2.299532890319824, "loss": 0.695, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.605081558227539, "rewards/margins": 0.6944512128829956, "rewards/rejected": -2.299532890319824, "sft_loss": 1.6521618366241455, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 5.794534536309435, "learning_rate": 2.5683955002208533e-06, "logits/chosen": 0.1189044937491417, "logits/rejected": 0.3134990334510803, "logps/chosen": -1.6299169063568115, "logps/rejected": -2.1406655311584473, "loss": 0.7279, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6299169063568115, "rewards/margins": 0.5107485055923462, "rewards/rejected": -2.1406655311584473, "sft_loss": 1.6376746892929077, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 8.353254849903834, "learning_rate": 2.5651108438351125e-06, "logits/chosen": 0.058274924755096436, "logits/rejected": 0.3000137209892273, "logps/chosen": -1.657879114151001, "logps/rejected": -2.141011953353882, "loss": 0.7511, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.657879114151001, "rewards/margins": 0.48313283920288086, "rewards/rejected": -2.141011953353882, "sft_loss": 1.7078145742416382, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 9.533470110124561, "learning_rate": 2.5618158537392933e-06, "logits/chosen": 0.09321188926696777, "logits/rejected": 0.18731586635112762, "logps/chosen": -1.7326152324676514, "logps/rejected": -2.2076475620269775, "loss": 0.7547, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7326152324676514, "rewards/margins": 0.475032240152359, "rewards/rejected": -2.2076475620269775, "sft_loss": 1.6305125951766968, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 7.706042556473839, "learning_rate": 2.5585105619014042e-06, "logits/chosen": -0.011135217733681202, "logits/rejected": 0.25493383407592773, "logps/chosen": -1.6696735620498657, "logps/rejected": -2.350343942642212, "loss": 0.726, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6696735620498657, "rewards/margins": 0.6806705594062805, "rewards/rejected": -2.350343942642212, "sft_loss": 1.7117812633514404, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 5.795157279285816, "learning_rate": 2.555195000389401e-06, "logits/chosen": 0.1898316591978073, "logits/rejected": 0.2543187737464905, "logps/chosen": -1.7861602306365967, "logps/rejected": -2.1704885959625244, "loss": 0.7909, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.7861602306365967, "rewards/margins": 0.38432830572128296, "rewards/rejected": -2.1704885959625244, "sft_loss": 1.7805356979370117, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 8.683569554952136, "learning_rate": 2.5518692013708764e-06, "logits/chosen": 0.04368092864751816, "logits/rejected": 0.16964897513389587, "logps/chosen": -1.7581827640533447, "logps/rejected": -2.094151020050049, "loss": 0.7886, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7581827640533447, "rewards/margins": 0.33596810698509216, "rewards/rejected": -2.094151020050049, "sft_loss": 1.794694185256958, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 7.225102717341863, "learning_rate": 2.5485331971127467e-06, "logits/chosen": 0.07228956371545792, "logits/rejected": 0.2951813042163849, "logps/chosen": -1.7531318664550781, "logps/rejected": -2.31132435798645, "loss": 0.7368, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7531318664550781, "rewards/margins": 0.5581925511360168, "rewards/rejected": -2.31132435798645, "sft_loss": 1.783738374710083, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 12.923394962710496, "learning_rate": 2.5451870199809398e-06, "logits/chosen": 0.10603566467761993, "logits/rejected": 0.23745813965797424, "logps/chosen": -1.676622748374939, "logps/rejected": -2.164050817489624, "loss": 0.7748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.676622748374939, "rewards/margins": 0.4874279499053955, "rewards/rejected": -2.164050817489624, "sft_loss": 1.7047998905181885, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 10.201943390793542, "learning_rate": 2.5418307024400808e-06, "logits/chosen": -0.11501812934875488, "logits/rejected": 0.07067526131868362, "logps/chosen": -1.8136136531829834, "logps/rejected": -2.133671522140503, "loss": 0.8336, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8136136531829834, "rewards/margins": 0.32005780935287476, "rewards/rejected": -2.133671522140503, "sft_loss": 1.7607166767120361, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 9.19390158695412, "learning_rate": 2.538464277053178e-06, "logits/chosen": 0.030479732900857925, "logits/rejected": 0.22223253548145294, "logps/chosen": -1.6502141952514648, "logps/rejected": -2.242980480194092, "loss": 0.753, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6502141952514648, "rewards/margins": 0.5927663445472717, "rewards/rejected": -2.242980480194092, "sft_loss": 1.6727278232574463, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 9.843198268142263, "learning_rate": 2.5350877764813042e-06, "logits/chosen": 0.046414703130722046, "logits/rejected": 0.1939028948545456, "logps/chosen": -1.8641245365142822, "logps/rejected": -2.352881908416748, "loss": 0.7795, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8641245365142822, "rewards/margins": 0.48875731229782104, "rewards/rejected": -2.352881908416748, "sft_loss": 1.8196094036102295, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 7.2829821078345045, "learning_rate": 2.531701233483284e-06, "logits/chosen": 0.020082779228687286, "logits/rejected": 0.1269410252571106, "logps/chosen": -1.678328514099121, "logps/rejected": -2.256639003753662, "loss": 0.7472, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.678328514099121, "rewards/margins": 0.5783103704452515, "rewards/rejected": -2.256639003753662, "sft_loss": 1.707733154296875, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 7.79692306989129, "learning_rate": 2.5283046809153708e-06, "logits/chosen": -0.010091030970215797, "logits/rejected": 0.2002343237400055, "logps/chosen": -1.7502202987670898, "logps/rejected": -2.349459409713745, "loss": 0.7276, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7502202987670898, "rewards/margins": 0.5992392301559448, "rewards/rejected": -2.349459409713745, "sft_loss": 1.7806326150894165, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 8.722278298424193, "learning_rate": 2.524898151730934e-06, "logits/chosen": -0.06084078550338745, "logits/rejected": 0.14076320827007294, "logps/chosen": -1.6607754230499268, "logps/rejected": -2.188734292984009, "loss": 0.727, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6607754230499268, "rewards/margins": 0.5279589891433716, "rewards/rejected": -2.188734292984009, "sft_loss": 1.6254417896270752, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 9.050633472129125, "learning_rate": 2.5214816789801337e-06, "logits/chosen": 0.08807148039340973, "logits/rejected": 0.3059306740760803, "logps/chosen": -1.6033060550689697, "logps/rejected": -2.382601737976074, "loss": 0.6675, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6033060550689697, "rewards/margins": 0.7792957425117493, "rewards/rejected": -2.382601737976074, "sft_loss": 1.6539281606674194, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 6.391447041119939, "learning_rate": 2.518055295809604e-06, "logits/chosen": 0.034144409000873566, "logits/rejected": 0.13075736165046692, "logps/chosen": -1.551403284072876, "logps/rejected": -2.2758407592773438, "loss": 0.6782, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.551403284072876, "rewards/margins": 0.7244374752044678, "rewards/rejected": -2.2758407592773438, "sft_loss": 1.5701818466186523, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 7.35968853019176, "learning_rate": 2.5146190354621295e-06, "logits/chosen": -0.05034959316253662, "logits/rejected": 0.23992769420146942, "logps/chosen": -1.6645405292510986, "logps/rejected": -2.3523929119110107, "loss": 0.6936, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6645405292510986, "rewards/margins": 0.6878524422645569, "rewards/rejected": -2.3523929119110107, "sft_loss": 1.8230438232421875, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 9.065484153335452, "learning_rate": 2.511172931276323e-06, "logits/chosen": 0.0478929728269577, "logits/rejected": 0.11166934669017792, "logps/chosen": -1.6970382928848267, "logps/rejected": -2.2658262252807617, "loss": 0.7139, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6970382928848267, "rewards/margins": 0.5687879323959351, "rewards/rejected": -2.2658262252807617, "sft_loss": 1.7181049585342407, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 6.836928685094494, "learning_rate": 2.5077170166863026e-06, "logits/chosen": -0.17804375290870667, "logits/rejected": 0.23447522521018982, "logps/chosen": -1.707627534866333, "logps/rejected": -2.4188055992126465, "loss": 0.7031, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.707627534866333, "rewards/margins": 0.711178183555603, "rewards/rejected": -2.4188055992126465, "sft_loss": 1.7988073825836182, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 5.74765813055918, "learning_rate": 2.504251325221366e-06, "logits/chosen": -0.00038936137570999563, "logits/rejected": 0.23388922214508057, "logps/chosen": -1.734164834022522, "logps/rejected": -2.2840001583099365, "loss": 0.7582, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.734164834022522, "rewards/margins": 0.5498352646827698, "rewards/rejected": -2.2840001583099365, "sft_loss": 1.7423995733261108, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 6.630794066105069, "learning_rate": 2.500775890505668e-06, "logits/chosen": -0.14226017892360687, "logits/rejected": 0.04709906131029129, "logps/chosen": -1.6069732904434204, "logps/rejected": -2.14501690864563, "loss": 0.7097, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6069732904434204, "rewards/margins": 0.5380436182022095, "rewards/rejected": -2.14501690864563, "sft_loss": 1.659594178199768, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 5.908499740183493, "learning_rate": 2.497290746257891e-06, "logits/chosen": -0.05062161758542061, "logits/rejected": 0.07945103198289871, "logps/chosen": -1.5502407550811768, "logps/rejected": -2.061922073364258, "loss": 0.7352, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5502407550811768, "rewards/margins": 0.5116813778877258, "rewards/rejected": -2.061922073364258, "sft_loss": 1.616845726966858, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 5.940899205934954, "learning_rate": 2.49379592629092e-06, "logits/chosen": -0.09226035326719284, "logits/rejected": 0.03236633166670799, "logps/chosen": -1.425779938697815, "logps/rejected": -2.10670804977417, "loss": 0.647, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.425779938697815, "rewards/margins": 0.6809282302856445, "rewards/rejected": -2.10670804977417, "sft_loss": 1.4918690919876099, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 9.575546027123126, "learning_rate": 2.4902914645115135e-06, "logits/chosen": -0.24684646725654602, "logits/rejected": 0.06735511124134064, "logps/chosen": -1.6334794759750366, "logps/rejected": -2.2642669677734375, "loss": 0.6957, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6334794759750366, "rewards/margins": 0.6307875514030457, "rewards/rejected": -2.2642669677734375, "sft_loss": 1.6950395107269287, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 9.858340363293193, "learning_rate": 2.4867773949199748e-06, "logits/chosen": -0.17375081777572632, "logits/rejected": 0.04624699801206589, "logps/chosen": -1.605721116065979, "logps/rejected": -2.3855223655700684, "loss": 0.6545, "rewards/accuracies": 0.75, "rewards/chosen": -1.605721116065979, "rewards/margins": 0.7798011302947998, "rewards/rejected": -2.3855223655700684, "sft_loss": 1.6923332214355469, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 11.33617656132988, "learning_rate": 2.483253751609823e-06, "logits/chosen": -0.18652328848838806, "logits/rejected": 0.10901524871587753, "logps/chosen": -1.7739999294281006, "logps/rejected": -2.652946949005127, "loss": 0.6709, "rewards/accuracies": 0.75, "rewards/chosen": -1.7739999294281006, "rewards/margins": 0.8789469599723816, "rewards/rejected": -2.652946949005127, "sft_loss": 1.8353183269500732, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 9.74986680394346, "learning_rate": 2.4797205687674608e-06, "logits/chosen": -0.1306121051311493, "logits/rejected": 0.02134639583528042, "logps/chosen": -1.7014802694320679, "logps/rejected": -2.6165051460266113, "loss": 0.6604, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7014802694320679, "rewards/margins": 0.9150252342224121, "rewards/rejected": -2.6165051460266113, "sft_loss": 1.692243218421936, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 5.831908928059212, "learning_rate": 2.476177880671843e-06, "logits/chosen": -0.2056030035018921, "logits/rejected": 0.04198329523205757, "logps/chosen": -1.7714817523956299, "logps/rejected": -2.8920578956604004, "loss": 0.6578, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7714817523956299, "rewards/margins": 1.12057626247406, "rewards/rejected": -2.8920578956604004, "sft_loss": 1.8091742992401123, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 7.788069753673046, "learning_rate": 2.4726257216941463e-06, "logits/chosen": -0.04350770264863968, "logits/rejected": 0.2956755757331848, "logps/chosen": -1.7248331308364868, "logps/rejected": -2.539656162261963, "loss": 0.6874, "rewards/accuracies": 0.75, "rewards/chosen": -1.7248331308364868, "rewards/margins": 0.8148230314254761, "rewards/rejected": -2.539656162261963, "sft_loss": 1.768866777420044, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 8.523253347431723, "learning_rate": 2.4690641262974317e-06, "logits/chosen": -0.04888535290956497, "logits/rejected": 0.06253420561552048, "logps/chosen": -1.5181758403778076, "logps/rejected": -2.1794214248657227, "loss": 0.6798, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5181758403778076, "rewards/margins": 0.6612456440925598, "rewards/rejected": -2.1794214248657227, "sft_loss": 1.5654313564300537, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 7.259280654329731, "learning_rate": 2.4654931290363135e-06, "logits/chosen": 0.0067944610491395, "logits/rejected": 0.055947817862033844, "logps/chosen": -1.548032522201538, "logps/rejected": -2.189936876296997, "loss": 0.6885, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.548032522201538, "rewards/margins": 0.6419044733047485, "rewards/rejected": -2.189936876296997, "sft_loss": 1.6117805242538452, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 6.715053666512424, "learning_rate": 2.461912764556623e-06, "logits/chosen": 0.05709873512387276, "logits/rejected": 0.1856270730495453, "logps/chosen": -1.5187469720840454, "logps/rejected": -2.3932878971099854, "loss": 0.6485, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5187469720840454, "rewards/margins": 0.8745408058166504, "rewards/rejected": -2.3932878971099854, "sft_loss": 1.5913379192352295, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 6.017921568556773, "learning_rate": 2.4583230675950717e-06, "logits/chosen": -0.05736943334341049, "logits/rejected": 0.16763488948345184, "logps/chosen": -1.5151335000991821, "logps/rejected": -2.175865888595581, "loss": 0.6878, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5151335000991821, "rewards/margins": 0.6607326865196228, "rewards/rejected": -2.175865888595581, "sft_loss": 1.5635812282562256, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 6.6961536875898275, "learning_rate": 2.4547240729789156e-06, "logits/chosen": 0.06216694042086601, "logits/rejected": 0.18044105172157288, "logps/chosen": -1.4949309825897217, "logps/rejected": -2.222695827484131, "loss": 0.6665, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4949309825897217, "rewards/margins": 0.7277650833129883, "rewards/rejected": -2.222695827484131, "sft_loss": 1.5416090488433838, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 8.87357434364449, "learning_rate": 2.451115815625617e-06, "logits/chosen": 0.10345292091369629, "logits/rejected": 0.2729285657405853, "logps/chosen": -1.700848937034607, "logps/rejected": -2.4436535835266113, "loss": 0.7246, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.700848937034607, "rewards/margins": 0.7428046464920044, "rewards/rejected": -2.4436535835266113, "sft_loss": 1.7029168605804443, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 8.704491288742465, "learning_rate": 2.4474983305425025e-06, "logits/chosen": -0.024038607254624367, "logits/rejected": 0.22523216903209686, "logps/chosen": -1.666006326675415, "logps/rejected": -2.2513585090637207, "loss": 0.7235, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.666006326675415, "rewards/margins": 0.58535236120224, "rewards/rejected": -2.2513585090637207, "sft_loss": 1.6220099925994873, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 8.784010034526936, "learning_rate": 2.4438716528264307e-06, "logits/chosen": -0.11897747218608856, "logits/rejected": 0.03994118422269821, "logps/chosen": -1.6286365985870361, "logps/rejected": -2.2337546348571777, "loss": 0.6894, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6286365985870361, "rewards/margins": 0.605117917060852, "rewards/rejected": -2.2337546348571777, "sft_loss": 1.6159461736679077, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 10.821060177471066, "learning_rate": 2.440235817663443e-06, "logits/chosen": 0.013584012165665627, "logits/rejected": 0.22464172542095184, "logps/chosen": -1.5461862087249756, "logps/rejected": -2.386045455932617, "loss": 0.6557, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5461862087249756, "rewards/margins": 0.8398593068122864, "rewards/rejected": -2.386045455932617, "sft_loss": 1.5868008136749268, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 13.482773873106167, "learning_rate": 2.4365908603284285e-06, "logits/chosen": -0.14218154549598694, "logits/rejected": 0.0756453424692154, "logps/chosen": -1.697597861289978, "logps/rejected": -2.438603162765503, "loss": 0.7358, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.697597861289978, "rewards/margins": 0.7410050630569458, "rewards/rejected": -2.438603162765503, "sft_loss": 1.698129415512085, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.37108588218688965, "eval_logits/rejected": 0.5037340521812439, "eval_logps/chosen": -1.6992080211639404, "eval_logps/rejected": -2.333348035812378, "eval_loss": 0.7325444221496582, "eval_rewards/accuracies": 0.6824925541877747, "eval_rewards/chosen": -1.6992080211639404, "eval_rewards/margins": 0.6341398358345032, "eval_rewards/rejected": -2.333348035812378, "eval_runtime": 44.5266, "eval_samples_per_second": 30.207, "eval_sft_loss": 1.6954635381698608, "eval_steps_per_second": 7.569, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 7.605727648934152, "learning_rate": 2.4329368161847796e-06, "logits/chosen": -0.07299210876226425, "logits/rejected": 0.017824098467826843, "logps/chosen": -1.6549274921417236, "logps/rejected": -2.272775888442993, "loss": 0.7452, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6549274921417236, "rewards/margins": 0.6178484559059143, "rewards/rejected": -2.272775888442993, "sft_loss": 1.7160135507583618, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 9.74872118532032, "learning_rate": 2.4292737206840483e-06, "logits/chosen": -0.0690990686416626, "logits/rejected": 0.09985318779945374, "logps/chosen": -1.5765421390533447, "logps/rejected": -2.249239444732666, "loss": 0.6883, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5765421390533447, "rewards/margins": 0.6726974248886108, "rewards/rejected": -2.249239444732666, "sft_loss": 1.6347439289093018, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 8.904622520925692, "learning_rate": 2.4256016093656035e-06, "logits/chosen": -0.05548501014709473, "logits/rejected": 0.13582788407802582, "logps/chosen": -1.6699409484863281, "logps/rejected": -2.430842876434326, "loss": 0.6711, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6699409484863281, "rewards/margins": 0.7609016299247742, "rewards/rejected": -2.430842876434326, "sft_loss": 1.666124701499939, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 9.258511250573624, "learning_rate": 2.421920517856285e-06, "logits/chosen": -0.24321499466896057, "logits/rejected": 0.07653270661830902, "logps/chosen": -1.772458791732788, "logps/rejected": -2.632817506790161, "loss": 0.6476, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.772458791732788, "rewards/margins": 0.8603585958480835, "rewards/rejected": -2.632817506790161, "sft_loss": 1.785118818283081, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 8.845285976389446, "learning_rate": 2.418230481870058e-06, "logits/chosen": -0.09658778458833694, "logits/rejected": 0.12872019410133362, "logps/chosen": -1.7949399948120117, "logps/rejected": -2.6743321418762207, "loss": 0.6967, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7949399948120117, "rewards/margins": 0.8793922662734985, "rewards/rejected": -2.6743321418762207, "sft_loss": 1.900602102279663, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 6.76917907946159, "learning_rate": 2.41453153720767e-06, "logits/chosen": -0.10698221623897552, "logits/rejected": -0.07826292514801025, "logps/chosen": -1.686453104019165, "logps/rejected": -2.3061447143554688, "loss": 0.7118, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.686453104019165, "rewards/margins": 0.6196914911270142, "rewards/rejected": -2.3061447143554688, "sft_loss": 1.7430803775787354, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 14.528060052297567, "learning_rate": 2.4108237197562963e-06, "logits/chosen": -0.15176725387573242, "logits/rejected": 0.0883040651679039, "logps/chosen": -1.708646535873413, "logps/rejected": -2.4836783409118652, "loss": 0.6979, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.708646535873413, "rewards/margins": 0.7750316858291626, "rewards/rejected": -2.4836783409118652, "sft_loss": 1.7798616886138916, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 67.9308800768942, "learning_rate": 2.407107065489199e-06, "logits/chosen": -0.177690327167511, "logits/rejected": -0.0974874347448349, "logps/chosen": -1.7028429508209229, "logps/rejected": -2.4875786304473877, "loss": 0.7027, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7028429508209229, "rewards/margins": 0.7847355604171753, "rewards/rejected": -2.4875786304473877, "sft_loss": 1.746551513671875, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 9.402312106224876, "learning_rate": 2.403381610465374e-06, "logits/chosen": -0.12390387058258057, "logits/rejected": -0.05445731431245804, "logps/chosen": -1.6808083057403564, "logps/rejected": -2.3618881702423096, "loss": 0.6685, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6808083057403564, "rewards/margins": 0.6810798645019531, "rewards/rejected": -2.3618881702423096, "sft_loss": 1.643741250038147, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 8.707169869649329, "learning_rate": 2.3996473908292017e-06, "logits/chosen": -0.3125987946987152, "logits/rejected": -0.17020562291145325, "logps/chosen": -1.5637733936309814, "logps/rejected": -2.0839359760284424, "loss": 0.7291, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5637733936309814, "rewards/margins": 0.5201625823974609, "rewards/rejected": -2.0839359760284424, "sft_loss": 1.6524698734283447, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 8.344695734884622, "learning_rate": 2.3959044428100985e-06, "logits/chosen": -0.30680757761001587, "logits/rejected": -0.17020884156227112, "logps/chosen": -1.5836832523345947, "logps/rejected": -2.2736763954162598, "loss": 0.6871, "rewards/accuracies": 0.75, "rewards/chosen": -1.5836832523345947, "rewards/margins": 0.6899932622909546, "rewards/rejected": -2.2736763954162598, "sft_loss": 1.6809183359146118, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 8.356682337941303, "learning_rate": 2.392152802722162e-06, "logits/chosen": -0.3003597855567932, "logits/rejected": -0.26015612483024597, "logps/chosen": -1.6600055694580078, "logps/rejected": -2.41102933883667, "loss": 0.6863, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6600055694580078, "rewards/margins": 0.7510236501693726, "rewards/rejected": -2.41102933883667, "sft_loss": 1.7412002086639404, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 7.936263142727307, "learning_rate": 2.38839250696382e-06, "logits/chosen": -0.39390525221824646, "logits/rejected": -0.2608678936958313, "logps/chosen": -1.6280196905136108, "logps/rejected": -2.410038709640503, "loss": 0.6648, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6280196905136108, "rewards/margins": 0.7820190787315369, "rewards/rejected": -2.410038709640503, "sft_loss": 1.6248575448989868, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 7.693840717242871, "learning_rate": 2.3846235920174794e-06, "logits/chosen": -0.38650065660476685, "logits/rejected": -0.23895108699798584, "logps/chosen": -1.5643054246902466, "logps/rejected": -2.4510586261749268, "loss": 0.6087, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5643054246902466, "rewards/margins": 0.886753261089325, "rewards/rejected": -2.4510586261749268, "sft_loss": 1.6124528646469116, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 11.677231302491558, "learning_rate": 2.380846094449169e-06, "logits/chosen": -0.43027549982070923, "logits/rejected": -0.3176945447921753, "logps/chosen": -1.7545477151870728, "logps/rejected": -2.565573215484619, "loss": 0.684, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7545477151870728, "rewards/margins": 0.8110259175300598, "rewards/rejected": -2.565573215484619, "sft_loss": 1.8772166967391968, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 8.634788691396096, "learning_rate": 2.3770600509081872e-06, "logits/chosen": -0.48604917526245117, "logits/rejected": -0.2944543957710266, "logps/chosen": -1.668334722518921, "logps/rejected": -2.3512473106384277, "loss": 0.6855, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.668334722518921, "rewards/margins": 0.6829127073287964, "rewards/rejected": -2.3512473106384277, "sft_loss": 1.7718915939331055, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 13.019395453877044, "learning_rate": 2.373265498126745e-06, "logits/chosen": -0.44035762548446655, "logits/rejected": -0.33537784218788147, "logps/chosen": -1.7430732250213623, "logps/rejected": -2.501746654510498, "loss": 0.6909, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7430732250213623, "rewards/margins": 0.7586732506752014, "rewards/rejected": -2.501746654510498, "sft_loss": 1.801311731338501, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 7.660138579765188, "learning_rate": 2.36946247291961e-06, "logits/chosen": -0.46764713525772095, "logits/rejected": -0.480417400598526, "logps/chosen": -1.7001616954803467, "logps/rejected": -2.311063051223755, "loss": 0.7318, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7001616954803467, "rewards/margins": 0.6109012365341187, "rewards/rejected": -2.311063051223755, "sft_loss": 1.8168401718139648, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 9.935148956573007, "learning_rate": 2.3656510121837492e-06, "logits/chosen": -0.474683940410614, "logits/rejected": -0.3218488097190857, "logps/chosen": -1.7965023517608643, "logps/rejected": -2.385735034942627, "loss": 0.7291, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7965023517608643, "rewards/margins": 0.5892330408096313, "rewards/rejected": -2.385735034942627, "sft_loss": 1.8091661930084229, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 11.440049691581173, "learning_rate": 2.3618311528979717e-06, "logits/chosen": -0.33886945247650146, "logits/rejected": -0.2977084219455719, "logps/chosen": -1.750841498374939, "logps/rejected": -2.3571510314941406, "loss": 0.7186, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.750841498374939, "rewards/margins": 0.6063095331192017, "rewards/rejected": -2.3571510314941406, "sft_loss": 1.7378523349761963, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 14.193161191328313, "learning_rate": 2.3580029321225692e-06, "logits/chosen": -0.32112354040145874, "logits/rejected": -0.17997024953365326, "logps/chosen": -1.6737353801727295, "logps/rejected": -2.603375196456909, "loss": 0.6354, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6737353801727295, "rewards/margins": 0.9296396374702454, "rewards/rejected": -2.603375196456909, "sft_loss": 1.660453200340271, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 6.997715864560877, "learning_rate": 2.354166386998956e-06, "logits/chosen": -0.4019525945186615, "logits/rejected": -0.23506435751914978, "logps/chosen": -1.623871088027954, "logps/rejected": -2.5584957599639893, "loss": 0.6729, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.623871088027954, "rewards/margins": 0.9346246719360352, "rewards/rejected": -2.5584957599639893, "sft_loss": 1.688677430152893, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 9.19616365742752, "learning_rate": 2.3503215547493097e-06, "logits/chosen": -0.2555497884750366, "logits/rejected": -0.21385636925697327, "logps/chosen": -1.6596953868865967, "logps/rejected": -2.4315128326416016, "loss": 0.7161, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6596953868865967, "rewards/margins": 0.7718173861503601, "rewards/rejected": -2.4315128326416016, "sft_loss": 1.7026008367538452, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 10.297571955288168, "learning_rate": 2.3464684726762104e-06, "logits/chosen": -0.3922201991081238, "logits/rejected": -0.36036157608032227, "logps/chosen": -1.6120132207870483, "logps/rejected": -2.2474522590637207, "loss": 0.7029, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6120132207870483, "rewards/margins": 0.6354392170906067, "rewards/rejected": -2.2474522590637207, "sft_loss": 1.6690866947174072, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 6.7377464975333625, "learning_rate": 2.342607178162276e-06, "logits/chosen": -0.37472158670425415, "logits/rejected": -0.29021531343460083, "logps/chosen": -1.583504319190979, "logps/rejected": -2.4073872566223145, "loss": 0.6434, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.583504319190979, "rewards/margins": 0.823883056640625, "rewards/rejected": -2.4073872566223145, "sft_loss": 1.5988887548446655, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 11.760038410167141, "learning_rate": 2.338737708669804e-06, "logits/chosen": -0.323408305644989, "logits/rejected": -0.0538100004196167, "logps/chosen": -1.6298744678497314, "logps/rejected": -2.435746669769287, "loss": 0.6673, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6298744678497314, "rewards/margins": 0.8058720827102661, "rewards/rejected": -2.435746669769287, "sft_loss": 1.6865326166152954, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 8.472135920026245, "learning_rate": 2.334860101740404e-06, "logits/chosen": -0.3944898545742035, "logits/rejected": -0.21247251331806183, "logps/chosen": -1.6303770542144775, "logps/rejected": -2.5265893936157227, "loss": 0.6389, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6303770542144775, "rewards/margins": 0.8962124586105347, "rewards/rejected": -2.5265893936157227, "sft_loss": 1.6558125019073486, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 12.639854069616208, "learning_rate": 2.330974394994635e-06, "logits/chosen": -0.4155654311180115, "logits/rejected": -0.2664666771888733, "logps/chosen": -1.7755460739135742, "logps/rejected": -2.537376880645752, "loss": 0.7204, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7755460739135742, "rewards/margins": 0.7618308067321777, "rewards/rejected": -2.537376880645752, "sft_loss": 1.8307090997695923, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 8.78714107660372, "learning_rate": 2.327080626131641e-06, "logits/chosen": -0.4042625427246094, "logits/rejected": -0.3261919915676117, "logps/chosen": -1.6089550256729126, "logps/rejected": -2.6261086463928223, "loss": 0.6509, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6089550256729126, "rewards/margins": 1.0171538591384888, "rewards/rejected": -2.6261086463928223, "sft_loss": 1.7027835845947266, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 7.584058614904452, "learning_rate": 2.3231788329287855e-06, "logits/chosen": -0.45290595293045044, "logits/rejected": -0.39946696162223816, "logps/chosen": -1.7527068853378296, "logps/rejected": -2.5772976875305176, "loss": 0.6657, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7527068853378296, "rewards/margins": 0.8245910406112671, "rewards/rejected": -2.5772976875305176, "sft_loss": 1.7863349914550781, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 10.128455616245532, "learning_rate": 2.3192690532412827e-06, "logits/chosen": -0.3807825744152069, "logits/rejected": -0.29959550499916077, "logps/chosen": -1.7997996807098389, "logps/rejected": -2.4800992012023926, "loss": 0.7069, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7997996807098389, "rewards/margins": 0.680299699306488, "rewards/rejected": -2.4800992012023926, "sft_loss": 1.8867028951644897, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 12.687593297925929, "learning_rate": 2.315351325001832e-06, "logits/chosen": -0.4369320273399353, "logits/rejected": -0.34756121039390564, "logps/chosen": -1.7206451892852783, "logps/rejected": -2.6387972831726074, "loss": 0.6661, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7206451892852783, "rewards/margins": 0.9181524515151978, "rewards/rejected": -2.6387972831726074, "sft_loss": 1.7650476694107056, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 9.336298496804801, "learning_rate": 2.3114256862202495e-06, "logits/chosen": -0.3852900564670563, "logits/rejected": -0.19119954109191895, "logps/chosen": -1.6125036478042603, "logps/rejected": -2.5013768672943115, "loss": 0.6474, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6125036478042603, "rewards/margins": 0.888873279094696, "rewards/rejected": -2.5013768672943115, "sft_loss": 1.68994140625, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 6.085677306048667, "learning_rate": 2.3074921749831013e-06, "logits/chosen": -0.33006614446640015, "logits/rejected": -0.14145353436470032, "logps/chosen": -1.5735447406768799, "logps/rejected": -2.319394826889038, "loss": 0.6759, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5735447406768799, "rewards/margins": 0.7458503246307373, "rewards/rejected": -2.319394826889038, "sft_loss": 1.598747968673706, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 7.322613120246413, "learning_rate": 2.30355082945333e-06, "logits/chosen": -0.32234734296798706, "logits/rejected": -0.10814448446035385, "logps/chosen": -1.6050329208374023, "logps/rejected": -2.175632953643799, "loss": 0.6955, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6050329208374023, "rewards/margins": 0.5705999135971069, "rewards/rejected": -2.175632953643799, "sft_loss": 1.65104079246521, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 6.753212414597791, "learning_rate": 2.2996016878698866e-06, "logits/chosen": -0.37758100032806396, "logits/rejected": -0.2867276072502136, "logps/chosen": -1.5658433437347412, "logps/rejected": -2.327582597732544, "loss": 0.6572, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5658433437347412, "rewards/margins": 0.7617393732070923, "rewards/rejected": -2.327582597732544, "sft_loss": 1.6486505270004272, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 9.635485572560357, "learning_rate": 2.2956447885473607e-06, "logits/chosen": -0.3054151237010956, "logits/rejected": -0.12504413723945618, "logps/chosen": -1.6871017217636108, "logps/rejected": -2.3588860034942627, "loss": 0.6751, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6871017217636108, "rewards/margins": 0.6717841029167175, "rewards/rejected": -2.3588860034942627, "sft_loss": 1.712669014930725, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 8.445965479118785, "learning_rate": 2.2916801698756063e-06, "logits/chosen": -0.20240584015846252, "logits/rejected": -0.16871638596057892, "logps/chosen": -1.7465060949325562, "logps/rejected": -2.494576930999756, "loss": 0.7111, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7465060949325562, "rewards/margins": 0.7480708360671997, "rewards/rejected": -2.494576930999756, "sft_loss": 1.8895080089569092, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 10.299835270794993, "learning_rate": 2.287707870319372e-06, "logits/chosen": -0.30202627182006836, "logits/rejected": -0.18142291903495789, "logps/chosen": -1.794844627380371, "logps/rejected": -2.659106969833374, "loss": 0.7158, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.794844627380371, "rewards/margins": 0.8642624616622925, "rewards/rejected": -2.659106969833374, "sft_loss": 1.8789043426513672, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 7.726045950177308, "learning_rate": 2.283727928417925e-06, "logits/chosen": -0.3212895095348358, "logits/rejected": -0.3393111824989319, "logps/chosen": -1.6944576501846313, "logps/rejected": -2.492414951324463, "loss": 0.6955, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6944576501846313, "rewards/margins": 0.7979571223258972, "rewards/rejected": -2.492414951324463, "sft_loss": 1.7866626977920532, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 7.864280883375943, "learning_rate": 2.27974038278468e-06, "logits/chosen": -0.3887438178062439, "logits/rejected": -0.14263615012168884, "logps/chosen": -1.5852651596069336, "logps/rejected": -2.2923355102539062, "loss": 0.6687, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5852651596069336, "rewards/margins": 0.707070529460907, "rewards/rejected": -2.2923355102539062, "sft_loss": 1.6131317615509033, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 8.543298981272972, "learning_rate": 2.2757452721068206e-06, "logits/chosen": -0.34999722242355347, "logits/rejected": -0.22460785508155823, "logps/chosen": -1.4379886388778687, "logps/rejected": -2.2692742347717285, "loss": 0.6365, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4379886388778687, "rewards/margins": 0.8312854766845703, "rewards/rejected": -2.2692742347717285, "sft_loss": 1.5093804597854614, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 9.561382786924911, "learning_rate": 2.2717426351449294e-06, "logits/chosen": -0.32528331875801086, "logits/rejected": -0.219674751162529, "logps/chosen": -1.7008552551269531, "logps/rejected": -2.5366382598876953, "loss": 0.6654, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7008552551269531, "rewards/margins": 0.8357831239700317, "rewards/rejected": -2.5366382598876953, "sft_loss": 1.6650569438934326, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 9.799822625464461, "learning_rate": 2.2677325107326067e-06, "logits/chosen": -0.4135669767856598, "logits/rejected": -0.2664993405342102, "logps/chosen": -1.5439913272857666, "logps/rejected": -2.251284122467041, "loss": 0.7167, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5439913272857666, "rewards/margins": 0.7072926759719849, "rewards/rejected": -2.251284122467041, "sft_loss": 1.6164175271987915, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 10.801150636874329, "learning_rate": 2.2637149377760985e-06, "logits/chosen": -0.4365982413291931, "logits/rejected": -0.16107413172721863, "logps/chosen": -1.523956537246704, "logps/rejected": -2.4188663959503174, "loss": 0.6255, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.523956537246704, "rewards/margins": 0.894909679889679, "rewards/rejected": -2.4188663959503174, "sft_loss": 1.5979201793670654, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 12.468546658158331, "learning_rate": 2.2596899552539136e-06, "logits/chosen": -0.43537306785583496, "logits/rejected": -0.26023873686790466, "logps/chosen": -1.6386340856552124, "logps/rejected": -2.6180009841918945, "loss": 0.662, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6386340856552124, "rewards/margins": 0.9793673753738403, "rewards/rejected": -2.6180009841918945, "sft_loss": 1.6425418853759766, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 8.992344025907165, "learning_rate": 2.2556576022164516e-06, "logits/chosen": -0.4005703926086426, "logits/rejected": -0.16512002050876617, "logps/chosen": -1.6079756021499634, "logps/rejected": -2.5010671615600586, "loss": 0.649, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6079756021499634, "rewards/margins": 0.8930916786193848, "rewards/rejected": -2.5010671615600586, "sft_loss": 1.6396543979644775, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 6.610130419520371, "learning_rate": 2.2516179177856182e-06, "logits/chosen": -0.3925136625766754, "logits/rejected": -0.18449433147907257, "logps/chosen": -1.6953229904174805, "logps/rejected": -2.4890971183776855, "loss": 0.6671, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6953229904174805, "rewards/margins": 0.7937743067741394, "rewards/rejected": -2.4890971183776855, "sft_loss": 1.7729851007461548, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 8.240460161632697, "learning_rate": 2.2475709411544503e-06, "logits/chosen": -0.30860430002212524, "logits/rejected": -0.2732142508029938, "logps/chosen": -1.6084105968475342, "logps/rejected": -2.398030996322632, "loss": 0.6585, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6084105968475342, "rewards/margins": 0.7896206378936768, "rewards/rejected": -2.398030996322632, "sft_loss": 1.6830050945281982, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 10.591418712449052, "learning_rate": 2.2435167115867325e-06, "logits/chosen": -0.22040650248527527, "logits/rejected": -0.18510958552360535, "logps/chosen": -1.6661109924316406, "logps/rejected": -2.5022969245910645, "loss": 0.6552, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6661109924316406, "rewards/margins": 0.8361861109733582, "rewards/rejected": -2.5022969245910645, "sft_loss": 1.700455665588379, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 10.77881916958613, "learning_rate": 2.239455268416618e-06, "logits/chosen": -0.3600291609764099, "logits/rejected": -0.2691943049430847, "logps/chosen": -1.7021760940551758, "logps/rejected": -2.4420957565307617, "loss": 0.7208, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7021760940551758, "rewards/margins": 0.7399194240570068, "rewards/rejected": -2.4420957565307617, "sft_loss": 1.7061984539031982, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 9.660888461433778, "learning_rate": 2.2353866510482463e-06, "logits/chosen": -0.2715977430343628, "logits/rejected": -0.2946680188179016, "logps/chosen": -1.6520048379898071, "logps/rejected": -2.2550692558288574, "loss": 0.7035, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6520048379898071, "rewards/margins": 0.603064239025116, "rewards/rejected": -2.2550692558288574, "sft_loss": 1.6675376892089844, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 7.99149684732698, "learning_rate": 2.231310898955361e-06, "logits/chosen": -0.3659631609916687, "logits/rejected": -0.27866271138191223, "logps/chosen": -1.6420866250991821, "logps/rejected": -2.4726719856262207, "loss": 0.6717, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6420866250991821, "rewards/margins": 0.8305851221084595, "rewards/rejected": -2.4726719856262207, "sft_loss": 1.739808440208435, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 11.614010206633175, "learning_rate": 2.2272280516809262e-06, "logits/chosen": -0.46790918707847595, "logits/rejected": -0.265636146068573, "logps/chosen": -1.6137558221817017, "logps/rejected": -2.491194248199463, "loss": 0.6536, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6137558221817017, "rewards/margins": 0.8774384260177612, "rewards/rejected": -2.491194248199463, "sft_loss": 1.6100658178329468, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 11.325637763794075, "learning_rate": 2.2231381488367447e-06, "logits/chosen": -0.3395164906978607, "logits/rejected": -0.18752549588680267, "logps/chosen": -1.6160014867782593, "logps/rejected": -2.5542397499084473, "loss": 0.6362, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6160014867782593, "rewards/margins": 0.9382384419441223, "rewards/rejected": -2.5542397499084473, "sft_loss": 1.6488006114959717, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 8.49763007403377, "learning_rate": 2.2190412301030717e-06, "logits/chosen": -0.4595802426338196, "logits/rejected": -0.24137744307518005, "logps/chosen": -1.5575565099716187, "logps/rejected": -2.3483619689941406, "loss": 0.6619, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5575565099716187, "rewards/margins": 0.7908056378364563, "rewards/rejected": -2.3483619689941406, "sft_loss": 1.6235309839248657, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 6.357494880268698, "learning_rate": 2.2149373352282307e-06, "logits/chosen": -0.4294605255126953, "logits/rejected": -0.20918670296669006, "logps/chosen": -1.781286597251892, "logps/rejected": -2.775089740753174, "loss": 0.641, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.781286597251892, "rewards/margins": 0.9938033819198608, "rewards/rejected": -2.775089740753174, "sft_loss": 1.7867755889892578, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 8.736802708009721, "learning_rate": 2.2108265040282275e-06, "logits/chosen": -0.5402215719223022, "logits/rejected": -0.37220120429992676, "logps/chosen": -1.6326793432235718, "logps/rejected": -2.4899330139160156, "loss": 0.6923, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6326793432235718, "rewards/margins": 0.8572534322738647, "rewards/rejected": -2.4899330139160156, "sft_loss": 1.692399263381958, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 8.117762434913253, "learning_rate": 2.2067087763863644e-06, "logits/chosen": -0.4696386456489563, "logits/rejected": -0.37050861120224, "logps/chosen": -1.6940395832061768, "logps/rejected": -2.565840721130371, "loss": 0.6976, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6940395832061768, "rewards/margins": 0.8718010783195496, "rewards/rejected": -2.565840721130371, "sft_loss": 1.8267343044281006, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 12.180098640706655, "learning_rate": 2.202584192252854e-06, "logits/chosen": -0.42436861991882324, "logits/rejected": -0.29395556449890137, "logps/chosen": -1.6638511419296265, "logps/rejected": -2.3967151641845703, "loss": 0.7281, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6638511419296265, "rewards/margins": 0.7328639626502991, "rewards/rejected": -2.3967151641845703, "sft_loss": 1.7181631326675415, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 8.704967823405475, "learning_rate": 2.1984527916444283e-06, "logits/chosen": -0.4253556728363037, "logits/rejected": -0.28073593974113464, "logps/chosen": -1.7703256607055664, "logps/rejected": -2.572084426879883, "loss": 0.6992, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7703256607055664, "rewards/margins": 0.8017589449882507, "rewards/rejected": -2.572084426879883, "sft_loss": 1.7371975183486938, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 10.179021449194039, "learning_rate": 2.1943146146439557e-06, "logits/chosen": -0.35127347707748413, "logits/rejected": -0.03087422251701355, "logps/chosen": -1.6630665063858032, "logps/rejected": -2.5277600288391113, "loss": 0.6639, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6630665063858032, "rewards/margins": 0.8646937608718872, "rewards/rejected": -2.5277600288391113, "sft_loss": 1.6645368337631226, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 9.438073157043084, "learning_rate": 2.190169701400046e-06, "logits/chosen": -0.40794649720191956, "logits/rejected": -0.1953791379928589, "logps/chosen": -1.686069130897522, "logps/rejected": -2.6183743476867676, "loss": 0.649, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.686069130897522, "rewards/margins": 0.9323051571846008, "rewards/rejected": -2.6183743476867676, "sft_loss": 1.7290821075439453, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 7.470935986197104, "learning_rate": 2.186018092126666e-06, "logits/chosen": -0.2606565058231354, "logits/rejected": -0.22562718391418457, "logps/chosen": -1.6814906597137451, "logps/rejected": -2.4694440364837646, "loss": 0.6754, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6814906597137451, "rewards/margins": 0.7879533767700195, "rewards/rejected": -2.4694440364837646, "sft_loss": 1.7424710988998413, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 6.658630183259569, "learning_rate": 2.181859827102748e-06, "logits/chosen": -0.2056102305650711, "logits/rejected": -0.11757795512676239, "logps/chosen": -1.7347863912582397, "logps/rejected": -2.6536970138549805, "loss": 0.6324, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7347863912582397, "rewards/margins": 0.9189106225967407, "rewards/rejected": -2.6536970138549805, "sft_loss": 1.7204700708389282, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 9.669308852371636, "learning_rate": 2.1776949466717967e-06, "logits/chosen": -0.4139643609523773, "logits/rejected": -0.27617889642715454, "logps/chosen": -1.7208669185638428, "logps/rejected": -2.599398612976074, "loss": 0.6994, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7208669185638428, "rewards/margins": 0.8785317540168762, "rewards/rejected": -2.599398612976074, "sft_loss": 1.7972770929336548, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 9.718597987579729, "learning_rate": 2.1735234912415007e-06, "logits/chosen": -0.25888964533805847, "logits/rejected": -0.17799128592014313, "logps/chosen": -1.749355673789978, "logps/rejected": -2.605766534805298, "loss": 0.6546, "rewards/accuracies": 0.8125, "rewards/chosen": -1.749355673789978, "rewards/margins": 0.8564105033874512, "rewards/rejected": -2.605766534805298, "sft_loss": 1.746603012084961, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 9.033444917435077, "learning_rate": 2.1693455012833388e-06, "logits/chosen": -0.4220157563686371, "logits/rejected": -0.1659930944442749, "logps/chosen": -1.6804784536361694, "logps/rejected": -2.5973219871520996, "loss": 0.6568, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6804784536361694, "rewards/margins": 0.9168437123298645, "rewards/rejected": -2.5973219871520996, "sft_loss": 1.7155706882476807, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 10.432645490776679, "learning_rate": 2.1651610173321877e-06, "logits/chosen": -0.31514835357666016, "logits/rejected": -0.1127225011587143, "logps/chosen": -1.734143853187561, "logps/rejected": -2.6728904247283936, "loss": 0.6587, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.734143853187561, "rewards/margins": 0.9387462735176086, "rewards/rejected": -2.6728904247283936, "sft_loss": 1.746917963027954, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 8.798045041122512, "learning_rate": 2.1609700799859287e-06, "logits/chosen": -0.2637856602668762, "logits/rejected": -0.07604242861270905, "logps/chosen": -1.7247288227081299, "logps/rejected": -2.5924761295318604, "loss": 0.6642, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7247288227081299, "rewards/margins": 0.8677471280097961, "rewards/rejected": -2.5924761295318604, "sft_loss": 1.7420657873153687, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 12.482508611552822, "learning_rate": 2.1567727299050555e-06, "logits/chosen": -0.18308880925178528, "logits/rejected": -0.01936594396829605, "logps/chosen": -1.6722066402435303, "logps/rejected": -2.7605667114257812, "loss": 0.6528, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6722066402435303, "rewards/margins": 1.0883598327636719, "rewards/rejected": -2.7605667114257812, "sft_loss": 1.7239611148834229, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 10.514418161646946, "learning_rate": 2.152569007812276e-06, "logits/chosen": -0.25235840678215027, "logits/rejected": -0.04004128649830818, "logps/chosen": -1.748984932899475, "logps/rejected": -2.873537063598633, "loss": 0.6379, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.748984932899475, "rewards/margins": 1.1245522499084473, "rewards/rejected": -2.873537063598633, "sft_loss": 1.8425092697143555, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 9.25848867545407, "learning_rate": 2.1483589544921202e-06, "logits/chosen": -0.1993493139743805, "logits/rejected": 0.022964054718613625, "logps/chosen": -1.7990487813949585, "logps/rejected": -2.8367419242858887, "loss": 0.6619, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7990487813949585, "rewards/margins": 1.0376932621002197, "rewards/rejected": -2.8367419242858887, "sft_loss": 1.8404858112335205, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 10.107413557567574, "learning_rate": 2.144142610790545e-06, "logits/chosen": -0.17361178994178772, "logits/rejected": 0.04688585549592972, "logps/chosen": -1.7829437255859375, "logps/rejected": -2.7042758464813232, "loss": 0.652, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7829437255859375, "rewards/margins": 0.9213320016860962, "rewards/rejected": -2.7042758464813232, "sft_loss": 1.8419139385223389, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 9.000121174768116, "learning_rate": 2.1399200176145344e-06, "logits/chosen": -0.2919735908508301, "logits/rejected": -0.03339837118983269, "logps/chosen": -1.6188087463378906, "logps/rejected": -2.5424160957336426, "loss": 0.6578, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6188087463378906, "rewards/margins": 0.9236071705818176, "rewards/rejected": -2.5424160957336426, "sft_loss": 1.657747507095337, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 9.970070470489274, "learning_rate": 2.1356912159317067e-06, "logits/chosen": -0.20595316588878632, "logits/rejected": 0.05183644965291023, "logps/chosen": -1.8907188177108765, "logps/rejected": -2.9290881156921387, "loss": 0.6846, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8907188177108765, "rewards/margins": 1.0383695363998413, "rewards/rejected": -2.9290881156921387, "sft_loss": 1.9077237844467163, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 9.730449938501128, "learning_rate": 2.1314562467699133e-06, "logits/chosen": -0.10945296287536621, "logits/rejected": 0.033916451036930084, "logps/chosen": -1.73921799659729, "logps/rejected": -2.6286468505859375, "loss": 0.6592, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.73921799659729, "rewards/margins": 0.8894292116165161, "rewards/rejected": -2.6286468505859375, "sft_loss": 1.737277626991272, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 8.618815475427098, "learning_rate": 2.1272151512168453e-06, "logits/chosen": -0.09236228466033936, "logits/rejected": -0.020017240196466446, "logps/chosen": -1.6293678283691406, "logps/rejected": -2.670914888381958, "loss": 0.6069, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6293678283691406, "rewards/margins": 1.041547179222107, "rewards/rejected": -2.670914888381958, "sft_loss": 1.6871839761734009, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 8.556501943617222, "learning_rate": 2.122967970419629e-06, "logits/chosen": -0.3508361876010895, "logits/rejected": -0.13678878545761108, "logps/chosen": -1.6169979572296143, "logps/rejected": -2.42988920211792, "loss": 0.6512, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6169979572296143, "rewards/margins": 0.8128914833068848, "rewards/rejected": -2.42988920211792, "sft_loss": 1.6969658136367798, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 13.939047832399783, "learning_rate": 2.118714745584431e-06, "logits/chosen": -0.17858004570007324, "logits/rejected": 0.028571343049407005, "logps/chosen": -1.6750202178955078, "logps/rejected": -2.4766621589660645, "loss": 0.6698, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6750202178955078, "rewards/margins": 0.801642119884491, "rewards/rejected": -2.4766621589660645, "sft_loss": 1.7185461521148682, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.3708411455154419, "eval_logits/rejected": 0.5028015375137329, "eval_logps/chosen": -1.8658210039138794, "eval_logps/rejected": -2.657525062561035, "eval_loss": 0.7332260012626648, "eval_rewards/accuracies": 0.7017804384231567, "eval_rewards/chosen": -1.8658210039138794, "eval_rewards/margins": 0.7917039394378662, "eval_rewards/rejected": -2.657525062561035, "eval_runtime": 44.466, "eval_samples_per_second": 30.248, "eval_sft_loss": 1.8307745456695557, "eval_steps_per_second": 7.579, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 7.013331295730143, "learning_rate": 2.1144555179760582e-06, "logits/chosen": -0.09877729415893555, "logits/rejected": 0.0729081779718399, "logps/chosen": -1.7698421478271484, "logps/rejected": -2.887197971343994, "loss": 0.647, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7698421478271484, "rewards/margins": 1.1173558235168457, "rewards/rejected": -2.887197971343994, "sft_loss": 1.81331467628479, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 11.39958801066816, "learning_rate": 2.110190328917555e-06, "logits/chosen": -0.23239700496196747, "logits/rejected": 0.026508072391152382, "logps/chosen": -1.7946094274520874, "logps/rejected": -2.501302480697632, "loss": 0.7146, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7946094274520874, "rewards/margins": 0.706693172454834, "rewards/rejected": -2.501302480697632, "sft_loss": 1.8153241872787476, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 5.811504402152971, "learning_rate": 2.1059192197898044e-06, "logits/chosen": -0.10666403919458389, "logits/rejected": 0.0190547164529562, "logps/chosen": -1.6683290004730225, "logps/rejected": -2.7554128170013428, "loss": 0.6516, "rewards/accuracies": 0.75, "rewards/chosen": -1.6683290004730225, "rewards/margins": 1.0870840549468994, "rewards/rejected": -2.7554128170013428, "sft_loss": 1.7128015756607056, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 9.912255314604257, "learning_rate": 2.1016422320311257e-06, "logits/chosen": -0.2023269683122635, "logits/rejected": -0.04736703261733055, "logps/chosen": -1.7052671909332275, "logps/rejected": -2.5506842136383057, "loss": 0.6552, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7052671909332275, "rewards/margins": 0.8454171419143677, "rewards/rejected": -2.5506842136383057, "sft_loss": 1.7814832925796509, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 7.379974362789548, "learning_rate": 2.097359407136873e-06, "logits/chosen": -0.18297527730464935, "logits/rejected": -0.0790904313325882, "logps/chosen": -1.5330262184143066, "logps/rejected": -2.1949267387390137, "loss": 0.6696, "rewards/accuracies": 0.75, "rewards/chosen": -1.5330262184143066, "rewards/margins": 0.6619004011154175, "rewards/rejected": -2.1949267387390137, "sft_loss": 1.6252708435058594, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 8.932389676446226, "learning_rate": 2.093070786659033e-06, "logits/chosen": -0.17055395245552063, "logits/rejected": -0.08287496864795685, "logps/chosen": -1.6713578701019287, "logps/rejected": -2.418421983718872, "loss": 0.689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6713578701019287, "rewards/margins": 0.7470639944076538, "rewards/rejected": -2.418421983718872, "sft_loss": 1.7103633880615234, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 6.242947917759311, "learning_rate": 2.0887764122058195e-06, "logits/chosen": -0.199593186378479, "logits/rejected": -0.01157109159976244, "logps/chosen": -1.6459394693374634, "logps/rejected": -2.19657039642334, "loss": 0.7567, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6459394693374634, "rewards/margins": 0.5506308078765869, "rewards/rejected": -2.19657039642334, "sft_loss": 1.6807940006256104, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 7.125673957478325, "learning_rate": 2.084476325441272e-06, "logits/chosen": -0.25697240233421326, "logits/rejected": -0.1042226105928421, "logps/chosen": -1.5503084659576416, "logps/rejected": -2.366608142852783, "loss": 0.6555, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5503084659576416, "rewards/margins": 0.8162997364997864, "rewards/rejected": -2.366608142852783, "sft_loss": 1.5341964960098267, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 10.415414680809302, "learning_rate": 2.0801705680848523e-06, "logits/chosen": -0.2940608859062195, "logits/rejected": -0.10844705253839493, "logps/chosen": -1.6265678405761719, "logps/rejected": -2.294830799102783, "loss": 0.6989, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6265678405761719, "rewards/margins": 0.668262779712677, "rewards/rejected": -2.294830799102783, "sft_loss": 1.6152546405792236, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 11.225044180641651, "learning_rate": 2.0758591819110364e-06, "logits/chosen": -0.27031898498535156, "logits/rejected": -0.07324796915054321, "logps/chosen": -1.5835988521575928, "logps/rejected": -2.4217069149017334, "loss": 0.65, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5835988521575928, "rewards/margins": 0.8381081819534302, "rewards/rejected": -2.4217069149017334, "sft_loss": 1.6001770496368408, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 8.44842759276969, "learning_rate": 2.071542208748912e-06, "logits/chosen": -0.3392409682273865, "logits/rejected": -0.014614325948059559, "logps/chosen": -1.6830085515975952, "logps/rejected": -2.501263380050659, "loss": 0.67, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6830085515975952, "rewards/margins": 0.8182545900344849, "rewards/rejected": -2.501263380050659, "sft_loss": 1.7062549591064453, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 15.463460340085286, "learning_rate": 2.0672196904817715e-06, "logits/chosen": -0.24148209393024445, "logits/rejected": -0.09284202754497528, "logps/chosen": -1.7609403133392334, "logps/rejected": -2.5735113620758057, "loss": 0.7034, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7609403133392334, "rewards/margins": 0.8125707507133484, "rewards/rejected": -2.5735113620758057, "sft_loss": 1.7544368505477905, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 7.50603733440921, "learning_rate": 2.0628916690467066e-06, "logits/chosen": -0.2833019196987152, "logits/rejected": -0.17756600677967072, "logps/chosen": -1.7068138122558594, "logps/rejected": -2.726548671722412, "loss": 0.6412, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7068138122558594, "rewards/margins": 1.019735336303711, "rewards/rejected": -2.726548671722412, "sft_loss": 1.6803268194198608, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 10.466797247061209, "learning_rate": 2.0585581864341995e-06, "logits/chosen": -0.4477129578590393, "logits/rejected": -0.2852556109428406, "logps/chosen": -1.763668417930603, "logps/rejected": -2.5296313762664795, "loss": 0.7053, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.763668417930603, "rewards/margins": 0.7659630179405212, "rewards/rejected": -2.5296313762664795, "sft_loss": 1.833539605140686, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 13.883944751015369, "learning_rate": 2.0542192846877177e-06, "logits/chosen": -0.2172568291425705, "logits/rejected": -0.15765947103500366, "logps/chosen": -1.688741683959961, "logps/rejected": -2.4696295261383057, "loss": 0.6615, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.688741683959961, "rewards/margins": 0.7808881402015686, "rewards/rejected": -2.4696295261383057, "sft_loss": 1.7033189535140991, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 7.087008884341495, "learning_rate": 2.049875005903305e-06, "logits/chosen": -0.36735719442367554, "logits/rejected": -0.13228847086429596, "logps/chosen": -1.671006441116333, "logps/rejected": -2.6411962509155273, "loss": 0.6407, "rewards/accuracies": 0.78125, "rewards/chosen": -1.671006441116333, "rewards/margins": 0.9701893925666809, "rewards/rejected": -2.6411962509155273, "sft_loss": 1.818447470664978, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 10.870812188582518, "learning_rate": 2.045525392229174e-06, "logits/chosen": -0.10764901340007782, "logits/rejected": 0.10250584781169891, "logps/chosen": -1.6554419994354248, "logps/rejected": -2.5783419609069824, "loss": 0.6931, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6554419994354248, "rewards/margins": 0.9229000806808472, "rewards/rejected": -2.5783419609069824, "sft_loss": 1.7975515127182007, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 12.510974976874456, "learning_rate": 2.0411704858652946e-06, "logits/chosen": -0.24468784034252167, "logits/rejected": -0.1647200733423233, "logps/chosen": -1.6226732730865479, "logps/rejected": -2.521732807159424, "loss": 0.6373, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6226732730865479, "rewards/margins": 0.8990596532821655, "rewards/rejected": -2.521732807159424, "sft_loss": 1.6786104440689087, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 10.37553722653345, "learning_rate": 2.0368103290629877e-06, "logits/chosen": -0.14447571337223053, "logits/rejected": -0.11947256326675415, "logps/chosen": -1.6237910985946655, "logps/rejected": -2.3520302772521973, "loss": 0.6834, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6237910985946655, "rewards/margins": 0.7282392382621765, "rewards/rejected": -2.3520302772521973, "sft_loss": 1.6659843921661377, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 12.08940901319854, "learning_rate": 2.0324449641245145e-06, "logits/chosen": -0.13080765306949615, "logits/rejected": 0.09991051256656647, "logps/chosen": -1.6093822717666626, "logps/rejected": -2.3164443969726562, "loss": 0.6842, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6093822717666626, "rewards/margins": 0.7070624232292175, "rewards/rejected": -2.3164443969726562, "sft_loss": 1.6584736108779907, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 6.952355678407748, "learning_rate": 2.028074433402664e-06, "logits/chosen": -0.14452217519283295, "logits/rejected": 0.09375777095556259, "logps/chosen": -1.6005226373672485, "logps/rejected": -2.555150270462036, "loss": 0.6405, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6005226373672485, "rewards/margins": 0.9546276926994324, "rewards/rejected": -2.555150270462036, "sft_loss": 1.6105680465698242, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 17.396140250961892, "learning_rate": 2.023698779300344e-06, "logits/chosen": -0.21792694926261902, "logits/rejected": -0.025183891877532005, "logps/chosen": -1.6324084997177124, "logps/rejected": -2.5301878452301025, "loss": 0.6377, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6324084997177124, "rewards/margins": 0.8977789878845215, "rewards/rejected": -2.5301878452301025, "sft_loss": 1.7066516876220703, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 7.09240283401546, "learning_rate": 2.019318044270171e-06, "logits/chosen": -0.10281024128198624, "logits/rejected": 0.037469811737537384, "logps/chosen": -1.694016695022583, "logps/rejected": -2.766887664794922, "loss": 0.6299, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.694016695022583, "rewards/margins": 1.0728710889816284, "rewards/rejected": -2.766887664794922, "sft_loss": 1.7665046453475952, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 7.659323922479531, "learning_rate": 2.0149322708140545e-06, "logits/chosen": -0.19877466559410095, "logits/rejected": -0.08290411531925201, "logps/chosen": -1.8053525686264038, "logps/rejected": -2.6758005619049072, "loss": 0.6598, "rewards/accuracies": 0.75, "rewards/chosen": -1.8053525686264038, "rewards/margins": 0.8704478144645691, "rewards/rejected": -2.6758005619049072, "sft_loss": 1.7328903675079346, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 16.288994541979754, "learning_rate": 2.0105415014827886e-06, "logits/chosen": -0.22056837379932404, "logits/rejected": -0.09690554440021515, "logps/chosen": -1.9371941089630127, "logps/rejected": -3.0349159240722656, "loss": 0.693, "rewards/accuracies": 0.75, "rewards/chosen": -1.9371941089630127, "rewards/margins": 1.0977216958999634, "rewards/rejected": -3.0349159240722656, "sft_loss": 2.0370259284973145, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 10.201814213064157, "learning_rate": 2.006145778875636e-06, "logits/chosen": -0.1919160783290863, "logits/rejected": -0.1108192577958107, "logps/chosen": -1.7822622060775757, "logps/rejected": -2.6225311756134033, "loss": 0.7171, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7822622060775757, "rewards/margins": 0.8402689695358276, "rewards/rejected": -2.6225311756134033, "sft_loss": 1.843174934387207, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 9.233419850296873, "learning_rate": 2.0017451456399165e-06, "logits/chosen": -0.28379935026168823, "logits/rejected": -0.08224531263113022, "logps/chosen": -1.7444589138031006, "logps/rejected": -2.6347079277038574, "loss": 0.6493, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7444589138031006, "rewards/margins": 0.8902491331100464, "rewards/rejected": -2.6347079277038574, "sft_loss": 1.7515875101089478, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 12.342674686825728, "learning_rate": 1.9973396444705934e-06, "logits/chosen": -0.16968822479248047, "logits/rejected": 0.08616310358047485, "logps/chosen": -1.7560157775878906, "logps/rejected": -2.532700300216675, "loss": 0.696, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7560157775878906, "rewards/margins": 0.7766846418380737, "rewards/rejected": -2.532700300216675, "sft_loss": 1.8065052032470703, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 11.152899998213138, "learning_rate": 1.9929293181098588e-06, "logits/chosen": -0.19315733015537262, "logits/rejected": 0.08167880773544312, "logps/chosen": -1.709506630897522, "logps/rejected": -2.686140298843384, "loss": 0.6498, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.709506630897522, "rewards/margins": 0.9766336679458618, "rewards/rejected": -2.686140298843384, "sft_loss": 1.7340914011001587, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 10.255858787571785, "learning_rate": 1.988514209346718e-06, "logits/chosen": -0.19811835885047913, "logits/rejected": 0.03873931244015694, "logps/chosen": -1.832733392715454, "logps/rejected": -2.5946521759033203, "loss": 0.7137, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.832733392715454, "rewards/margins": 0.7619189023971558, "rewards/rejected": -2.5946521759033203, "sft_loss": 1.8450162410736084, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 15.489292910590374, "learning_rate": 1.984094361016575e-06, "logits/chosen": -0.10327938944101334, "logits/rejected": 0.0570363774895668, "logps/chosen": -1.7769912481307983, "logps/rejected": -2.718773603439331, "loss": 0.7235, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7769912481307983, "rewards/margins": 0.9417825937271118, "rewards/rejected": -2.718773603439331, "sft_loss": 1.8660234212875366, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 11.174376711520368, "learning_rate": 1.9796698160008187e-06, "logits/chosen": -0.14350536465644836, "logits/rejected": 0.033099908381700516, "logps/chosen": -1.7975308895111084, "logps/rejected": -2.6718554496765137, "loss": 0.6903, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7975308895111084, "rewards/margins": 0.8743244409561157, "rewards/rejected": -2.6718554496765137, "sft_loss": 1.814619779586792, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 11.23257514840121, "learning_rate": 1.975240617226404e-06, "logits/chosen": -0.18500396609306335, "logits/rejected": 0.023821836337447166, "logps/chosen": -1.7192474603652954, "logps/rejected": -2.6152572631835938, "loss": 0.6758, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7192474603652954, "rewards/margins": 0.8960098028182983, "rewards/rejected": -2.6152572631835938, "sft_loss": 1.811990737915039, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 6.369737713360051, "learning_rate": 1.9708068076654364e-06, "logits/chosen": -0.05014703422784805, "logits/rejected": 0.04246297478675842, "logps/chosen": -1.6130955219268799, "logps/rejected": -2.40543794631958, "loss": 0.6577, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6130955219268799, "rewards/margins": 0.792342483997345, "rewards/rejected": -2.40543794631958, "sft_loss": 1.639413595199585, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 7.0527777236925795, "learning_rate": 1.966368430334756e-06, "logits/chosen": -0.18567633628845215, "logits/rejected": 0.03269173949956894, "logps/chosen": -1.575241208076477, "logps/rejected": -2.3695342540740967, "loss": 0.6552, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.575241208076477, "rewards/margins": 0.7942931056022644, "rewards/rejected": -2.3695342540740967, "sft_loss": 1.6236913204193115, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 8.006785224860513, "learning_rate": 1.961925528295519e-06, "logits/chosen": -0.0933329164981842, "logits/rejected": 0.025968652218580246, "logps/chosen": -1.6438376903533936, "logps/rejected": -2.2044715881347656, "loss": 0.7073, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6438376903533936, "rewards/margins": 0.5606337785720825, "rewards/rejected": -2.2044715881347656, "sft_loss": 1.7238785028457642, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 8.934485275229912, "learning_rate": 1.9574781446527806e-06, "logits/chosen": 0.0503157377243042, "logits/rejected": 0.274901419878006, "logps/chosen": -1.5226280689239502, "logps/rejected": -2.387205123901367, "loss": 0.6122, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5226280689239502, "rewards/margins": 0.8645769953727722, "rewards/rejected": -2.387205123901367, "sft_loss": 1.559669852256775, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 9.143236160600896, "learning_rate": 1.9530263225550765e-06, "logits/chosen": -0.08170856535434723, "logits/rejected": 0.09695029258728027, "logps/chosen": -1.5636448860168457, "logps/rejected": -2.303366184234619, "loss": 0.6796, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5636448860168457, "rewards/margins": 0.7397211790084839, "rewards/rejected": -2.303366184234619, "sft_loss": 1.6848373413085938, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 7.935835067471001, "learning_rate": 1.9485701051940037e-06, "logits/chosen": -0.05064685270190239, "logits/rejected": 0.035549163818359375, "logps/chosen": -1.5907132625579834, "logps/rejected": -2.247149705886841, "loss": 0.6933, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5907132625579834, "rewards/margins": 0.6564362645149231, "rewards/rejected": -2.247149705886841, "sft_loss": 1.6183847188949585, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 13.587108491088857, "learning_rate": 1.9441095358038035e-06, "logits/chosen": 0.04309100657701492, "logits/rejected": 0.22402247786521912, "logps/chosen": -1.639139175415039, "logps/rejected": -2.354151964187622, "loss": 0.6767, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.639139175415039, "rewards/margins": 0.7150126695632935, "rewards/rejected": -2.354151964187622, "sft_loss": 1.6775423288345337, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 8.460133838125511, "learning_rate": 1.9396446576609387e-06, "logits/chosen": 0.08298873901367188, "logits/rejected": 0.17587396502494812, "logps/chosen": -1.6868808269500732, "logps/rejected": -2.4492154121398926, "loss": 0.6663, "rewards/accuracies": 0.75, "rewards/chosen": -1.6868808269500732, "rewards/margins": 0.7623344659805298, "rewards/rejected": -2.4492154121398926, "sft_loss": 1.7489010095596313, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 15.705647021024319, "learning_rate": 1.935175514083677e-06, "logits/chosen": 0.12293801456689835, "logits/rejected": 0.19598381221294403, "logps/chosen": -1.7710424661636353, "logps/rejected": -2.66403865814209, "loss": 0.7171, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7710424661636353, "rewards/margins": 0.8929961919784546, "rewards/rejected": -2.66403865814209, "sft_loss": 1.8463138341903687, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 15.344801126200535, "learning_rate": 1.9307021484316693e-06, "logits/chosen": -0.021174585446715355, "logits/rejected": 0.21488766372203827, "logps/chosen": -1.6551218032836914, "logps/rejected": -2.5742876529693604, "loss": 0.6726, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6551218032836914, "rewards/margins": 0.9191659688949585, "rewards/rejected": -2.5742876529693604, "sft_loss": 1.7234073877334595, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 10.968618406303506, "learning_rate": 1.926224604105529e-06, "logits/chosen": 0.048258017748594284, "logits/rejected": 0.034669678658246994, "logps/chosen": -1.6769109964370728, "logps/rejected": -2.3088769912719727, "loss": 0.7337, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6769109964370728, "rewards/margins": 0.6319661736488342, "rewards/rejected": -2.3088769912719727, "sft_loss": 1.7413244247436523, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 13.51222024151536, "learning_rate": 1.92174292454641e-06, "logits/chosen": 0.0023163154255598783, "logits/rejected": 0.2284691035747528, "logps/chosen": -1.6658554077148438, "logps/rejected": -2.547213554382324, "loss": 0.6688, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6658554077148438, "rewards/margins": 0.8813580274581909, "rewards/rejected": -2.547213554382324, "sft_loss": 1.6388963460922241, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 7.5136948275784965, "learning_rate": 1.917257153235587e-06, "logits/chosen": -0.14208294451236725, "logits/rejected": 0.1281939297914505, "logps/chosen": -1.6982141733169556, "logps/rejected": -2.5021536350250244, "loss": 0.6809, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6982141733169556, "rewards/margins": 0.8039396405220032, "rewards/rejected": -2.5021536350250244, "sft_loss": 1.706189751625061, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 12.495512298700909, "learning_rate": 1.9127673336940335e-06, "logits/chosen": -0.06291162967681885, "logits/rejected": 0.0830179750919342, "logps/chosen": -1.616612434387207, "logps/rejected": -2.4798736572265625, "loss": 0.6679, "rewards/accuracies": 0.75, "rewards/chosen": -1.616612434387207, "rewards/margins": 0.8632608652114868, "rewards/rejected": -2.4798736572265625, "sft_loss": 1.6792612075805664, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 8.40465437215762, "learning_rate": 1.908273509481998e-06, "logits/chosen": 0.00954020582139492, "logits/rejected": 0.1751098483800888, "logps/chosen": -1.7633380889892578, "logps/rejected": -2.5841474533081055, "loss": 0.6787, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7633380889892578, "rewards/margins": 0.8208094835281372, "rewards/rejected": -2.5841474533081055, "sft_loss": 1.7911789417266846, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 11.217731923889714, "learning_rate": 1.9037757241985832e-06, "logits/chosen": -0.03795923292636871, "logits/rejected": 0.07577961683273315, "logps/chosen": -1.6473220586776733, "logps/rejected": -2.5562381744384766, "loss": 0.6378, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6473220586776733, "rewards/margins": 0.9089161157608032, "rewards/rejected": -2.5562381744384766, "sft_loss": 1.6741434335708618, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 9.172700095556124, "learning_rate": 1.899274021481321e-06, "logits/chosen": -0.0980779379606247, "logits/rejected": 0.1461961269378662, "logps/chosen": -1.7238715887069702, "logps/rejected": -2.869298219680786, "loss": 0.6641, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7238715887069702, "rewards/margins": 1.1454265117645264, "rewards/rejected": -2.869298219680786, "sft_loss": 1.7315521240234375, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 13.165880615063447, "learning_rate": 1.8947684450057516e-06, "logits/chosen": 0.014770316891372204, "logits/rejected": 0.26207587122917175, "logps/chosen": -1.6157658100128174, "logps/rejected": -2.5762152671813965, "loss": 0.6117, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6157658100128174, "rewards/margins": 0.9604493379592896, "rewards/rejected": -2.5762152671813965, "sft_loss": 1.691689133644104, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 18.071064934386342, "learning_rate": 1.890259038484997e-06, "logits/chosen": 0.08018441498279572, "logits/rejected": 0.19620773196220398, "logps/chosen": -1.7247365713119507, "logps/rejected": -2.584254741668701, "loss": 0.7137, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7247365713119507, "rewards/margins": 0.8595183491706848, "rewards/rejected": -2.584254741668701, "sft_loss": 1.7407766580581665, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 11.9692190525311, "learning_rate": 1.8857458456693398e-06, "logits/chosen": -0.14878655970096588, "logits/rejected": 0.09677322208881378, "logps/chosen": -1.712206244468689, "logps/rejected": -2.5729756355285645, "loss": 0.6599, "rewards/accuracies": 0.8125, "rewards/chosen": -1.712206244468689, "rewards/margins": 0.8607694506645203, "rewards/rejected": -2.5729756355285645, "sft_loss": 1.8201481103897095, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 10.683630966053455, "learning_rate": 1.881228910345796e-06, "logits/chosen": 0.02798542007803917, "logits/rejected": 0.19053414463996887, "logps/chosen": -1.7972015142440796, "logps/rejected": -2.5500593185424805, "loss": 0.686, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7972015142440796, "rewards/margins": 0.7528579235076904, "rewards/rejected": -2.5500593185424805, "sft_loss": 1.8112598657608032, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 14.183303772598652, "learning_rate": 1.8767082763376916e-06, "logits/chosen": -0.0879894495010376, "logits/rejected": 0.17549023032188416, "logps/chosen": -1.79047429561615, "logps/rejected": -2.5765414237976074, "loss": 0.7087, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.79047429561615, "rewards/margins": 0.7860671281814575, "rewards/rejected": -2.5765414237976074, "sft_loss": 1.6728007793426514, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 19.600034784025812, "learning_rate": 1.8721839875042386e-06, "logits/chosen": -0.04705687612295151, "logits/rejected": 0.2245139628648758, "logps/chosen": -1.707383155822754, "logps/rejected": -2.55816650390625, "loss": 0.6819, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.707383155822754, "rewards/margins": 0.8507832288742065, "rewards/rejected": -2.55816650390625, "sft_loss": 1.7719299793243408, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 9.769090087528488, "learning_rate": 1.8676560877401062e-06, "logits/chosen": -0.0794854611158371, "logits/rejected": 0.29655495285987854, "logps/chosen": -1.6657311916351318, "logps/rejected": -2.5681443214416504, "loss": 0.6365, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6657311916351318, "rewards/margins": 0.9024130702018738, "rewards/rejected": -2.5681443214416504, "sft_loss": 1.7102835178375244, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 17.409084631580033, "learning_rate": 1.8631246209749982e-06, "logits/chosen": -0.1895194798707962, "logits/rejected": 0.25256744027137756, "logps/chosen": -1.6916043758392334, "logps/rejected": -2.622830390930176, "loss": 0.6574, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6916043758392334, "rewards/margins": 0.9312260746955872, "rewards/rejected": -2.622830390930176, "sft_loss": 1.6841598749160767, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 10.378353695060042, "learning_rate": 1.8585896311732247e-06, "logits/chosen": 0.15478946268558502, "logits/rejected": 0.17653344571590424, "logps/chosen": -1.6762475967407227, "logps/rejected": -2.5309839248657227, "loss": 0.6824, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6762475967407227, "rewards/margins": 0.8547362089157104, "rewards/rejected": -2.5309839248657227, "sft_loss": 1.6760101318359375, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 12.61956310328868, "learning_rate": 1.854051162333277e-06, "logits/chosen": 0.02709706500172615, "logits/rejected": 0.3328257203102112, "logps/chosen": -1.6858981847763062, "logps/rejected": -2.506181478500366, "loss": 0.6767, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6858981847763062, "rewards/margins": 0.8202834129333496, "rewards/rejected": -2.506181478500366, "sft_loss": 1.7573429346084595, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 7.289511410035405, "learning_rate": 1.8495092584873992e-06, "logits/chosen": -0.05549658462405205, "logits/rejected": 0.3597896695137024, "logps/chosen": -1.5350788831710815, "logps/rejected": -2.65586519241333, "loss": 0.5958, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5350788831710815, "rewards/margins": 1.1207860708236694, "rewards/rejected": -2.65586519241333, "sft_loss": 1.5353453159332275, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 9.031671007744759, "learning_rate": 1.844963963701163e-06, "logits/chosen": 0.15731653571128845, "logits/rejected": 0.17976097762584686, "logps/chosen": -1.6493396759033203, "logps/rejected": -2.5455853939056396, "loss": 0.6425, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6493396759033203, "rewards/margins": 0.8962458372116089, "rewards/rejected": -2.5455853939056396, "sft_loss": 1.652875542640686, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 7.0803131127486, "learning_rate": 1.8404153220730383e-06, "logits/chosen": -0.05579300969839096, "logits/rejected": 0.12802644073963165, "logps/chosen": -1.57602059841156, "logps/rejected": -2.395585775375366, "loss": 0.6724, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.57602059841156, "rewards/margins": 0.8195652961730957, "rewards/rejected": -2.395585775375366, "sft_loss": 1.6549571752548218, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 9.39056548534232, "learning_rate": 1.8358633777339654e-06, "logits/chosen": 0.07542654126882553, "logits/rejected": 0.2509918808937073, "logps/chosen": -1.641409158706665, "logps/rejected": -2.3671250343322754, "loss": 0.6663, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.641409158706665, "rewards/margins": 0.725716233253479, "rewards/rejected": -2.3671250343322754, "sft_loss": 1.6149475574493408, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 8.775922496769965, "learning_rate": 1.831308174846929e-06, "logits/chosen": 0.039566848427057266, "logits/rejected": 0.22721309959888458, "logps/chosen": -1.639269232749939, "logps/rejected": -2.531994104385376, "loss": 0.6459, "rewards/accuracies": 0.78125, "rewards/chosen": -1.639269232749939, "rewards/margins": 0.892724871635437, "rewards/rejected": -2.531994104385376, "sft_loss": 1.6838048696517944, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 11.858878928388116, "learning_rate": 1.826749757606527e-06, "logits/chosen": 0.053937554359436035, "logits/rejected": 0.36660197377204895, "logps/chosen": -1.6890392303466797, "logps/rejected": -2.7090961933135986, "loss": 0.6462, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6890392303466797, "rewards/margins": 1.0200568437576294, "rewards/rejected": -2.7090961933135986, "sft_loss": 1.7080329656600952, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 7.710103826960804, "learning_rate": 1.8221881702385435e-06, "logits/chosen": 0.023390358313918114, "logits/rejected": 0.3567085266113281, "logps/chosen": -1.600010633468628, "logps/rejected": -2.635223150253296, "loss": 0.6197, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.600010633468628, "rewards/margins": 1.0352122783660889, "rewards/rejected": -2.635223150253296, "sft_loss": 1.7037324905395508, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 14.175525032890683, "learning_rate": 1.8176234569995196e-06, "logits/chosen": 0.0604880228638649, "logits/rejected": 0.25084275007247925, "logps/chosen": -1.7276502847671509, "logps/rejected": -2.919510841369629, "loss": 0.6415, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7276502847671509, "rewards/margins": 1.1918604373931885, "rewards/rejected": -2.919510841369629, "sft_loss": 1.7756588459014893, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 9.28594869278176, "learning_rate": 1.8130556621763223e-06, "logits/chosen": -0.02487350068986416, "logits/rejected": 0.2390095293521881, "logps/chosen": -1.6136411428451538, "logps/rejected": -2.550739288330078, "loss": 0.6491, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6136411428451538, "rewards/margins": 0.9370980262756348, "rewards/rejected": -2.550739288330078, "sft_loss": 1.6545941829681396, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 8.845444980867176, "learning_rate": 1.808484830085718e-06, "logits/chosen": 0.12213625013828278, "logits/rejected": 0.3251255750656128, "logps/chosen": -1.7759946584701538, "logps/rejected": -2.85046124458313, "loss": 0.6402, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7759946584701538, "rewards/margins": 1.0744664669036865, "rewards/rejected": -2.85046124458313, "sft_loss": 1.8285300731658936, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 16.5671755062521, "learning_rate": 1.8039110050739394e-06, "logits/chosen": 0.11437705904245377, "logits/rejected": 0.31481966376304626, "logps/chosen": -1.6738694906234741, "logps/rejected": -2.713113784790039, "loss": 0.6268, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6738694906234741, "rewards/margins": 1.0392444133758545, "rewards/rejected": -2.713113784790039, "sft_loss": 1.7562519311904907, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 9.197601666045005, "learning_rate": 1.7993342315162563e-06, "logits/chosen": -0.05201379582285881, "logits/rejected": 0.29744741320610046, "logps/chosen": -1.7652403116226196, "logps/rejected": -2.9192309379577637, "loss": 0.6118, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7652403116226196, "rewards/margins": 1.1539907455444336, "rewards/rejected": -2.9192309379577637, "sft_loss": 1.8131014108657837, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 8.21473865646731, "learning_rate": 1.794754553816546e-06, "logits/chosen": 0.09198231995105743, "logits/rejected": 0.3146513104438782, "logps/chosen": -1.705095887184143, "logps/rejected": -2.66255521774292, "loss": 0.629, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.705095887184143, "rewards/margins": 0.9574591517448425, "rewards/rejected": -2.66255521774292, "sft_loss": 1.8125232458114624, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 12.512621153535388, "learning_rate": 1.7901720164068623e-06, "logits/chosen": -0.0585850365459919, "logits/rejected": 0.0636616125702858, "logps/chosen": -1.644086480140686, "logps/rejected": -2.435697078704834, "loss": 0.6884, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.644086480140686, "rewards/margins": 0.7916107177734375, "rewards/rejected": -2.435697078704834, "sft_loss": 1.699789047241211, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 12.624327577956587, "learning_rate": 1.7855866637470027e-06, "logits/chosen": 0.05995064973831177, "logits/rejected": 0.19660253822803497, "logps/chosen": -1.6978027820587158, "logps/rejected": -2.7610092163085938, "loss": 0.6434, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6978027820587158, "rewards/margins": 1.0632063150405884, "rewards/rejected": -2.7610092163085938, "sft_loss": 1.748500108718872, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 7.568036513490646, "learning_rate": 1.780998540324079e-06, "logits/chosen": 0.12871405482292175, "logits/rejected": 0.32352548837661743, "logps/chosen": -1.7610708475112915, "logps/rejected": -2.594722032546997, "loss": 0.7147, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7610708475112915, "rewards/margins": 0.833651065826416, "rewards/rejected": -2.594722032546997, "sft_loss": 1.7433416843414307, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 7.421231582880555, "learning_rate": 1.776407690652084e-06, "logits/chosen": 0.012217795476317406, "logits/rejected": 0.24228866398334503, "logps/chosen": -1.7647712230682373, "logps/rejected": -2.819415330886841, "loss": 0.6542, "rewards/accuracies": 0.75, "rewards/chosen": -1.7647712230682373, "rewards/margins": 1.054643988609314, "rewards/rejected": -2.819415330886841, "sft_loss": 1.765459418296814, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 11.134341558803175, "learning_rate": 1.7718141592714628e-06, "logits/chosen": 0.1779087483882904, "logits/rejected": 0.13158239424228668, "logps/chosen": -1.6792852878570557, "logps/rejected": -2.597214698791504, "loss": 0.7041, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6792852878570557, "rewards/margins": 0.9179295301437378, "rewards/rejected": -2.597214698791504, "sft_loss": 1.7933791875839233, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 8.557433582386375, "learning_rate": 1.7672179907486757e-06, "logits/chosen": 0.2755883038043976, "logits/rejected": 0.27128463983535767, "logps/chosen": -1.5790361166000366, "logps/rejected": -2.3961989879608154, "loss": 0.685, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5790361166000366, "rewards/margins": 0.817162811756134, "rewards/rejected": -2.3961989879608154, "sft_loss": 1.6117353439331055, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 11.627721912885242, "learning_rate": 1.7626192296757708e-06, "logits/chosen": 0.1090032309293747, "logits/rejected": 0.20505475997924805, "logps/chosen": -1.6978752613067627, "logps/rejected": -2.478768825531006, "loss": 0.6975, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6978752613067627, "rewards/margins": 0.7808934450149536, "rewards/rejected": -2.478768825531006, "sft_loss": 1.7721240520477295, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.7225221395492554, "eval_logits/rejected": 0.880984365940094, "eval_logps/chosen": -1.7409367561340332, "eval_logps/rejected": -2.4820258617401123, "eval_loss": 0.7278200387954712, "eval_rewards/accuracies": 0.68916916847229, "eval_rewards/chosen": -1.7409367561340332, "eval_rewards/margins": 0.7410891056060791, "eval_rewards/rejected": -2.4820258617401123, "eval_runtime": 44.6007, "eval_samples_per_second": 30.157, "eval_sft_loss": 1.7287054061889648, "eval_steps_per_second": 7.556, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 6.69535209611385, "learning_rate": 1.7580179206699475e-06, "logits/chosen": -0.0975351482629776, "logits/rejected": 0.19071190059185028, "logps/chosen": -1.4848353862762451, "logps/rejected": -2.3775861263275146, "loss": 0.6259, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4848353862762451, "rewards/margins": 0.8927507400512695, "rewards/rejected": -2.3775861263275146, "sft_loss": 1.5436747074127197, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 13.330593601208003, "learning_rate": 1.7534141083731262e-06, "logits/chosen": 0.13205106556415558, "logits/rejected": 0.27563416957855225, "logps/chosen": -1.6868665218353271, "logps/rejected": -2.556553363800049, "loss": 0.682, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6868665218353271, "rewards/margins": 0.8696869611740112, "rewards/rejected": -2.556553363800049, "sft_loss": 1.7678245306015015, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 8.232942043945219, "learning_rate": 1.7488078374515143e-06, "logits/chosen": 0.19379249215126038, "logits/rejected": 0.3722130060195923, "logps/chosen": -1.672188401222229, "logps/rejected": -2.7497897148132324, "loss": 0.6054, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.672188401222229, "rewards/margins": 1.0776013135910034, "rewards/rejected": -2.7497897148132324, "sft_loss": 1.6930681467056274, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 9.209473289799588, "learning_rate": 1.7441991525951722e-06, "logits/chosen": 0.056553877890110016, "logits/rejected": 0.4158708453178406, "logps/chosen": -1.71318781375885, "logps/rejected": -2.620293617248535, "loss": 0.6823, "rewards/accuracies": 0.71875, "rewards/chosen": -1.71318781375885, "rewards/margins": 0.9071057438850403, "rewards/rejected": -2.620293617248535, "sft_loss": 1.7801010608673096, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 20.748635995851046, "learning_rate": 1.7395880985175808e-06, "logits/chosen": -0.02775495871901512, "logits/rejected": 0.28328195214271545, "logps/chosen": -1.798081398010254, "logps/rejected": -2.9397964477539062, "loss": 0.6369, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.798081398010254, "rewards/margins": 1.141715168952942, "rewards/rejected": -2.9397964477539062, "sft_loss": 1.7914934158325195, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 9.5241745735613, "learning_rate": 1.7349747199552063e-06, "logits/chosen": 0.16219733655452728, "logits/rejected": 0.358007550239563, "logps/chosen": -1.6569722890853882, "logps/rejected": -2.6283373832702637, "loss": 0.634, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6569722890853882, "rewards/margins": 0.9713649749755859, "rewards/rejected": -2.6283373832702637, "sft_loss": 1.7422775030136108, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 11.82818399467519, "learning_rate": 1.7303590616670683e-06, "logits/chosen": 0.022676551714539528, "logits/rejected": 0.27947741746902466, "logps/chosen": -1.7626409530639648, "logps/rejected": -2.8061025142669678, "loss": 0.6341, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7626409530639648, "rewards/margins": 1.043461799621582, "rewards/rejected": -2.8061025142669678, "sft_loss": 1.8145999908447266, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 12.272501099504066, "learning_rate": 1.7257411684343042e-06, "logits/chosen": 0.01731787994503975, "logits/rejected": 0.1518246829509735, "logps/chosen": -1.7838151454925537, "logps/rejected": -2.5523946285247803, "loss": 0.7211, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7838151454925537, "rewards/margins": 0.768579363822937, "rewards/rejected": -2.5523946285247803, "sft_loss": 1.8348591327667236, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 8.989420292425729, "learning_rate": 1.7211210850597333e-06, "logits/chosen": 0.01916094496846199, "logits/rejected": 0.18893815577030182, "logps/chosen": -1.8312240839004517, "logps/rejected": -2.7193756103515625, "loss": 0.7274, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8312240839004517, "rewards/margins": 0.8881517648696899, "rewards/rejected": -2.7193756103515625, "sft_loss": 1.772420883178711, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 9.066222177106056, "learning_rate": 1.7164988563674256e-06, "logits/chosen": -0.017062615603208542, "logits/rejected": 0.14022260904312134, "logps/chosen": -1.7420536279678345, "logps/rejected": -2.8658266067504883, "loss": 0.6622, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7420536279678345, "rewards/margins": 1.1237730979919434, "rewards/rejected": -2.8658266067504883, "sft_loss": 1.751794457435608, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 11.820725471020838, "learning_rate": 1.7118745272022635e-06, "logits/chosen": -0.041482336819171906, "logits/rejected": 0.24770434200763702, "logps/chosen": -1.812748908996582, "logps/rejected": -2.7241785526275635, "loss": 0.6673, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.812748908996582, "rewards/margins": 0.9114295840263367, "rewards/rejected": -2.7241785526275635, "sft_loss": 1.8595969676971436, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 9.634310646961953, "learning_rate": 1.7072481424295097e-06, "logits/chosen": -0.1236807107925415, "logits/rejected": 0.18752439320087433, "logps/chosen": -1.664668083190918, "logps/rejected": -2.4004406929016113, "loss": 0.6743, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.664668083190918, "rewards/margins": 0.7357724905014038, "rewards/rejected": -2.4004406929016113, "sft_loss": 1.6978073120117188, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 9.221992342972545, "learning_rate": 1.702619746934369e-06, "logits/chosen": -0.14590224623680115, "logits/rejected": 0.09175385534763336, "logps/chosen": -1.7224514484405518, "logps/rejected": -2.482299327850342, "loss": 0.7081, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7224514484405518, "rewards/margins": 0.7598481178283691, "rewards/rejected": -2.482299327850342, "sft_loss": 1.7601163387298584, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 11.372601623636282, "learning_rate": 1.6979893856215547e-06, "logits/chosen": -0.02332393266260624, "logits/rejected": 0.1746397167444229, "logps/chosen": -1.6565109491348267, "logps/rejected": -2.316208839416504, "loss": 0.6962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6565109491348267, "rewards/margins": 0.6596980094909668, "rewards/rejected": -2.316208839416504, "sft_loss": 1.6300147771835327, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 10.141313724386336, "learning_rate": 1.6933571034148531e-06, "logits/chosen": -0.003986936993896961, "logits/rejected": 0.17912557721138, "logps/chosen": -1.6668100357055664, "logps/rejected": -2.422069787979126, "loss": 0.652, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6668100357055664, "rewards/margins": 0.75525963306427, "rewards/rejected": -2.422069787979126, "sft_loss": 1.6519018411636353, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 11.36954785563402, "learning_rate": 1.6887229452566859e-06, "logits/chosen": 0.14806845784187317, "logits/rejected": 0.31873518228530884, "logps/chosen": -1.5642637014389038, "logps/rejected": -2.630958080291748, "loss": 0.6148, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5642637014389038, "rewards/margins": 1.0666944980621338, "rewards/rejected": -2.630958080291748, "sft_loss": 1.6302076578140259, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 16.28888372734567, "learning_rate": 1.6840869561076761e-06, "logits/chosen": -0.03490322828292847, "logits/rejected": 0.1576215922832489, "logps/chosen": -1.6812372207641602, "logps/rejected": -2.6249706745147705, "loss": 0.6642, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6812372207641602, "rewards/margins": 0.9437335133552551, "rewards/rejected": -2.6249706745147705, "sft_loss": 1.7672827243804932, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 9.777297758031644, "learning_rate": 1.6794491809462108e-06, "logits/chosen": -0.05005021020770073, "logits/rejected": 0.26095980405807495, "logps/chosen": -1.705004096031189, "logps/rejected": -2.7471156120300293, "loss": 0.6291, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.705004096031189, "rewards/margins": 1.0421111583709717, "rewards/rejected": -2.7471156120300293, "sft_loss": 1.715197205543518, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 8.10937142440469, "learning_rate": 1.674809664768005e-06, "logits/chosen": -0.06505431979894638, "logits/rejected": 0.19787968695163727, "logps/chosen": -1.6374775171279907, "logps/rejected": -2.6413464546203613, "loss": 0.6291, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6374775171279907, "rewards/margins": 1.0038686990737915, "rewards/rejected": -2.6413464546203613, "sft_loss": 1.6606299877166748, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 11.073727655098075, "learning_rate": 1.6701684525856647e-06, "logits/chosen": 0.05843139812350273, "logits/rejected": 0.21685531735420227, "logps/chosen": -1.689962387084961, "logps/rejected": -2.689544439315796, "loss": 0.6346, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.689962387084961, "rewards/margins": 0.9995821118354797, "rewards/rejected": -2.689544439315796, "sft_loss": 1.7143598794937134, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 21.911674709660502, "learning_rate": 1.6655255894282515e-06, "logits/chosen": 0.13442903757095337, "logits/rejected": 0.1398596167564392, "logps/chosen": -1.7459805011749268, "logps/rejected": -2.8175759315490723, "loss": 0.6528, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7459805011749268, "rewards/margins": 1.0715951919555664, "rewards/rejected": -2.8175759315490723, "sft_loss": 1.777316689491272, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 16.44153112785128, "learning_rate": 1.6608811203408437e-06, "logits/chosen": -0.0002602711319923401, "logits/rejected": 0.17666465044021606, "logps/chosen": -1.6980969905853271, "logps/rejected": -2.5463755130767822, "loss": 0.6795, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6980969905853271, "rewards/margins": 0.8482787013053894, "rewards/rejected": -2.5463755130767822, "sft_loss": 1.7703033685684204, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 17.147443377069976, "learning_rate": 1.6562350903841002e-06, "logits/chosen": 0.09378467500209808, "logits/rejected": 0.40250760316848755, "logps/chosen": -1.775538682937622, "logps/rejected": -2.9476616382598877, "loss": 0.6534, "rewards/accuracies": 0.75, "rewards/chosen": -1.775538682937622, "rewards/margins": 1.172122836112976, "rewards/rejected": -2.9476616382598877, "sft_loss": 1.8604596853256226, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 20.611801792196648, "learning_rate": 1.651587544633825e-06, "logits/chosen": 0.048998866230249405, "logits/rejected": 0.2327841818332672, "logps/chosen": -1.7560018301010132, "logps/rejected": -2.8011298179626465, "loss": 0.6707, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7560018301010132, "rewards/margins": 1.0451281070709229, "rewards/rejected": -2.8011298179626465, "sft_loss": 1.8246772289276123, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 12.615014844831247, "learning_rate": 1.6469385281805267e-06, "logits/chosen": 0.06845826655626297, "logits/rejected": 0.2191547155380249, "logps/chosen": -1.6676260232925415, "logps/rejected": -2.61234188079834, "loss": 0.6907, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6676260232925415, "rewards/margins": 0.9447160959243774, "rewards/rejected": -2.61234188079834, "sft_loss": 1.7090753316879272, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 13.437188454899843, "learning_rate": 1.642288086128984e-06, "logits/chosen": -0.11175362765789032, "logits/rejected": 0.15200158953666687, "logps/chosen": -1.580471396446228, "logps/rejected": -2.7887420654296875, "loss": 0.64, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.580471396446228, "rewards/margins": 1.2082706689834595, "rewards/rejected": -2.7887420654296875, "sft_loss": 1.6918761730194092, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 13.49614515718196, "learning_rate": 1.6376362635978055e-06, "logits/chosen": -0.06129909306764603, "logits/rejected": 0.14012238383293152, "logps/chosen": -1.6711757183074951, "logps/rejected": -2.4994843006134033, "loss": 0.6692, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6711757183074951, "rewards/margins": 0.8283087015151978, "rewards/rejected": -2.4994843006134033, "sft_loss": 1.7221336364746094, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 9.791108225155257, "learning_rate": 1.6329831057189936e-06, "logits/chosen": -0.08597782999277115, "logits/rejected": 0.14627881348133087, "logps/chosen": -1.6221815347671509, "logps/rejected": -2.6455671787261963, "loss": 0.6517, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6221815347671509, "rewards/margins": 1.023385763168335, "rewards/rejected": -2.6455671787261963, "sft_loss": 1.6977074146270752, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 14.082836241989783, "learning_rate": 1.6283286576375069e-06, "logits/chosen": -0.06954207271337509, "logits/rejected": 0.11793907731771469, "logps/chosen": -1.6429466009140015, "logps/rejected": -2.4059674739837646, "loss": 0.678, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6429466009140015, "rewards/margins": 0.7630206942558289, "rewards/rejected": -2.4059674739837646, "sft_loss": 1.658942461013794, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 16.784536910984507, "learning_rate": 1.623672964510821e-06, "logits/chosen": -0.05539491027593613, "logits/rejected": 0.33161354064941406, "logps/chosen": -1.6232349872589111, "logps/rejected": -2.6007208824157715, "loss": 0.6308, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6232349872589111, "rewards/margins": 0.9774861335754395, "rewards/rejected": -2.6007208824157715, "sft_loss": 1.6661285161972046, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 13.008739717837097, "learning_rate": 1.6190160715084909e-06, "logits/chosen": -0.0022641464602202177, "logits/rejected": 0.18418416380882263, "logps/chosen": -1.7134850025177002, "logps/rejected": -2.587226390838623, "loss": 0.6859, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7134850025177002, "rewards/margins": 0.873741626739502, "rewards/rejected": -2.587226390838623, "sft_loss": 1.8100296258926392, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 6.973299002605783, "learning_rate": 1.6143580238117132e-06, "logits/chosen": -0.13283856213092804, "logits/rejected": 0.06323808431625366, "logps/chosen": -1.6416757106781006, "logps/rejected": -2.5051159858703613, "loss": 0.6403, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6416757106781006, "rewards/margins": 0.8634401559829712, "rewards/rejected": -2.5051159858703613, "sft_loss": 1.6961336135864258, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 9.975646896565582, "learning_rate": 1.6096988666128867e-06, "logits/chosen": -0.08490502089262009, "logits/rejected": 0.07857394218444824, "logps/chosen": -1.6207164525985718, "logps/rejected": -2.5260729789733887, "loss": 0.6522, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6207164525985718, "rewards/margins": 0.9053562879562378, "rewards/rejected": -2.5260729789733887, "sft_loss": 1.631158471107483, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 11.374496403588678, "learning_rate": 1.6050386451151753e-06, "logits/chosen": -0.11497664451599121, "logits/rejected": 0.17683936655521393, "logps/chosen": -1.729366660118103, "logps/rejected": -2.531245708465576, "loss": 0.6987, "rewards/accuracies": 0.75, "rewards/chosen": -1.729366660118103, "rewards/margins": 0.8018789291381836, "rewards/rejected": -2.531245708465576, "sft_loss": 1.7939300537109375, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 9.7540309383818, "learning_rate": 1.6003774045320686e-06, "logits/chosen": -0.052069902420043945, "logits/rejected": 0.1832505762577057, "logps/chosen": -1.6778125762939453, "logps/rejected": -2.5040578842163086, "loss": 0.6592, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6778125762939453, "rewards/margins": 0.8262453079223633, "rewards/rejected": -2.5040578842163086, "sft_loss": 1.7541424036026, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 9.26386326156349, "learning_rate": 1.5957151900869425e-06, "logits/chosen": -0.1480114758014679, "logits/rejected": 0.12639665603637695, "logps/chosen": -1.7525148391723633, "logps/rejected": -2.531358242034912, "loss": 0.6629, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7525148391723633, "rewards/margins": 0.7788435816764832, "rewards/rejected": -2.531358242034912, "sft_loss": 1.7702747583389282, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 11.504689539648137, "learning_rate": 1.5910520470126228e-06, "logits/chosen": -0.07063053548336029, "logits/rejected": 0.24231629073619843, "logps/chosen": -1.7522459030151367, "logps/rejected": -2.680307626724243, "loss": 0.6822, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7522459030151367, "rewards/margins": 0.9280616641044617, "rewards/rejected": -2.680307626724243, "sft_loss": 1.7290763854980469, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 13.933657904347603, "learning_rate": 1.5863880205509432e-06, "logits/chosen": -0.162372887134552, "logits/rejected": 0.15010753273963928, "logps/chosen": -1.6161205768585205, "logps/rejected": -2.57654070854187, "loss": 0.6338, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6161205768585205, "rewards/margins": 0.9604201316833496, "rewards/rejected": -2.57654070854187, "sft_loss": 1.6565601825714111, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 12.510482806484635, "learning_rate": 1.5817231559523097e-06, "logits/chosen": -0.03692112863063812, "logits/rejected": 0.1306377351284027, "logps/chosen": -1.7245012521743774, "logps/rejected": -2.869750499725342, "loss": 0.6361, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7245012521743774, "rewards/margins": 1.145249605178833, "rewards/rejected": -2.869750499725342, "sft_loss": 1.8045265674591064, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 6.8442139916725955, "learning_rate": 1.5770574984752582e-06, "logits/chosen": -0.10571374744176865, "logits/rejected": 0.183961883187294, "logps/chosen": -1.7865946292877197, "logps/rejected": -2.5776772499084473, "loss": 0.7157, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7865946292877197, "rewards/margins": 0.7910826802253723, "rewards/rejected": -2.5776772499084473, "sft_loss": 1.7589298486709595, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 20.445526107029675, "learning_rate": 1.5723910933860191e-06, "logits/chosen": -0.24091443419456482, "logits/rejected": 0.02261008694767952, "logps/chosen": -1.7237069606781006, "logps/rejected": -2.4907679557800293, "loss": 0.6989, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7237069606781006, "rewards/margins": 0.7670608758926392, "rewards/rejected": -2.4907679557800293, "sft_loss": 1.7233402729034424, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 12.619053572340551, "learning_rate": 1.5677239859580742e-06, "logits/chosen": -0.21961793303489685, "logits/rejected": -0.012505131773650646, "logps/chosen": -1.6292585134506226, "logps/rejected": -2.373121500015259, "loss": 0.7005, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6292585134506226, "rewards/margins": 0.7438629865646362, "rewards/rejected": -2.373121500015259, "sft_loss": 1.6691501140594482, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 11.904233827176936, "learning_rate": 1.5630562214717205e-06, "logits/chosen": 0.12450895458459854, "logits/rejected": 0.23923444747924805, "logps/chosen": -1.7282705307006836, "logps/rejected": -2.4345037937164307, "loss": 0.6853, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7282705307006836, "rewards/margins": 0.7062332630157471, "rewards/rejected": -2.4345037937164307, "sft_loss": 1.7057040929794312, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 9.554738287588028, "learning_rate": 1.5583878452136296e-06, "logits/chosen": -0.14793828129768372, "logits/rejected": 0.06307663023471832, "logps/chosen": -1.6740642786026, "logps/rejected": -2.3620121479034424, "loss": 0.6848, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6740642786026, "rewards/margins": 0.6879477500915527, "rewards/rejected": -2.3620121479034424, "sft_loss": 1.7369064092636108, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 9.06908807591873, "learning_rate": 1.5537189024764086e-06, "logits/chosen": -0.09235244989395142, "logits/rejected": 0.16227707266807556, "logps/chosen": -1.6051900386810303, "logps/rejected": -2.3181235790252686, "loss": 0.6849, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6051900386810303, "rewards/margins": 0.712933361530304, "rewards/rejected": -2.3181235790252686, "sft_loss": 1.6950048208236694, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 8.173684180423933, "learning_rate": 1.5490494385581599e-06, "logits/chosen": -0.041692376136779785, "logits/rejected": 0.19014063477516174, "logps/chosen": -1.721787452697754, "logps/rejected": -2.479731559753418, "loss": 0.69, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.721787452697754, "rewards/margins": 0.7579439878463745, "rewards/rejected": -2.479731559753418, "sft_loss": 1.7510948181152344, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 11.774384400783354, "learning_rate": 1.5443794987620433e-06, "logits/chosen": 0.039117418229579926, "logits/rejected": 0.2529299855232239, "logps/chosen": -1.6293704509735107, "logps/rejected": -2.246772289276123, "loss": 0.6823, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6293704509735107, "rewards/margins": 0.6174014806747437, "rewards/rejected": -2.246772289276123, "sft_loss": 1.6525089740753174, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 8.21281256966865, "learning_rate": 1.539709128395835e-06, "logits/chosen": -0.022275418043136597, "logits/rejected": 0.078637033700943, "logps/chosen": -1.5396145582199097, "logps/rejected": -2.624133586883545, "loss": 0.6204, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5396145582199097, "rewards/margins": 1.0845190286636353, "rewards/rejected": -2.624133586883545, "sft_loss": 1.617453932762146, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 19.586850922708145, "learning_rate": 1.5350383727714888e-06, "logits/chosen": -0.01572037860751152, "logits/rejected": 0.11370836198329926, "logps/chosen": -1.6482702493667603, "logps/rejected": -2.335761547088623, "loss": 0.7206, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6482702493667603, "rewards/margins": 0.6874914169311523, "rewards/rejected": -2.335761547088623, "sft_loss": 1.6958271265029907, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 10.801187419105617, "learning_rate": 1.5303672772046963e-06, "logits/chosen": -0.11493013054132462, "logits/rejected": 0.08882128447294235, "logps/chosen": -1.7629035711288452, "logps/rejected": -2.957852840423584, "loss": 0.5951, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7629035711288452, "rewards/margins": 1.1949495077133179, "rewards/rejected": -2.957852840423584, "sft_loss": 1.8507661819458008, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 10.842179095103713, "learning_rate": 1.525695887014447e-06, "logits/chosen": -0.14992229640483856, "logits/rejected": 0.1002429947257042, "logps/chosen": -1.8385114669799805, "logps/rejected": -2.759986639022827, "loss": 0.6714, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8385114669799805, "rewards/margins": 0.9214752912521362, "rewards/rejected": -2.759986639022827, "sft_loss": 1.8441314697265625, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 8.295916061287276, "learning_rate": 1.5210242475225896e-06, "logits/chosen": -0.07416633516550064, "logits/rejected": 0.20603366196155548, "logps/chosen": -1.7949146032333374, "logps/rejected": -2.755343198776245, "loss": 0.6977, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7949146032333374, "rewards/margins": 0.9604288935661316, "rewards/rejected": -2.755343198776245, "sft_loss": 1.8887898921966553, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 16.52306270227895, "learning_rate": 1.5163524040533903e-06, "logits/chosen": 0.057475216686725616, "logits/rejected": 0.17760422825813293, "logps/chosen": -1.803520917892456, "logps/rejected": -2.734199285507202, "loss": 0.6897, "rewards/accuracies": 0.71875, "rewards/chosen": -1.803520917892456, "rewards/margins": 0.9306782484054565, "rewards/rejected": -2.734199285507202, "sft_loss": 1.8802194595336914, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 10.33295629864258, "learning_rate": 1.5116804019330951e-06, "logits/chosen": -0.07347237318754196, "logits/rejected": 0.10067732632160187, "logps/chosen": -1.736215353012085, "logps/rejected": -2.6383612155914307, "loss": 0.6816, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.736215353012085, "rewards/margins": 0.9021456837654114, "rewards/rejected": -2.6383612155914307, "sft_loss": 1.8136123418807983, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 8.405140276522918, "learning_rate": 1.5070082864894892e-06, "logits/chosen": -0.12049313634634018, "logits/rejected": -0.0011329979170113802, "logps/chosen": -1.590718150138855, "logps/rejected": -2.4013075828552246, "loss": 0.637, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.590718150138855, "rewards/margins": 0.8105891942977905, "rewards/rejected": -2.4013075828552246, "sft_loss": 1.5965020656585693, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 8.034046778505555, "learning_rate": 1.5023361030514572e-06, "logits/chosen": -0.15744206309318542, "logits/rejected": 0.1334286630153656, "logps/chosen": -1.4672434329986572, "logps/rejected": -2.358227491378784, "loss": 0.6205, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4672434329986572, "rewards/margins": 0.890984058380127, "rewards/rejected": -2.358227491378784, "sft_loss": 1.5532448291778564, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 9.347068597730125, "learning_rate": 1.4976638969485433e-06, "logits/chosen": 0.07527122646570206, "logits/rejected": 0.10903845727443695, "logps/chosen": -1.618255615234375, "logps/rejected": -2.4998650550842285, "loss": 0.6589, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.618255615234375, "rewards/margins": 0.8816096186637878, "rewards/rejected": -2.4998650550842285, "sft_loss": 1.666682243347168, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 13.857367394301217, "learning_rate": 1.492991713510511e-06, "logits/chosen": 0.0880415141582489, "logits/rejected": 0.20547275245189667, "logps/chosen": -1.6247081756591797, "logps/rejected": -2.3860676288604736, "loss": 0.7297, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6247081756591797, "rewards/margins": 0.761359453201294, "rewards/rejected": -2.3860676288604736, "sft_loss": 1.6987422704696655, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 10.965157699832295, "learning_rate": 1.4883195980669052e-06, "logits/chosen": 0.04823315888643265, "logits/rejected": 0.30748096108436584, "logps/chosen": -1.6990811824798584, "logps/rejected": -2.5591132640838623, "loss": 0.6654, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6990811824798584, "rewards/margins": 0.8600322008132935, "rewards/rejected": -2.5591132640838623, "sft_loss": 1.6902282238006592, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 11.246003054270304, "learning_rate": 1.48364759594661e-06, "logits/chosen": -0.09684957563877106, "logits/rejected": 0.09521036595106125, "logps/chosen": -1.6307718753814697, "logps/rejected": -2.4601705074310303, "loss": 0.6574, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6307718753814697, "rewards/margins": 0.8293987512588501, "rewards/rejected": -2.4601705074310303, "sft_loss": 1.7225332260131836, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 16.764918687679643, "learning_rate": 1.4789757524774105e-06, "logits/chosen": -0.12152546644210815, "logits/rejected": 0.18736448884010315, "logps/chosen": -1.7070300579071045, "logps/rejected": -2.435864210128784, "loss": 0.7022, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7070300579071045, "rewards/margins": 0.7288340330123901, "rewards/rejected": -2.435864210128784, "sft_loss": 1.7718076705932617, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 9.033133052675327, "learning_rate": 1.474304112985553e-06, "logits/chosen": -0.023256715387105942, "logits/rejected": 0.186649352312088, "logps/chosen": -1.6628717184066772, "logps/rejected": -2.6158218383789062, "loss": 0.6279, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6628717184066772, "rewards/margins": 0.9529500007629395, "rewards/rejected": -2.6158218383789062, "sft_loss": 1.6220327615737915, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 12.038268695656548, "learning_rate": 1.469632722795304e-06, "logits/chosen": 0.042193703353405, "logits/rejected": 0.22168488800525665, "logps/chosen": -1.6942789554595947, "logps/rejected": -2.6612579822540283, "loss": 0.6318, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6942789554595947, "rewards/margins": 0.9669791460037231, "rewards/rejected": -2.6612579822540283, "sft_loss": 1.7724997997283936, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 8.128194965079711, "learning_rate": 1.4649616272285115e-06, "logits/chosen": -0.13571251928806305, "logits/rejected": 0.12857648730278015, "logps/chosen": -1.7519333362579346, "logps/rejected": -2.70003080368042, "loss": 0.6764, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7519333362579346, "rewards/margins": 0.9480972290039062, "rewards/rejected": -2.70003080368042, "sft_loss": 1.7731482982635498, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 9.886863369863011, "learning_rate": 1.4602908716041651e-06, "logits/chosen": -0.0866248831152916, "logits/rejected": 0.12165629863739014, "logps/chosen": -1.9829012155532837, "logps/rejected": -2.9655168056488037, "loss": 0.6768, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9829012155532837, "rewards/margins": 0.9826154708862305, "rewards/rejected": -2.9655168056488037, "sft_loss": 1.8548479080200195, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 10.09112548492932, "learning_rate": 1.4556205012379568e-06, "logits/chosen": -0.02071903459727764, "logits/rejected": 0.2770634889602661, "logps/chosen": -1.8720626831054688, "logps/rejected": -2.761143922805786, "loss": 0.6717, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8720626831054688, "rewards/margins": 0.8890812993049622, "rewards/rejected": -2.761143922805786, "sft_loss": 1.9153035879135132, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 13.63668991969161, "learning_rate": 1.4509505614418402e-06, "logits/chosen": 0.001990280346944928, "logits/rejected": 0.155452698469162, "logps/chosen": -1.950551986694336, "logps/rejected": -2.8099985122680664, "loss": 0.6996, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.950551986694336, "rewards/margins": 0.85944664478302, "rewards/rejected": -2.8099985122680664, "sft_loss": 1.9364144802093506, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 9.391206321349692, "learning_rate": 1.4462810975235915e-06, "logits/chosen": -0.29772359132766724, "logits/rejected": -0.08676508814096451, "logps/chosen": -1.6872972249984741, "logps/rejected": -2.605922222137451, "loss": 0.6559, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6872972249984741, "rewards/margins": 0.9186250567436218, "rewards/rejected": -2.605922222137451, "sft_loss": 1.749237060546875, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 8.542656017395316, "learning_rate": 1.4416121547863703e-06, "logits/chosen": 0.013849747367203236, "logits/rejected": 0.23345601558685303, "logps/chosen": -1.7267534732818604, "logps/rejected": -2.6541545391082764, "loss": 0.6792, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7267534732818604, "rewards/margins": 0.927401065826416, "rewards/rejected": -2.6541545391082764, "sft_loss": 1.7446855306625366, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 10.312933253800834, "learning_rate": 1.4369437785282794e-06, "logits/chosen": -0.17418628931045532, "logits/rejected": 0.011349151842296124, "logps/chosen": -1.736092209815979, "logps/rejected": -2.6571297645568848, "loss": 0.6453, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.736092209815979, "rewards/margins": 0.9210373163223267, "rewards/rejected": -2.6571297645568848, "sft_loss": 1.7266775369644165, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 9.714136663542105, "learning_rate": 1.4322760140419259e-06, "logits/chosen": -0.17709819972515106, "logits/rejected": -0.012431099079549313, "logps/chosen": -1.5618194341659546, "logps/rejected": -2.543933153152466, "loss": 0.6422, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5618194341659546, "rewards/margins": 0.9821133613586426, "rewards/rejected": -2.543933153152466, "sft_loss": 1.5918623208999634, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 15.467849504617234, "learning_rate": 1.427608906613981e-06, "logits/chosen": -0.010245876386761665, "logits/rejected": -0.06519723683595657, "logps/chosen": -1.7043612003326416, "logps/rejected": -2.712526559829712, "loss": 0.6457, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7043612003326416, "rewards/margins": 1.0081654787063599, "rewards/rejected": -2.712526559829712, "sft_loss": 1.843871831893921, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 11.24888997921364, "learning_rate": 1.4229425015247414e-06, "logits/chosen": -0.26887112855911255, "logits/rejected": -0.08024895191192627, "logps/chosen": -1.7478374242782593, "logps/rejected": -2.45278263092041, "loss": 0.7308, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7478374242782593, "rewards/margins": 0.7049452662467957, "rewards/rejected": -2.45278263092041, "sft_loss": 1.819427490234375, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 10.239901554975248, "learning_rate": 1.4182768440476904e-06, "logits/chosen": -0.13010664284229279, "logits/rejected": 0.025120923295617104, "logps/chosen": -1.6842975616455078, "logps/rejected": -2.596648693084717, "loss": 0.6478, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6842975616455078, "rewards/margins": 0.912351131439209, "rewards/rejected": -2.596648693084717, "sft_loss": 1.7377408742904663, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 12.921080947407273, "learning_rate": 1.4136119794490567e-06, "logits/chosen": -0.15024778246879578, "logits/rejected": 0.0847291350364685, "logps/chosen": -1.7774083614349365, "logps/rejected": -2.4142112731933594, "loss": 0.7723, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7774083614349365, "rewards/margins": 0.6368028521537781, "rewards/rejected": -2.4142112731933594, "sft_loss": 1.8198562860488892, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 10.081702354572228, "learning_rate": 1.4089479529873773e-06, "logits/chosen": 0.01988459937274456, "logits/rejected": 0.10248573869466782, "logps/chosen": -1.7284653186798096, "logps/rejected": -2.532827615737915, "loss": 0.6867, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7284653186798096, "rewards/margins": 0.804362416267395, "rewards/rejected": -2.532827615737915, "sft_loss": 1.7420072555541992, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 12.41462553686139, "learning_rate": 1.4042848099130574e-06, "logits/chosen": 0.02473408542573452, "logits/rejected": 0.09276667982339859, "logps/chosen": -1.6429874897003174, "logps/rejected": -2.2225372791290283, "loss": 0.7315, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6429874897003174, "rewards/margins": 0.5795496702194214, "rewards/rejected": -2.2225372791290283, "sft_loss": 1.7064769268035889, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 7.7584792999715795, "learning_rate": 1.3996225954679317e-06, "logits/chosen": -0.1788342446088791, "logits/rejected": 0.09930647909641266, "logps/chosen": -1.5633602142333984, "logps/rejected": -2.3918919563293457, "loss": 0.6304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5633602142333984, "rewards/margins": 0.8285319209098816, "rewards/rejected": -2.3918919563293457, "sft_loss": 1.564711332321167, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 11.94172789929859, "learning_rate": 1.3949613548848248e-06, "logits/chosen": -0.1356816589832306, "logits/rejected": 0.039621032774448395, "logps/chosen": -1.5667574405670166, "logps/rejected": -2.434279441833496, "loss": 0.6524, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5667574405670166, "rewards/margins": 0.8675218820571899, "rewards/rejected": -2.434279441833496, "sft_loss": 1.5231401920318604, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 11.96012738340361, "learning_rate": 1.3903011333871134e-06, "logits/chosen": -0.0068131862208247185, "logits/rejected": 0.2507849633693695, "logps/chosen": -1.7056745290756226, "logps/rejected": -2.4680447578430176, "loss": 0.7081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7056745290756226, "rewards/margins": 0.7623699307441711, "rewards/rejected": -2.4680447578430176, "sft_loss": 1.7112070322036743, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.44144490361213684, "eval_logits/rejected": 0.573809802532196, "eval_logps/chosen": -1.691447138786316, "eval_logps/rejected": -2.342031240463257, "eval_loss": 0.727591872215271, "eval_rewards/accuracies": 0.6772996783256531, "eval_rewards/chosen": -1.691447138786316, "eval_rewards/margins": 0.6505837440490723, "eval_rewards/rejected": -2.342031240463257, "eval_runtime": 44.3817, "eval_samples_per_second": 30.305, "eval_sft_loss": 1.6777325868606567, "eval_steps_per_second": 7.593, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 8.07626648371608, "learning_rate": 1.3856419761882875e-06, "logits/chosen": -0.14924605190753937, "logits/rejected": 0.030980080366134644, "logps/chosen": -1.6322886943817139, "logps/rejected": -2.489731550216675, "loss": 0.6552, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6322886943817139, "rewards/margins": 0.8574431538581848, "rewards/rejected": -2.489731550216675, "sft_loss": 1.647454023361206, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 10.278073443422485, "learning_rate": 1.3809839284915096e-06, "logits/chosen": -0.12130733579397202, "logits/rejected": 0.05340119078755379, "logps/chosen": -1.6221942901611328, "logps/rejected": -2.3026764392852783, "loss": 0.7143, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6221942901611328, "rewards/margins": 0.680482029914856, "rewards/rejected": -2.3026764392852783, "sft_loss": 1.6290735006332397, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 10.784190354520463, "learning_rate": 1.3763270354891795e-06, "logits/chosen": -0.05685758590698242, "logits/rejected": 0.1100848913192749, "logps/chosen": -1.6883134841918945, "logps/rejected": -2.5045347213745117, "loss": 0.6823, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6883134841918945, "rewards/margins": 0.8162212371826172, "rewards/rejected": -2.5045347213745117, "sft_loss": 1.6928250789642334, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 10.191700276852643, "learning_rate": 1.3716713423624936e-06, "logits/chosen": -0.14868003129959106, "logits/rejected": 0.2683381140232086, "logps/chosen": -1.8373911380767822, "logps/rejected": -2.7939047813415527, "loss": 0.7222, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8373911380767822, "rewards/margins": 0.9565135836601257, "rewards/rejected": -2.7939047813415527, "sft_loss": 1.7662341594696045, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 9.249713190171791, "learning_rate": 1.367016894281007e-06, "logits/chosen": -0.09739672392606735, "logits/rejected": 0.10374144464731216, "logps/chosen": -1.5907642841339111, "logps/rejected": -2.488351345062256, "loss": 0.639, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5907642841339111, "rewards/margins": 0.8975872993469238, "rewards/rejected": -2.488351345062256, "sft_loss": 1.6401208639144897, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 19.03202453774214, "learning_rate": 1.3623637364021952e-06, "logits/chosen": -0.17357507348060608, "logits/rejected": 0.0912250429391861, "logps/chosen": -1.7484121322631836, "logps/rejected": -2.9147703647613525, "loss": 0.6135, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7484121322631836, "rewards/margins": 1.1663583517074585, "rewards/rejected": -2.9147703647613525, "sft_loss": 1.764880895614624, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 10.605975138159263, "learning_rate": 1.3577119138710165e-06, "logits/chosen": -0.20004332065582275, "logits/rejected": -0.06560181826353073, "logps/chosen": -1.7282603979110718, "logps/rejected": -2.6194205284118652, "loss": 0.6598, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7282603979110718, "rewards/margins": 0.8911601305007935, "rewards/rejected": -2.6194205284118652, "sft_loss": 1.7524601221084595, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 9.412493447124648, "learning_rate": 1.3530614718194734e-06, "logits/chosen": -0.08897742629051208, "logits/rejected": 0.1184593215584755, "logps/chosen": -1.6912317276000977, "logps/rejected": -2.774104595184326, "loss": 0.6417, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6912317276000977, "rewards/margins": 1.082872748374939, "rewards/rejected": -2.774104595184326, "sft_loss": 1.6165220737457275, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 12.166200755671625, "learning_rate": 1.3484124553661754e-06, "logits/chosen": -0.2774050831794739, "logits/rejected": -0.04774611443281174, "logps/chosen": -1.6932928562164307, "logps/rejected": -2.6121184825897217, "loss": 0.6716, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6932928562164307, "rewards/margins": 0.9188259840011597, "rewards/rejected": -2.6121184825897217, "sft_loss": 1.652806282043457, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 8.449276004917175, "learning_rate": 1.3437649096159e-06, "logits/chosen": -0.057559557259082794, "logits/rejected": 0.21200330555438995, "logps/chosen": -1.6470773220062256, "logps/rejected": -2.6044399738311768, "loss": 0.6359, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6470773220062256, "rewards/margins": 0.9573624730110168, "rewards/rejected": -2.6044399738311768, "sft_loss": 1.6401879787445068, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 7.952629055673774, "learning_rate": 1.3391188796591568e-06, "logits/chosen": -0.09195758402347565, "logits/rejected": 0.045628756284713745, "logps/chosen": -1.7653684616088867, "logps/rejected": -2.609598159790039, "loss": 0.6907, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7653684616088867, "rewards/margins": 0.8442295789718628, "rewards/rejected": -2.609598159790039, "sft_loss": 1.7955518960952759, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 10.776842648366552, "learning_rate": 1.3344744105717487e-06, "logits/chosen": -0.1589999496936798, "logits/rejected": 0.024349741637706757, "logps/chosen": -1.701062560081482, "logps/rejected": -2.576101779937744, "loss": 0.6631, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.701062560081482, "rewards/margins": 0.8750389218330383, "rewards/rejected": -2.576101779937744, "sft_loss": 1.7430168390274048, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 12.426345973657531, "learning_rate": 1.3298315474143354e-06, "logits/chosen": -0.018960092216730118, "logits/rejected": 0.15279479324817657, "logps/chosen": -1.675971269607544, "logps/rejected": -2.6613776683807373, "loss": 0.6498, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.675971269607544, "rewards/margins": 0.985406756401062, "rewards/rejected": -2.6613776683807373, "sft_loss": 1.7282222509384155, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 9.337865950173851, "learning_rate": 1.3251903352319951e-06, "logits/chosen": -0.14004510641098022, "logits/rejected": 0.0643129050731659, "logps/chosen": -1.6384871006011963, "logps/rejected": -2.6979575157165527, "loss": 0.6506, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6384871006011963, "rewards/margins": 1.059470534324646, "rewards/rejected": -2.6979575157165527, "sft_loss": 1.6707124710083008, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 11.94103230577254, "learning_rate": 1.3205508190537895e-06, "logits/chosen": -0.208818718791008, "logits/rejected": 0.2646576762199402, "logps/chosen": -1.664166808128357, "logps/rejected": -2.5710747241973877, "loss": 0.633, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.664166808128357, "rewards/margins": 0.9069075584411621, "rewards/rejected": -2.5710747241973877, "sft_loss": 1.705847978591919, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 10.251759506687932, "learning_rate": 1.3159130438923242e-06, "logits/chosen": -0.11879494041204453, "logits/rejected": -0.02837967872619629, "logps/chosen": -1.5379129648208618, "logps/rejected": -2.41559100151062, "loss": 0.6193, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5379129648208618, "rewards/margins": 0.8776780962944031, "rewards/rejected": -2.41559100151062, "sft_loss": 1.629476547241211, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 9.306495320308388, "learning_rate": 1.3112770547433144e-06, "logits/chosen": -0.20200033485889435, "logits/rejected": 0.11474726349115372, "logps/chosen": -1.641689658164978, "logps/rejected": -2.5257320404052734, "loss": 0.6589, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.641689658164978, "rewards/margins": 0.8840423822402954, "rewards/rejected": -2.5257320404052734, "sft_loss": 1.6782573461532593, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 10.81878563335178, "learning_rate": 1.3066428965851472e-06, "logits/chosen": -0.09830178320407867, "logits/rejected": 0.04885398969054222, "logps/chosen": -1.6704227924346924, "logps/rejected": -2.510758876800537, "loss": 0.6876, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6704227924346924, "rewards/margins": 0.8403360247612, "rewards/rejected": -2.510758876800537, "sft_loss": 1.7284488677978516, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 9.278282957835337, "learning_rate": 1.3020106143784454e-06, "logits/chosen": -0.15651771426200867, "logits/rejected": -0.01825041137635708, "logps/chosen": -1.8048747777938843, "logps/rejected": -2.6454477310180664, "loss": 0.7163, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8048747777938843, "rewards/margins": 0.8405729532241821, "rewards/rejected": -2.6454477310180664, "sft_loss": 1.8150914907455444, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 8.75353222908878, "learning_rate": 1.2973802530656314e-06, "logits/chosen": -0.3050258755683899, "logits/rejected": -0.1058342456817627, "logps/chosen": -1.7767432928085327, "logps/rejected": -2.729957103729248, "loss": 0.6791, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7767432928085327, "rewards/margins": 0.9532135725021362, "rewards/rejected": -2.729957103729248, "sft_loss": 1.848341703414917, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 13.700892580022298, "learning_rate": 1.2927518575704906e-06, "logits/chosen": -0.28139573335647583, "logits/rejected": -0.033496059477329254, "logps/chosen": -1.7814384698867798, "logps/rejected": -2.7443580627441406, "loss": 0.6742, "rewards/accuracies": 0.75, "rewards/chosen": -1.7814384698867798, "rewards/margins": 0.9629195928573608, "rewards/rejected": -2.7443580627441406, "sft_loss": 1.786010980606079, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 9.66418456858833, "learning_rate": 1.2881254727977365e-06, "logits/chosen": -0.05890367552638054, "logits/rejected": -0.013821298256516457, "logps/chosen": -1.7495791912078857, "logps/rejected": -2.6106810569763184, "loss": 0.6464, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7495791912078857, "rewards/margins": 0.8611016273498535, "rewards/rejected": -2.6106810569763184, "sft_loss": 1.7882887125015259, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 15.941166826322611, "learning_rate": 1.2835011436325749e-06, "logits/chosen": -0.2913353741168976, "logits/rejected": -0.02926046773791313, "logps/chosen": -1.732381820678711, "logps/rejected": -2.5884523391723633, "loss": 0.6843, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.732381820678711, "rewards/margins": 0.856070339679718, "rewards/rejected": -2.5884523391723633, "sft_loss": 1.7478708028793335, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 7.3216676831575525, "learning_rate": 1.278878914940267e-06, "logits/chosen": -0.22623327374458313, "logits/rejected": 0.11279468238353729, "logps/chosen": -1.703449010848999, "logps/rejected": -2.82920503616333, "loss": 0.6434, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.703449010848999, "rewards/margins": 1.125756025314331, "rewards/rejected": -2.82920503616333, "sft_loss": 1.7550933361053467, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 9.222772282539346, "learning_rate": 1.2742588315656963e-06, "logits/chosen": -0.30736929178237915, "logits/rejected": -0.03906359151005745, "logps/chosen": -1.6967036724090576, "logps/rejected": -2.6465067863464355, "loss": 0.6487, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6967036724090576, "rewards/margins": 0.9498027563095093, "rewards/rejected": -2.6465067863464355, "sft_loss": 1.807225227355957, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 9.902930947660389, "learning_rate": 1.269640938332932e-06, "logits/chosen": -0.18472820520401, "logits/rejected": -0.04140182584524155, "logps/chosen": -1.5697466135025024, "logps/rejected": -2.574474811553955, "loss": 0.6254, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5697466135025024, "rewards/margins": 1.0047283172607422, "rewards/rejected": -2.574474811553955, "sft_loss": 1.6612666845321655, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 23.399323422517572, "learning_rate": 1.265025280044794e-06, "logits/chosen": -0.1952822506427765, "logits/rejected": 0.030080635100603104, "logps/chosen": -1.7290503978729248, "logps/rejected": -2.5522682666778564, "loss": 0.6611, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7290503978729248, "rewards/margins": 0.8232178688049316, "rewards/rejected": -2.5522682666778564, "sft_loss": 1.7029396295547485, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 11.34202465880965, "learning_rate": 1.2604119014824197e-06, "logits/chosen": -0.14711865782737732, "logits/rejected": 0.09781067818403244, "logps/chosen": -1.6153545379638672, "logps/rejected": -2.501518726348877, "loss": 0.6616, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6153545379638672, "rewards/margins": 0.8861643075942993, "rewards/rejected": -2.501518726348877, "sft_loss": 1.6534423828125, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 10.981100868803473, "learning_rate": 1.2558008474048279e-06, "logits/chosen": -0.2000230848789215, "logits/rejected": 0.08384416997432709, "logps/chosen": -1.5488859415054321, "logps/rejected": -2.4278249740600586, "loss": 0.6372, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5488859415054321, "rewards/margins": 0.8789387941360474, "rewards/rejected": -2.4278249740600586, "sft_loss": 1.6269474029541016, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 11.809243251425459, "learning_rate": 1.2511921625484857e-06, "logits/chosen": -0.3878183960914612, "logits/rejected": -0.222340390086174, "logps/chosen": -1.7283685207366943, "logps/rejected": -2.52485990524292, "loss": 0.6616, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7283685207366943, "rewards/margins": 0.796491265296936, "rewards/rejected": -2.52485990524292, "sft_loss": 1.729326605796814, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 13.993998126297274, "learning_rate": 1.2465858916268734e-06, "logits/chosen": -0.10277875512838364, "logits/rejected": -0.04876113682985306, "logps/chosen": -1.8125221729278564, "logps/rejected": -2.5582568645477295, "loss": 0.7396, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8125221729278564, "rewards/margins": 0.7457348704338074, "rewards/rejected": -2.5582568645477295, "sft_loss": 1.7993252277374268, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 11.587421995678856, "learning_rate": 1.2419820793300526e-06, "logits/chosen": -0.30335888266563416, "logits/rejected": -0.002621948719024658, "logps/chosen": -1.6230109930038452, "logps/rejected": -2.533942699432373, "loss": 0.6614, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6230109930038452, "rewards/margins": 0.9109317064285278, "rewards/rejected": -2.533942699432373, "sft_loss": 1.6495774984359741, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 11.811907418936265, "learning_rate": 1.2373807703242293e-06, "logits/chosen": -0.3585537374019623, "logits/rejected": -0.08721666783094406, "logps/chosen": -1.7728416919708252, "logps/rejected": -2.6852431297302246, "loss": 0.6668, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7728416919708252, "rewards/margins": 0.9124017953872681, "rewards/rejected": -2.6852431297302246, "sft_loss": 1.833508849143982, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 11.735212113421865, "learning_rate": 1.232782009251324e-06, "logits/chosen": -0.3023667633533478, "logits/rejected": -0.05005481094121933, "logps/chosen": -1.763489007949829, "logps/rejected": -2.484192371368408, "loss": 0.7278, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.763489007949829, "rewards/margins": 0.720703661441803, "rewards/rejected": -2.484192371368408, "sft_loss": 1.8116620779037476, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 13.400719562159768, "learning_rate": 1.228185840728537e-06, "logits/chosen": -0.07832861691713333, "logits/rejected": -0.011995521374046803, "logps/chosen": -1.8001636266708374, "logps/rejected": -2.66902232170105, "loss": 0.728, "rewards/accuracies": 0.75, "rewards/chosen": -1.8001636266708374, "rewards/margins": 0.8688589334487915, "rewards/rejected": -2.66902232170105, "sft_loss": 1.8136160373687744, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 8.349033077110981, "learning_rate": 1.2235923093479156e-06, "logits/chosen": -0.3206074833869934, "logits/rejected": -0.07655216008424759, "logps/chosen": -1.666358232498169, "logps/rejected": -2.565678358078003, "loss": 0.6547, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.666358232498169, "rewards/margins": 0.8993202447891235, "rewards/rejected": -2.565678358078003, "sft_loss": 1.6472370624542236, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 9.222918908639995, "learning_rate": 1.219001459675921e-06, "logits/chosen": -0.2530183792114258, "logits/rejected": -0.23013608157634735, "logps/chosen": -1.67684805393219, "logps/rejected": -2.330507516860962, "loss": 0.7125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.67684805393219, "rewards/margins": 0.6536593437194824, "rewards/rejected": -2.330507516860962, "sft_loss": 1.6849253177642822, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 11.003158412672942, "learning_rate": 1.2144133362529974e-06, "logits/chosen": -0.2570267617702484, "logits/rejected": -0.04421461373567581, "logps/chosen": -1.7353019714355469, "logps/rejected": -2.479492664337158, "loss": 0.7085, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7353019714355469, "rewards/margins": 0.7441903948783875, "rewards/rejected": -2.479492664337158, "sft_loss": 1.7561286687850952, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 11.206430050702487, "learning_rate": 1.2098279835931382e-06, "logits/chosen": -0.2855226397514343, "logits/rejected": -0.10674687474966049, "logps/chosen": -1.5438029766082764, "logps/rejected": -2.4865102767944336, "loss": 0.6322, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5438029766082764, "rewards/margins": 0.9427075386047363, "rewards/rejected": -2.4865102767944336, "sft_loss": 1.5557372570037842, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 8.682823478026071, "learning_rate": 1.2052454461834544e-06, "logits/chosen": -0.17236191034317017, "logits/rejected": 0.019247237592935562, "logps/chosen": -1.6885963678359985, "logps/rejected": -2.52380108833313, "loss": 0.6775, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6885963678359985, "rewards/margins": 0.8352048993110657, "rewards/rejected": -2.52380108833313, "sft_loss": 1.705102562904358, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 12.38921176122187, "learning_rate": 1.2006657684837445e-06, "logits/chosen": -0.25058725476264954, "logits/rejected": -0.04623941332101822, "logps/chosen": -1.6525062322616577, "logps/rejected": -2.4225525856018066, "loss": 0.672, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6525062322616577, "rewards/margins": 0.7700467109680176, "rewards/rejected": -2.4225525856018066, "sft_loss": 1.7160422801971436, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 9.703765936226722, "learning_rate": 1.1960889949260613e-06, "logits/chosen": -0.2633149027824402, "logits/rejected": 0.08391741663217545, "logps/chosen": -1.7922632694244385, "logps/rejected": -2.605243682861328, "loss": 0.6699, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7922632694244385, "rewards/margins": 0.8129804730415344, "rewards/rejected": -2.605243682861328, "sft_loss": 1.7857837677001953, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 17.54487018281387, "learning_rate": 1.1915151699142825e-06, "logits/chosen": -0.29941219091415405, "logits/rejected": -0.12223385274410248, "logps/chosen": -1.7729759216308594, "logps/rejected": -2.7748351097106934, "loss": 0.6732, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7729759216308594, "rewards/margins": 1.0018593072891235, "rewards/rejected": -2.7748351097106934, "sft_loss": 1.8728386163711548, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 19.782965861794818, "learning_rate": 1.1869443378236782e-06, "logits/chosen": -0.13840122520923615, "logits/rejected": 0.024842610582709312, "logps/chosen": -1.9172168970108032, "logps/rejected": -2.925680637359619, "loss": 0.7141, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9172168970108032, "rewards/margins": 1.008463740348816, "rewards/rejected": -2.925680637359619, "sft_loss": 1.9686994552612305, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 9.29544600728352, "learning_rate": 1.1823765430004812e-06, "logits/chosen": -0.2544843554496765, "logits/rejected": -0.1964297890663147, "logps/chosen": -1.7466745376586914, "logps/rejected": -2.728344678878784, "loss": 0.6701, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7466745376586914, "rewards/margins": 0.9816702604293823, "rewards/rejected": -2.728344678878784, "sft_loss": 1.7416541576385498, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 9.722002501123056, "learning_rate": 1.177811829761457e-06, "logits/chosen": -0.21858203411102295, "logits/rejected": -0.038083307445049286, "logps/chosen": -1.7050504684448242, "logps/rejected": -2.841707944869995, "loss": 0.6266, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7050504684448242, "rewards/margins": 1.136657476425171, "rewards/rejected": -2.841707944869995, "sft_loss": 1.7132253646850586, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 12.306560826422983, "learning_rate": 1.1732502423934737e-06, "logits/chosen": -0.21269071102142334, "logits/rejected": -0.06579498946666718, "logps/chosen": -1.6928743124008179, "logps/rejected": -2.6516270637512207, "loss": 0.6169, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6928743124008179, "rewards/margins": 0.9587525129318237, "rewards/rejected": -2.6516270637512207, "sft_loss": 1.7344623804092407, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 14.207434505489841, "learning_rate": 1.1686918251530716e-06, "logits/chosen": -0.2616156041622162, "logits/rejected": -0.11248783767223358, "logps/chosen": -1.6836540699005127, "logps/rejected": -2.8523545265197754, "loss": 0.6497, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6836540699005127, "rewards/margins": 1.1687004566192627, "rewards/rejected": -2.8523545265197754, "sft_loss": 1.7119197845458984, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 10.777338659368121, "learning_rate": 1.164136622266035e-06, "logits/chosen": -0.2616646885871887, "logits/rejected": 0.056531958281993866, "logps/chosen": -1.7460088729858398, "logps/rejected": -2.6776375770568848, "loss": 0.6629, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7460088729858398, "rewards/margins": 0.9316285848617554, "rewards/rejected": -2.6776375770568848, "sft_loss": 1.8117141723632812, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 13.964630061751256, "learning_rate": 1.1595846779269622e-06, "logits/chosen": -0.3164612650871277, "logits/rejected": -0.06304170936346054, "logps/chosen": -1.723750114440918, "logps/rejected": -2.711017608642578, "loss": 0.6582, "rewards/accuracies": 0.78125, "rewards/chosen": -1.723750114440918, "rewards/margins": 0.9872674942016602, "rewards/rejected": -2.711017608642578, "sft_loss": 1.7842611074447632, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 9.837501884568988, "learning_rate": 1.155036036298837e-06, "logits/chosen": -0.20438392460346222, "logits/rejected": 0.09091716259717941, "logps/chosen": -1.8601843118667603, "logps/rejected": -2.813197374343872, "loss": 0.7041, "rewards/accuracies": 0.75, "rewards/chosen": -1.8601843118667603, "rewards/margins": 0.9530132412910461, "rewards/rejected": -2.813197374343872, "sft_loss": 1.8708412647247314, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 10.723020602530795, "learning_rate": 1.1504907415126008e-06, "logits/chosen": -0.045906491577625275, "logits/rejected": 0.10496882349252701, "logps/chosen": -1.7108036279678345, "logps/rejected": -2.6833956241607666, "loss": 0.6472, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7108036279678345, "rewards/margins": 0.9725920557975769, "rewards/rejected": -2.6833956241607666, "sft_loss": 1.714491605758667, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 7.670794619054316, "learning_rate": 1.1459488376667235e-06, "logits/chosen": -0.20854513347148895, "logits/rejected": -0.055694401264190674, "logps/chosen": -1.5863817930221558, "logps/rejected": -2.354175329208374, "loss": 0.6701, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5863817930221558, "rewards/margins": 0.7677937150001526, "rewards/rejected": -2.354175329208374, "sft_loss": 1.597285509109497, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 8.962136533440228, "learning_rate": 1.1414103688267756e-06, "logits/chosen": -0.20327985286712646, "logits/rejected": -0.07620219141244888, "logps/chosen": -1.7575534582138062, "logps/rejected": -2.6742453575134277, "loss": 0.6806, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7575534582138062, "rewards/margins": 0.9166919589042664, "rewards/rejected": -2.6742453575134277, "sft_loss": 1.771837592124939, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 14.298445634777934, "learning_rate": 1.136875379025002e-06, "logits/chosen": -0.16482272744178772, "logits/rejected": -0.09752872586250305, "logps/chosen": -1.657428503036499, "logps/rejected": -2.488154172897339, "loss": 0.6659, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.657428503036499, "rewards/margins": 0.8307255506515503, "rewards/rejected": -2.488154172897339, "sft_loss": 1.6614271402359009, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 9.376689059470104, "learning_rate": 1.132343912259894e-06, "logits/chosen": -0.02655973471701145, "logits/rejected": 0.017931750044226646, "logps/chosen": -1.7234824895858765, "logps/rejected": -2.5292980670928955, "loss": 0.6903, "rewards/accuracies": 0.75, "rewards/chosen": -1.7234824895858765, "rewards/margins": 0.8058153986930847, "rewards/rejected": -2.5292980670928955, "sft_loss": 1.782034158706665, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 10.422796001151918, "learning_rate": 1.1278160124957617e-06, "logits/chosen": -0.08756308257579803, "logits/rejected": 0.10158822685480118, "logps/chosen": -1.6388031244277954, "logps/rejected": -2.4249258041381836, "loss": 0.6825, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6388031244277954, "rewards/margins": 0.7861226797103882, "rewards/rejected": -2.4249258041381836, "sft_loss": 1.7268263101577759, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 9.90934852486259, "learning_rate": 1.1232917236623085e-06, "logits/chosen": -0.09540453553199768, "logits/rejected": 0.0738372951745987, "logps/chosen": -1.6657747030258179, "logps/rejected": -2.4016175270080566, "loss": 0.6755, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6657747030258179, "rewards/margins": 0.7358425855636597, "rewards/rejected": -2.4016175270080566, "sft_loss": 1.7676925659179688, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 12.004707107522934, "learning_rate": 1.1187710896542045e-06, "logits/chosen": -0.25235700607299805, "logits/rejected": 0.0003812193754129112, "logps/chosen": -1.7461745738983154, "logps/rejected": -2.481781005859375, "loss": 0.6677, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7461745738983154, "rewards/margins": 0.7356064915657043, "rewards/rejected": -2.481781005859375, "sft_loss": 1.7998241186141968, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 12.96635107429613, "learning_rate": 1.1142541543306603e-06, "logits/chosen": -0.07630597800016403, "logits/rejected": 0.15812484920024872, "logps/chosen": -1.6824067831039429, "logps/rejected": -2.72544264793396, "loss": 0.647, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6824067831039429, "rewards/margins": 1.0430357456207275, "rewards/rejected": -2.72544264793396, "sft_loss": 1.7791961431503296, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 12.035501698692357, "learning_rate": 1.109740961515003e-06, "logits/chosen": -0.18747368454933167, "logits/rejected": 0.032625712454319, "logps/chosen": -1.709657907485962, "logps/rejected": -2.7182281017303467, "loss": 0.6193, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.709657907485962, "rewards/margins": 1.0085701942443848, "rewards/rejected": -2.7182281017303467, "sft_loss": 1.7642666101455688, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 15.913458782743897, "learning_rate": 1.1052315549942487e-06, "logits/chosen": -0.17805495858192444, "logits/rejected": -0.04204495996236801, "logps/chosen": -1.6713043451309204, "logps/rejected": -2.604113817214966, "loss": 0.6414, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6713043451309204, "rewards/margins": 0.9328095316886902, "rewards/rejected": -2.604113817214966, "sft_loss": 1.707510232925415, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 13.819186328459246, "learning_rate": 1.100725978518679e-06, "logits/chosen": -0.19229435920715332, "logits/rejected": 0.12436362355947495, "logps/chosen": -1.8118988275527954, "logps/rejected": -2.694197654724121, "loss": 0.6863, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8118988275527954, "rewards/margins": 0.8822988271713257, "rewards/rejected": -2.694197654724121, "sft_loss": 1.8271783590316772, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 12.142640316077669, "learning_rate": 1.0962242758014169e-06, "logits/chosen": -0.24944384396076202, "logits/rejected": 0.014292346313595772, "logps/chosen": -1.7131149768829346, "logps/rejected": -2.722731590270996, "loss": 0.6575, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7131149768829346, "rewards/margins": 1.009616494178772, "rewards/rejected": -2.722731590270996, "sft_loss": 1.7885093688964844, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 8.832463656004036, "learning_rate": 1.091726490518002e-06, "logits/chosen": -0.14067216217517853, "logits/rejected": 0.15703320503234863, "logps/chosen": -1.722751259803772, "logps/rejected": -2.6994786262512207, "loss": 0.6509, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.722751259803772, "rewards/margins": 0.9767271280288696, "rewards/rejected": -2.6994786262512207, "sft_loss": 1.7842895984649658, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 12.074613464208511, "learning_rate": 1.0872326663059668e-06, "logits/chosen": -0.13440512120723724, "logits/rejected": -0.029331039637327194, "logps/chosen": -1.7340189218521118, "logps/rejected": -2.644932270050049, "loss": 0.673, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7340189218521118, "rewards/margins": 0.910913348197937, "rewards/rejected": -2.644932270050049, "sft_loss": 1.845868706703186, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 9.783673762859099, "learning_rate": 1.0827428467644132e-06, "logits/chosen": -0.23015666007995605, "logits/rejected": -0.028845742344856262, "logps/chosen": -1.6154110431671143, "logps/rejected": -2.612565279006958, "loss": 0.6492, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6154110431671143, "rewards/margins": 0.9971543550491333, "rewards/rejected": -2.612565279006958, "sft_loss": 1.6820234060287476, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 12.68272767354526, "learning_rate": 1.0782570754535903e-06, "logits/chosen": -0.2367834597826004, "logits/rejected": 0.10590411722660065, "logps/chosen": -1.6864131689071655, "logps/rejected": -2.4310550689697266, "loss": 0.6853, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6864131689071655, "rewards/margins": 0.744641900062561, "rewards/rejected": -2.4310550689697266, "sft_loss": 1.7294981479644775, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 10.089494425507207, "learning_rate": 1.0737753958944712e-06, "logits/chosen": -0.38933509588241577, "logits/rejected": 0.024811876937747, "logps/chosen": -1.6396119594573975, "logps/rejected": -2.5669679641723633, "loss": 0.6176, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6396119594573975, "rewards/margins": 0.927355945110321, "rewards/rejected": -2.5669679641723633, "sft_loss": 1.6741783618927002, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 10.666231075184285, "learning_rate": 1.0692978515683305e-06, "logits/chosen": -0.09203028678894043, "logits/rejected": 0.036007750779390335, "logps/chosen": -1.7163540124893188, "logps/rejected": -2.5543947219848633, "loss": 0.6857, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7163540124893188, "rewards/margins": 0.8380409479141235, "rewards/rejected": -2.5543947219848633, "sft_loss": 1.667665719985962, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 9.227970918355686, "learning_rate": 1.0648244859163227e-06, "logits/chosen": -0.278619647026062, "logits/rejected": -0.07375986874103546, "logps/chosen": -1.6772725582122803, "logps/rejected": -2.598586082458496, "loss": 0.6961, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6772725582122803, "rewards/margins": 0.9213134050369263, "rewards/rejected": -2.598586082458496, "sft_loss": 1.693250060081482, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 9.376703127692286, "learning_rate": 1.0603553423390612e-06, "logits/chosen": -0.21297414600849152, "logits/rejected": -0.047949619591236115, "logps/chosen": -1.6670162677764893, "logps/rejected": -2.5751991271972656, "loss": 0.658, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6670162677764893, "rewards/margins": 0.9081829190254211, "rewards/rejected": -2.5751991271972656, "sft_loss": 1.7023769617080688, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 11.975502715594226, "learning_rate": 1.0558904641961966e-06, "logits/chosen": -0.14659383893013, "logits/rejected": -0.027252143248915672, "logps/chosen": -1.6418933868408203, "logps/rejected": -2.7534546852111816, "loss": 0.6329, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6418933868408203, "rewards/margins": 1.1115614175796509, "rewards/rejected": -2.7534546852111816, "sft_loss": 1.7112839221954346, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 7.8401392786102715, "learning_rate": 1.0514298948059961e-06, "logits/chosen": -0.2800833582878113, "logits/rejected": -0.029077952727675438, "logps/chosen": -1.6596229076385498, "logps/rejected": -2.5685582160949707, "loss": 0.6284, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6596229076385498, "rewards/margins": 0.9089350700378418, "rewards/rejected": -2.5685582160949707, "sft_loss": 1.6742641925811768, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 8.695115595309554, "learning_rate": 1.0469736774449235e-06, "logits/chosen": -0.10720052570104599, "logits/rejected": 0.06181992217898369, "logps/chosen": -1.6598479747772217, "logps/rejected": -2.563051700592041, "loss": 0.6968, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6598479747772217, "rewards/margins": 0.9032036662101746, "rewards/rejected": -2.563051700592041, "sft_loss": 1.6553401947021484, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 10.500319573242674, "learning_rate": 1.0425218553472193e-06, "logits/chosen": -0.18654967844486237, "logits/rejected": -0.10578273236751556, "logps/chosen": -1.5896636247634888, "logps/rejected": -2.603238821029663, "loss": 0.6075, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5896636247634888, "rewards/margins": 1.0135753154754639, "rewards/rejected": -2.603238821029663, "sft_loss": 1.6410157680511475, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 8.631834709897678, "learning_rate": 1.038074471704481e-06, "logits/chosen": -0.054431747645139694, "logits/rejected": 0.03705093264579773, "logps/chosen": -1.7226107120513916, "logps/rejected": -2.614551544189453, "loss": 0.6581, "rewards/accuracies": 0.75, "rewards/chosen": -1.7226107120513916, "rewards/margins": 0.8919405937194824, "rewards/rejected": -2.614551544189453, "sft_loss": 1.818695306777954, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 10.026498518478865, "learning_rate": 1.033631569665244e-06, "logits/chosen": -0.1128377690911293, "logits/rejected": -0.022760801017284393, "logps/chosen": -1.6746219396591187, "logps/rejected": -2.4234249591827393, "loss": 0.6989, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6746219396591187, "rewards/margins": 0.7488031387329102, "rewards/rejected": -2.4234249591827393, "sft_loss": 1.7145401239395142, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 6.990504351673148, "learning_rate": 1.0291931923345635e-06, "logits/chosen": -0.2978639006614685, "logits/rejected": 0.0011643856996670365, "logps/chosen": -1.6716158390045166, "logps/rejected": -2.5651087760925293, "loss": 0.6452, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6716158390045166, "rewards/margins": 0.8934928774833679, "rewards/rejected": -2.5651087760925293, "sft_loss": 1.6098390817642212, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 9.659517645727767, "learning_rate": 1.0247593827735966e-06, "logits/chosen": -0.12814059853553772, "logits/rejected": 0.1287405639886856, "logps/chosen": -1.687361717224121, "logps/rejected": -2.841590404510498, "loss": 0.6451, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.687361717224121, "rewards/margins": 1.154228687286377, "rewards/rejected": -2.841590404510498, "sft_loss": 1.7420556545257568, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.30112695693969727, "eval_logits/rejected": 0.42516347765922546, "eval_logps/chosen": -1.7517114877700806, "eval_logps/rejected": -2.491905450820923, "eval_loss": 0.721508800983429, "eval_rewards/accuracies": 0.6913946866989136, "eval_rewards/chosen": -1.7517114877700806, "eval_rewards/margins": 0.7401941418647766, "eval_rewards/rejected": -2.491905450820923, "eval_runtime": 44.6434, "eval_samples_per_second": 30.128, "eval_sft_loss": 1.7451032400131226, "eval_steps_per_second": 7.549, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 11.010850184147426, "learning_rate": 1.0203301839991816e-06, "logits/chosen": -0.24366919696331024, "logits/rejected": -0.20541080832481384, "logps/chosen": -1.624942421913147, "logps/rejected": -2.368537187576294, "loss": 0.6968, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.624942421913147, "rewards/margins": 0.743594765663147, "rewards/rejected": -2.368537187576294, "sft_loss": 1.6673892736434937, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 8.505387043264376, "learning_rate": 1.0159056389834254e-06, "logits/chosen": -0.25455576181411743, "logits/rejected": -0.04168150946497917, "logps/chosen": -1.6813017129898071, "logps/rejected": -2.592050790786743, "loss": 0.6255, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6813017129898071, "rewards/margins": 0.9107491374015808, "rewards/rejected": -2.592050790786743, "sft_loss": 1.7684290409088135, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 8.924174869577604, "learning_rate": 1.0114857906532827e-06, "logits/chosen": -0.12101688235998154, "logits/rejected": 0.046328797936439514, "logps/chosen": -1.7440468072891235, "logps/rejected": -2.6186728477478027, "loss": 0.6596, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7440468072891235, "rewards/margins": 0.8746261596679688, "rewards/rejected": -2.6186728477478027, "sft_loss": 1.7590080499649048, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 18.61178170083434, "learning_rate": 1.0070706818901417e-06, "logits/chosen": -0.1992807686328888, "logits/rejected": -0.07354742288589478, "logps/chosen": -1.83464777469635, "logps/rejected": -2.608799695968628, "loss": 0.7223, "rewards/accuracies": 0.75, "rewards/chosen": -1.83464777469635, "rewards/margins": 0.7741519808769226, "rewards/rejected": -2.608799695968628, "sft_loss": 1.8808612823486328, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 9.162346431887451, "learning_rate": 1.0026603555294073e-06, "logits/chosen": -0.08581370115280151, "logits/rejected": -0.12211551517248154, "logps/chosen": -1.6881921291351318, "logps/rejected": -2.5504727363586426, "loss": 0.6617, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6881921291351318, "rewards/margins": 0.862280547618866, "rewards/rejected": -2.5504727363586426, "sft_loss": 1.743970513343811, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 12.001438140317974, "learning_rate": 9.982548543600843e-07, "logits/chosen": -0.17960326373577118, "logits/rejected": -0.14128082990646362, "logps/chosen": -1.7421640157699585, "logps/rejected": -2.7152018547058105, "loss": 0.6898, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7421640157699585, "rewards/margins": 0.973037600517273, "rewards/rejected": -2.7152018547058105, "sft_loss": 1.8456947803497314, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 10.323550418368065, "learning_rate": 9.93854221124365e-07, "logits/chosen": -0.2688259184360504, "logits/rejected": -0.14841492474079132, "logps/chosen": -1.670474648475647, "logps/rejected": -2.6174325942993164, "loss": 0.6382, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.670474648475647, "rewards/margins": 0.9469578862190247, "rewards/rejected": -2.6174325942993164, "sft_loss": 1.7346309423446655, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 11.719082491329008, "learning_rate": 9.894584985172121e-07, "logits/chosen": -0.1979060173034668, "logits/rejected": -0.10562784969806671, "logps/chosen": -1.806494116783142, "logps/rejected": -2.617361068725586, "loss": 0.6811, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.806494116783142, "rewards/margins": 0.8108669519424438, "rewards/rejected": -2.617361068725586, "sft_loss": 1.851362943649292, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 13.081773180078448, "learning_rate": 9.850677291859458e-07, "logits/chosen": -0.22949905693531036, "logits/rejected": -0.04544057324528694, "logps/chosen": -1.9166128635406494, "logps/rejected": -2.5818140506744385, "loss": 0.74, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9166128635406494, "rewards/margins": 0.6652010083198547, "rewards/rejected": -2.5818140506744385, "sft_loss": 1.9522733688354492, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 10.408701831951982, "learning_rate": 9.806819557298295e-07, "logits/chosen": -0.27124810218811035, "logits/rejected": -0.12528440356254578, "logps/chosen": -1.7751926183700562, "logps/rejected": -2.6433446407318115, "loss": 0.6761, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7751926183700562, "rewards/margins": 0.8681520223617554, "rewards/rejected": -2.6433446407318115, "sft_loss": 1.8149782419204712, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 9.351815707751856, "learning_rate": 9.76301220699656e-07, "logits/chosen": -0.2165641337633133, "logits/rejected": -0.04734504967927933, "logps/chosen": -1.7876564264297485, "logps/rejected": -2.685206651687622, "loss": 0.6636, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7876564264297485, "rewards/margins": 0.8975504040718079, "rewards/rejected": -2.685206651687622, "sft_loss": 1.7731033563613892, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 9.844648724419237, "learning_rate": 9.719255665973365e-07, "logits/chosen": -0.26721853017807007, "logits/rejected": -0.05507396534085274, "logps/chosen": -1.7108303308486938, "logps/rejected": -2.5796587467193604, "loss": 0.6972, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7108303308486938, "rewards/margins": 0.8688281774520874, "rewards/rejected": -2.5796587467193604, "sft_loss": 1.7844226360321045, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 10.436179068836465, "learning_rate": 9.675550358754857e-07, "logits/chosen": -0.21287234127521515, "logits/rejected": -0.07435743510723114, "logps/chosen": -1.6167386770248413, "logps/rejected": -2.557102680206299, "loss": 0.6452, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6167386770248413, "rewards/margins": 0.940363883972168, "rewards/rejected": -2.557102680206299, "sft_loss": 1.6063480377197266, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 8.699087438372885, "learning_rate": 9.631896709370124e-07, "logits/chosen": -0.2651059627532959, "logits/rejected": -0.07905270159244537, "logps/chosen": -1.6744390726089478, "logps/rejected": -2.8000216484069824, "loss": 0.5949, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6744390726089478, "rewards/margins": 1.1255824565887451, "rewards/rejected": -2.8000216484069824, "sft_loss": 1.7917518615722656, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 9.416539046803988, "learning_rate": 9.588295141347055e-07, "logits/chosen": -0.23763029277324677, "logits/rejected": -0.04285631328821182, "logps/chosen": -1.8838889598846436, "logps/rejected": -2.9585506916046143, "loss": 0.6696, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8838889598846436, "rewards/margins": 1.0746616125106812, "rewards/rejected": -2.9585506916046143, "sft_loss": 1.905796766281128, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 9.816601872866762, "learning_rate": 9.544746077708263e-07, "logits/chosen": -0.23911122977733612, "logits/rejected": -0.06813491880893707, "logps/chosen": -1.5476216077804565, "logps/rejected": -2.4001009464263916, "loss": 0.6338, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5476216077804565, "rewards/margins": 0.8524792790412903, "rewards/rejected": -2.4001009464263916, "sft_loss": 1.5902055501937866, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 13.803323828800195, "learning_rate": 9.50124994096695e-07, "logits/chosen": -0.24263215065002441, "logits/rejected": -0.1326196938753128, "logps/chosen": -1.6562252044677734, "logps/rejected": -2.5973217487335205, "loss": 0.6646, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6562252044677734, "rewards/margins": 0.9410964250564575, "rewards/rejected": -2.5973217487335205, "sft_loss": 1.7516815662384033, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 11.91395105204308, "learning_rate": 9.457807153122826e-07, "logits/chosen": -0.23275959491729736, "logits/rejected": 0.011137251742184162, "logps/chosen": -1.6774663925170898, "logps/rejected": -2.5975823402404785, "loss": 0.6629, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6774663925170898, "rewards/margins": 0.9201158285140991, "rewards/rejected": -2.5975823402404785, "sft_loss": 1.6803147792816162, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 12.148506432063686, "learning_rate": 9.41441813565801e-07, "logits/chosen": -0.1868276298046112, "logits/rejected": -0.0998372808098793, "logps/chosen": -1.7477929592132568, "logps/rejected": -2.61071515083313, "loss": 0.6853, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7477929592132568, "rewards/margins": 0.8629220724105835, "rewards/rejected": -2.61071515083313, "sft_loss": 1.8512369394302368, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 10.420336382904948, "learning_rate": 9.371083309532938e-07, "logits/chosen": -0.1475117802619934, "logits/rejected": 0.02056516334414482, "logps/chosen": -1.6154321432113647, "logps/rejected": -2.4313552379608154, "loss": 0.6518, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6154321432113647, "rewards/margins": 0.8159233331680298, "rewards/rejected": -2.4313552379608154, "sft_loss": 1.6552836894989014, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 9.122500210468697, "learning_rate": 9.327803095182284e-07, "logits/chosen": -0.23152145743370056, "logits/rejected": -0.10704489797353745, "logps/chosen": -1.7144361734390259, "logps/rejected": -2.6252634525299072, "loss": 0.6542, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7144361734390259, "rewards/margins": 0.9108272790908813, "rewards/rejected": -2.6252634525299072, "sft_loss": 1.714613676071167, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 12.112549897952343, "learning_rate": 9.28457791251088e-07, "logits/chosen": -0.024995137006044388, "logits/rejected": 0.034917961806058884, "logps/chosen": -1.7349681854248047, "logps/rejected": -2.5234830379486084, "loss": 0.7018, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7349681854248047, "rewards/margins": 0.7885150909423828, "rewards/rejected": -2.5234830379486084, "sft_loss": 1.8175227642059326, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 12.249636751085148, "learning_rate": 9.241408180889638e-07, "logits/chosen": -0.16720399260520935, "logits/rejected": -0.0917845070362091, "logps/chosen": -1.706947922706604, "logps/rejected": -2.6415350437164307, "loss": 0.6606, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.706947922706604, "rewards/margins": 0.9345870018005371, "rewards/rejected": -2.6415350437164307, "sft_loss": 1.7958767414093018, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 7.905937307037574, "learning_rate": 9.198294319151478e-07, "logits/chosen": -0.210923433303833, "logits/rejected": -0.0747196301817894, "logps/chosen": -1.6869720220565796, "logps/rejected": -2.444392681121826, "loss": 0.6758, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6869720220565796, "rewards/margins": 0.7574207186698914, "rewards/rejected": -2.444392681121826, "sft_loss": 1.7059062719345093, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 10.154743185918921, "learning_rate": 9.155236745587279e-07, "logits/chosen": -0.2889309823513031, "logits/rejected": -0.17309871315956116, "logps/chosen": -1.7106233835220337, "logps/rejected": -2.653170347213745, "loss": 0.6423, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7106233835220337, "rewards/margins": 0.9425467252731323, "rewards/rejected": -2.653170347213745, "sft_loss": 1.8067913055419922, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 8.991117835212687, "learning_rate": 9.112235877941808e-07, "logits/chosen": -0.20675882697105408, "logits/rejected": -0.013511359691619873, "logps/chosen": -1.6930301189422607, "logps/rejected": -2.5471932888031006, "loss": 0.645, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6930301189422607, "rewards/margins": 0.8541630506515503, "rewards/rejected": -2.5471932888031006, "sft_loss": 1.7349340915679932, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 12.274052101700097, "learning_rate": 9.069292133409672e-07, "logits/chosen": -0.12488467991352081, "logits/rejected": -0.014732986688613892, "logps/chosen": -1.7833898067474365, "logps/rejected": -2.6352057456970215, "loss": 0.6967, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7833898067474365, "rewards/margins": 0.8518158197402954, "rewards/rejected": -2.6352057456970215, "sft_loss": 1.7981497049331665, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 9.607873466517391, "learning_rate": 9.026405928631269e-07, "logits/chosen": -0.1588951051235199, "logits/rejected": -0.09040139615535736, "logps/chosen": -1.7691665887832642, "logps/rejected": -2.666560649871826, "loss": 0.64, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7691665887832642, "rewards/margins": 0.8973941802978516, "rewards/rejected": -2.666560649871826, "sft_loss": 1.7730439901351929, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 9.407713788560867, "learning_rate": 8.983577679688745e-07, "logits/chosen": -0.16986750066280365, "logits/rejected": -0.04225274175405502, "logps/chosen": -1.6801502704620361, "logps/rejected": -2.897521495819092, "loss": 0.5563, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6801502704620361, "rewards/margins": 1.2173711061477661, "rewards/rejected": -2.897521495819092, "sft_loss": 1.722196340560913, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 7.194690814469289, "learning_rate": 8.940807802101961e-07, "logits/chosen": -0.26174548268318176, "logits/rejected": -0.11660508811473846, "logps/chosen": -1.5555036067962646, "logps/rejected": -2.8029425144195557, "loss": 0.5484, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5555036067962646, "rewards/margins": 1.2474387884140015, "rewards/rejected": -2.8029425144195557, "sft_loss": 1.6362333297729492, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 10.458112251228574, "learning_rate": 8.898096710824455e-07, "logits/chosen": -0.23806282877922058, "logits/rejected": -0.06314180046319962, "logps/chosen": -1.655256986618042, "logps/rejected": -2.9726920127868652, "loss": 0.5568, "rewards/accuracies": 0.875, "rewards/chosen": -1.655256986618042, "rewards/margins": 1.3174351453781128, "rewards/rejected": -2.9726920127868652, "sft_loss": 1.7911889553070068, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 8.243695700196932, "learning_rate": 8.855444820239421e-07, "logits/chosen": -0.316383421421051, "logits/rejected": -0.25597572326660156, "logps/chosen": -1.7021299600601196, "logps/rejected": -3.0542609691619873, "loss": 0.5715, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7021299600601196, "rewards/margins": 1.3521312475204468, "rewards/rejected": -3.0542609691619873, "sft_loss": 1.8004076480865479, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 11.021054218487246, "learning_rate": 8.812852544155691e-07, "logits/chosen": -0.2360994815826416, "logits/rejected": 0.059898506850004196, "logps/chosen": -1.7925751209259033, "logps/rejected": -3.2021727561950684, "loss": 0.5503, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7925751209259033, "rewards/margins": 1.409597635269165, "rewards/rejected": -3.2021727561950684, "sft_loss": 1.8459718227386475, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 8.89962898665759, "learning_rate": 8.770320295803714e-07, "logits/chosen": -0.297813355922699, "logits/rejected": -0.03503280505537987, "logps/chosen": -1.6437908411026, "logps/rejected": -3.228015899658203, "loss": 0.5184, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6437908411026, "rewards/margins": 1.5842252969741821, "rewards/rejected": -3.228015899658203, "sft_loss": 1.7362667322158813, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 9.527797834634185, "learning_rate": 8.727848487831545e-07, "logits/chosen": -0.20444516837596893, "logits/rejected": -0.15804891288280487, "logps/chosen": -1.7272441387176514, "logps/rejected": -3.051990270614624, "loss": 0.5538, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7272441387176514, "rewards/margins": 1.3247458934783936, "rewards/rejected": -3.051990270614624, "sft_loss": 1.751915693283081, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 7.461805123404374, "learning_rate": 8.685437532300863e-07, "logits/chosen": -0.16066284477710724, "logits/rejected": -0.12323828786611557, "logps/chosen": -1.714807152748108, "logps/rejected": -3.0106585025787354, "loss": 0.5737, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.714807152748108, "rewards/margins": 1.2958513498306274, "rewards/rejected": -3.0106585025787354, "sft_loss": 1.7871978282928467, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 13.861265215219532, "learning_rate": 8.64308784068293e-07, "logits/chosen": -0.22177617251873016, "logits/rejected": 0.01492443960160017, "logps/chosen": -1.7709490060806274, "logps/rejected": -3.111344575881958, "loss": 0.5605, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7709490060806274, "rewards/margins": 1.340395212173462, "rewards/rejected": -3.111344575881958, "sft_loss": 1.8111011981964111, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 8.161077241804115, "learning_rate": 8.600799823854655e-07, "logits/chosen": -0.27392780780792236, "logits/rejected": -0.025870636105537415, "logps/chosen": -1.693743348121643, "logps/rejected": -3.0894863605499268, "loss": 0.546, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.693743348121643, "rewards/margins": 1.3957430124282837, "rewards/rejected": -3.0894863605499268, "sft_loss": 1.7753608226776123, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 13.150982071143991, "learning_rate": 8.558573892094547e-07, "logits/chosen": -0.11018764972686768, "logits/rejected": -0.09104446321725845, "logps/chosen": -1.6371879577636719, "logps/rejected": -2.8337721824645996, "loss": 0.5863, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6371879577636719, "rewards/margins": 1.1965839862823486, "rewards/rejected": -2.8337721824645996, "sft_loss": 1.7764679193496704, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 11.446826811574992, "learning_rate": 8.516410455078793e-07, "logits/chosen": -0.18594589829444885, "logits/rejected": 0.011191355995833874, "logps/chosen": -1.7263634204864502, "logps/rejected": -3.07102632522583, "loss": 0.5725, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7263634204864502, "rewards/margins": 1.3446629047393799, "rewards/rejected": -3.07102632522583, "sft_loss": 1.8603509664535522, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 10.537343367924715, "learning_rate": 8.474309921877238e-07, "logits/chosen": -0.16531623899936676, "logits/rejected": 0.026289869099855423, "logps/chosen": -1.611098051071167, "logps/rejected": -2.9250524044036865, "loss": 0.5398, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.611098051071167, "rewards/margins": 1.3139547109603882, "rewards/rejected": -2.9250524044036865, "sft_loss": 1.710699439048767, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 10.748584753986586, "learning_rate": 8.432272700949452e-07, "logits/chosen": -0.009632068686187267, "logits/rejected": 0.08833660930395126, "logps/chosen": -1.725791573524475, "logps/rejected": -3.4090161323547363, "loss": 0.47, "rewards/accuracies": 0.9375, "rewards/chosen": -1.725791573524475, "rewards/margins": 1.6832244396209717, "rewards/rejected": -3.4090161323547363, "sft_loss": 1.6864168643951416, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 10.02228452837305, "learning_rate": 8.390299200140712e-07, "logits/chosen": -0.29934918880462646, "logits/rejected": -0.11233566701412201, "logps/chosen": -1.8083289861679077, "logps/rejected": -3.242682695388794, "loss": 0.544, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8083289861679077, "rewards/margins": 1.4343535900115967, "rewards/rejected": -3.242682695388794, "sft_loss": 1.780664086341858, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 12.221367003206197, "learning_rate": 8.348389826678129e-07, "logits/chosen": -0.32494932413101196, "logits/rejected": 0.01025162823498249, "logps/chosen": -1.8250830173492432, "logps/rejected": -3.383464813232422, "loss": 0.5409, "rewards/accuracies": 0.875, "rewards/chosen": -1.8250830173492432, "rewards/margins": 1.5583815574645996, "rewards/rejected": -3.383464813232422, "sft_loss": 1.8907978534698486, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 8.50218965044455, "learning_rate": 8.306544987166615e-07, "logits/chosen": -0.23265402019023895, "logits/rejected": -0.1387948989868164, "logps/chosen": -1.766627311706543, "logps/rejected": -3.334089756011963, "loss": 0.5369, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.766627311706543, "rewards/margins": 1.56746244430542, "rewards/rejected": -3.334089756011963, "sft_loss": 1.8673334121704102, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 16.15377170158126, "learning_rate": 8.264765087584998e-07, "logits/chosen": -0.34037113189697266, "logits/rejected": -0.09007980674505234, "logps/chosen": -1.9173988103866577, "logps/rejected": -3.484591007232666, "loss": 0.5551, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9173988103866577, "rewards/margins": 1.5671921968460083, "rewards/rejected": -3.484591007232666, "sft_loss": 1.9438207149505615, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 9.822403017507577, "learning_rate": 8.223050533282033e-07, "logits/chosen": -0.18690574169158936, "logits/rejected": 0.07956047356128693, "logps/chosen": -1.7812955379486084, "logps/rejected": -3.284822940826416, "loss": 0.5383, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7812955379486084, "rewards/margins": 1.5035276412963867, "rewards/rejected": -3.284822940826416, "sft_loss": 1.862097144126892, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 9.613539376759954, "learning_rate": 8.181401728972522e-07, "logits/chosen": -0.1325015127658844, "logits/rejected": 0.08524034917354584, "logps/chosen": -1.7051302194595337, "logps/rejected": -3.2468390464782715, "loss": 0.5278, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7051302194595337, "rewards/margins": 1.5417091846466064, "rewards/rejected": -3.2468390464782715, "sft_loss": 1.7802455425262451, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 8.044388765783284, "learning_rate": 8.139819078733338e-07, "logits/chosen": -0.37993818521499634, "logits/rejected": -0.01248829998075962, "logps/chosen": -1.835631012916565, "logps/rejected": -3.2788097858428955, "loss": 0.5376, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.835631012916565, "rewards/margins": 1.4431788921356201, "rewards/rejected": -3.2788097858428955, "sft_loss": 1.8542535305023193, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 10.448158474118653, "learning_rate": 8.098302985999547e-07, "logits/chosen": -0.2758844792842865, "logits/rejected": 0.0281488336622715, "logps/chosen": -1.7699769735336304, "logps/rejected": -3.043705463409424, "loss": 0.5792, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7699769735336304, "rewards/margins": 1.273728609085083, "rewards/rejected": -3.043705463409424, "sft_loss": 1.835963487625122, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 8.289442766496714, "learning_rate": 8.056853853560447e-07, "logits/chosen": -0.18996943533420563, "logits/rejected": 0.1399824470281601, "logps/chosen": -1.731579065322876, "logps/rejected": -3.3539230823516846, "loss": 0.5245, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.731579065322876, "rewards/margins": 1.6223437786102295, "rewards/rejected": -3.3539230823516846, "sft_loss": 1.7560529708862305, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 9.080199903118245, "learning_rate": 8.015472083555717e-07, "logits/chosen": -0.12962546944618225, "logits/rejected": 0.12912937998771667, "logps/chosen": -1.6572837829589844, "logps/rejected": -3.1103360652923584, "loss": 0.5277, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6572837829589844, "rewards/margins": 1.453052282333374, "rewards/rejected": -3.1103360652923584, "sft_loss": 1.67610764503479, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 13.440542364143138, "learning_rate": 7.974158077471461e-07, "logits/chosen": -0.31929659843444824, "logits/rejected": -0.05511760711669922, "logps/chosen": -1.7283598184585571, "logps/rejected": -3.248244524002075, "loss": 0.5193, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7283598184585571, "rewards/margins": 1.5198849439620972, "rewards/rejected": -3.248244524002075, "sft_loss": 1.7844501733779907, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 10.580595747927207, "learning_rate": 7.932912236136356e-07, "logits/chosen": -0.2645584046840668, "logits/rejected": -0.1220647320151329, "logps/chosen": -1.6255719661712646, "logps/rejected": -3.0554234981536865, "loss": 0.5271, "rewards/accuracies": 0.875, "rewards/chosen": -1.6255719661712646, "rewards/margins": 1.429851770401001, "rewards/rejected": -3.0554234981536865, "sft_loss": 1.7420114278793335, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 9.715742112103827, "learning_rate": 7.891734959717726e-07, "logits/chosen": -0.1833140105009079, "logits/rejected": -0.018009066581726074, "logps/chosen": -1.8586372137069702, "logps/rejected": -3.337434768676758, "loss": 0.5519, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8586372137069702, "rewards/margins": 1.478797197341919, "rewards/rejected": -3.337434768676758, "sft_loss": 1.9200775623321533, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 10.608522009342899, "learning_rate": 7.850626647717698e-07, "logits/chosen": -0.24772553145885468, "logits/rejected": 0.03280012682080269, "logps/chosen": -1.644805908203125, "logps/rejected": -3.2546234130859375, "loss": 0.5095, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.644805908203125, "rewards/margins": 1.6098175048828125, "rewards/rejected": -3.2546234130859375, "sft_loss": 1.6860408782958984, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 7.856253604131477, "learning_rate": 7.809587698969282e-07, "logits/chosen": -0.22330811619758606, "logits/rejected": -0.01864556595683098, "logps/chosen": -1.6774189472198486, "logps/rejected": -3.3011314868927, "loss": 0.5318, "rewards/accuracies": 0.875, "rewards/chosen": -1.6774189472198486, "rewards/margins": 1.6237128973007202, "rewards/rejected": -3.3011314868927, "sft_loss": 1.7449238300323486, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 11.906725492881082, "learning_rate": 7.768618511632555e-07, "logits/chosen": -0.09854897111654282, "logits/rejected": 0.05596822500228882, "logps/chosen": -1.7890571355819702, "logps/rejected": -3.2758407592773438, "loss": 0.601, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7890571355819702, "rewards/margins": 1.4867838621139526, "rewards/rejected": -3.2758407592773438, "sft_loss": 1.907470703125, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 11.247823200509208, "learning_rate": 7.727719483190737e-07, "logits/chosen": -0.2568055987358093, "logits/rejected": 0.07877197861671448, "logps/chosen": -1.799020528793335, "logps/rejected": -3.212989091873169, "loss": 0.5964, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.799020528793335, "rewards/margins": 1.4139684438705444, "rewards/rejected": -3.212989091873169, "sft_loss": 1.7767369747161865, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 11.87217275292588, "learning_rate": 7.686891010446394e-07, "logits/chosen": -0.11951699107885361, "logits/rejected": -0.07706291973590851, "logps/chosen": -1.7854347229003906, "logps/rejected": -3.206778049468994, "loss": 0.5454, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7854347229003906, "rewards/margins": 1.421343445777893, "rewards/rejected": -3.206778049468994, "sft_loss": 1.8132820129394531, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 10.066601523454645, "learning_rate": 7.646133489517535e-07, "logits/chosen": -0.15235088765621185, "logits/rejected": -0.025425296276807785, "logps/chosen": -1.7517459392547607, "logps/rejected": -3.148427963256836, "loss": 0.5729, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7517459392547607, "rewards/margins": 1.3966816663742065, "rewards/rejected": -3.148427963256836, "sft_loss": 1.758193016052246, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 8.791899762296065, "learning_rate": 7.605447315833821e-07, "logits/chosen": -0.12592525780200958, "logits/rejected": 0.06427010893821716, "logps/chosen": -1.6173303127288818, "logps/rejected": -2.993237257003784, "loss": 0.5463, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6173303127288818, "rewards/margins": 1.3759069442749023, "rewards/rejected": -2.993237257003784, "sft_loss": 1.674353003501892, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 12.42157001012568, "learning_rate": 7.564832884132672e-07, "logits/chosen": -0.22605355083942413, "logits/rejected": 0.006971999071538448, "logps/chosen": -1.7891266345977783, "logps/rejected": -3.117318868637085, "loss": 0.5928, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7891266345977783, "rewards/margins": 1.328192114830017, "rewards/rejected": -3.117318868637085, "sft_loss": 1.857444167137146, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 12.580124209491172, "learning_rate": 7.524290588455499e-07, "logits/chosen": -0.16494357585906982, "logits/rejected": 0.06749799102544785, "logps/chosen": -1.7576053142547607, "logps/rejected": -3.490067720413208, "loss": 0.539, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7576053142547607, "rewards/margins": 1.7324621677398682, "rewards/rejected": -3.490067720413208, "sft_loss": 1.7450393438339233, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 9.97708561203968, "learning_rate": 7.483820822143816e-07, "logits/chosen": -0.18638131022453308, "logits/rejected": -0.03988290950655937, "logps/chosen": -1.6317148208618164, "logps/rejected": -3.1146957874298096, "loss": 0.5352, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6317148208618164, "rewards/margins": 1.4829809665679932, "rewards/rejected": -3.1146957874298096, "sft_loss": 1.704087257385254, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 9.443485900309117, "learning_rate": 7.443423977835487e-07, "logits/chosen": -0.3203321099281311, "logits/rejected": -0.032503314316272736, "logps/chosen": -1.73452627658844, "logps/rejected": -3.3747222423553467, "loss": 0.5313, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.73452627658844, "rewards/margins": 1.6401962041854858, "rewards/rejected": -3.3747222423553467, "sft_loss": 1.7296106815338135, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 9.112141025009626, "learning_rate": 7.403100447460861e-07, "logits/chosen": -0.20505361258983612, "logits/rejected": -0.07048022001981735, "logps/chosen": -1.780599594116211, "logps/rejected": -3.380220413208008, "loss": 0.55, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.780599594116211, "rewards/margins": 1.5996206998825073, "rewards/rejected": -3.380220413208008, "sft_loss": 1.783935785293579, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 14.024169574717062, "learning_rate": 7.36285062223902e-07, "logits/chosen": -0.1989278346300125, "logits/rejected": -0.06960990279912949, "logps/chosen": -1.6781647205352783, "logps/rejected": -3.392726182937622, "loss": 0.5032, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6781647205352783, "rewards/margins": 1.7145617008209229, "rewards/rejected": -3.392726182937622, "sft_loss": 1.6347157955169678, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 10.594240097208568, "learning_rate": 7.322674892673931e-07, "logits/chosen": -0.23259444534778595, "logits/rejected": 0.062645822763443, "logps/chosen": -1.7942231893539429, "logps/rejected": -3.0695948600769043, "loss": 0.6059, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7942231893539429, "rewards/margins": 1.275371789932251, "rewards/rejected": -3.0695948600769043, "sft_loss": 1.895446538925171, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 11.512803044164448, "learning_rate": 7.282573648550709e-07, "logits/chosen": -0.09428633749485016, "logits/rejected": 0.13378095626831055, "logps/chosen": -1.7537353038787842, "logps/rejected": -3.412146806716919, "loss": 0.5365, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7537353038787842, "rewards/margins": 1.6584113836288452, "rewards/rejected": -3.412146806716919, "sft_loss": 1.7974199056625366, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 10.016323025375353, "learning_rate": 7.242547278931792e-07, "logits/chosen": -0.2842410206794739, "logits/rejected": -0.19029872119426727, "logps/chosen": -1.8298816680908203, "logps/rejected": -3.660538911819458, "loss": 0.5288, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.8298816680908203, "rewards/margins": 1.830657720565796, "rewards/rejected": -3.660538911819458, "sft_loss": 1.886203408241272, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 9.785118481954505, "learning_rate": 7.202596172153203e-07, "logits/chosen": -0.16824455559253693, "logits/rejected": -0.028798962011933327, "logps/chosen": -1.769097089767456, "logps/rejected": -3.587766647338867, "loss": 0.5097, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.769097089767456, "rewards/margins": 1.818669319152832, "rewards/rejected": -3.587766647338867, "sft_loss": 1.8450851440429688, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 9.842023108246895, "learning_rate": 7.162720715820742e-07, "logits/chosen": -0.19046644866466522, "logits/rejected": -0.0008801884832791984, "logps/chosen": -1.7467609643936157, "logps/rejected": -3.541994571685791, "loss": 0.5272, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7467609643936157, "rewards/margins": 1.7952334880828857, "rewards/rejected": -3.541994571685791, "sft_loss": 1.8788368701934814, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 10.73183612635801, "learning_rate": 7.122921296806278e-07, "logits/chosen": -0.1647910624742508, "logits/rejected": -0.01739250309765339, "logps/chosen": -1.7866592407226562, "logps/rejected": -3.479426145553589, "loss": 0.5402, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7866592407226562, "rewards/margins": 1.6927671432495117, "rewards/rejected": -3.479426145553589, "sft_loss": 1.9050674438476562, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 11.075635255908587, "learning_rate": 7.083198301243937e-07, "logits/chosen": -0.13357248902320862, "logits/rejected": 0.06032276153564453, "logps/chosen": -1.6821733713150024, "logps/rejected": -3.0180554389953613, "loss": 0.5297, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6821733713150024, "rewards/margins": 1.3358821868896484, "rewards/rejected": -3.0180554389953613, "sft_loss": 1.7494080066680908, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 9.903602734776914, "learning_rate": 7.043552114526395e-07, "logits/chosen": -0.20195958018302917, "logits/rejected": -0.04661129415035248, "logps/chosen": -1.648164987564087, "logps/rejected": -3.37589693069458, "loss": 0.5143, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.648164987564087, "rewards/margins": 1.7277319431304932, "rewards/rejected": -3.37589693069458, "sft_loss": 1.8039286136627197, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 14.3969385350356, "learning_rate": 7.003983121301139e-07, "logits/chosen": -0.30059370398521423, "logits/rejected": -0.1018049344420433, "logps/chosen": -1.7925945520401, "logps/rejected": -3.5357577800750732, "loss": 0.5136, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7925945520401, "rewards/margins": 1.7431633472442627, "rewards/rejected": -3.5357577800750732, "sft_loss": 1.8840261697769165, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 12.362747827933322, "learning_rate": 6.964491705466704e-07, "logits/chosen": -0.30404019355773926, "logits/rejected": -0.06907346099615097, "logps/chosen": -1.7132093906402588, "logps/rejected": -3.352640151977539, "loss": 0.5358, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7132093906402588, "rewards/margins": 1.6394307613372803, "rewards/rejected": -3.352640151977539, "sft_loss": 1.7619565725326538, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 11.567400767912353, "learning_rate": 6.92507825016899e-07, "logits/chosen": -0.3080520033836365, "logits/rejected": 0.12285199016332626, "logps/chosen": -1.803656816482544, "logps/rejected": -3.43890643119812, "loss": 0.549, "rewards/accuracies": 0.875, "rewards/chosen": -1.803656816482544, "rewards/margins": 1.6352497339248657, "rewards/rejected": -3.43890643119812, "sft_loss": 1.87128484249115, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 8.353058020832075, "learning_rate": 6.885743137797502e-07, "logits/chosen": -0.125919371843338, "logits/rejected": 0.018805870786309242, "logps/chosen": -1.6860202550888062, "logps/rejected": -3.4220077991485596, "loss": 0.5342, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6860202550888062, "rewards/margins": 1.735987901687622, "rewards/rejected": -3.4220077991485596, "sft_loss": 1.7403955459594727, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.4547114670276642, "eval_logits/rejected": 0.6009575128555298, "eval_logps/chosen": -1.995704174041748, "eval_logps/rejected": -2.9729912281036377, "eval_loss": 0.7365527153015137, "eval_rewards/accuracies": 0.6965875625610352, "eval_rewards/chosen": -1.995704174041748, "eval_rewards/margins": 0.9772871136665344, "eval_rewards/rejected": -2.9729912281036377, "eval_runtime": 50.3486, "eval_samples_per_second": 26.714, "eval_sft_loss": 1.9275230169296265, "eval_steps_per_second": 6.693, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 9.355836024773891, "learning_rate": 6.846486749981684e-07, "logits/chosen": -0.13395535945892334, "logits/rejected": 0.21117381751537323, "logps/chosen": -1.8070415258407593, "logps/rejected": -3.1795265674591064, "loss": 0.5625, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8070415258407593, "rewards/margins": 1.372484803199768, "rewards/rejected": -3.1795265674591064, "sft_loss": 1.7828906774520874, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 8.776826626684906, "learning_rate": 6.807309467587173e-07, "logits/chosen": -0.1883133351802826, "logits/rejected": -0.03017752803862095, "logps/chosen": -1.6705434322357178, "logps/rejected": -3.0100858211517334, "loss": 0.5629, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6705434322357178, "rewards/margins": 1.3395423889160156, "rewards/rejected": -3.0100858211517334, "sft_loss": 1.7361271381378174, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 13.296893227824357, "learning_rate": 6.768211670712146e-07, "logits/chosen": -0.17326074838638306, "logits/rejected": 0.22264091670513153, "logps/chosen": -1.7377557754516602, "logps/rejected": -3.068744659423828, "loss": 0.5836, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7377557754516602, "rewards/margins": 1.3309885263442993, "rewards/rejected": -3.068744659423828, "sft_loss": 1.7873904705047607, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 10.847948103374218, "learning_rate": 6.729193738683589e-07, "logits/chosen": -0.2886696457862854, "logits/rejected": -0.07660894840955734, "logps/chosen": -1.813215970993042, "logps/rejected": -3.3827805519104004, "loss": 0.5688, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.813215970993042, "rewards/margins": 1.5695644617080688, "rewards/rejected": -3.3827805519104004, "sft_loss": 1.8843132257461548, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 10.189727666685554, "learning_rate": 6.690256050053652e-07, "logits/chosen": -0.16914470493793488, "logits/rejected": 0.016266096383333206, "logps/chosen": -1.7264938354492188, "logps/rejected": -3.296316146850586, "loss": 0.5369, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7264938354492188, "rewards/margins": 1.5698221921920776, "rewards/rejected": -3.296316146850586, "sft_loss": 1.772139549255371, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 11.467675187382033, "learning_rate": 6.651398982595967e-07, "logits/chosen": -0.220795676112175, "logits/rejected": -0.10813482105731964, "logps/chosen": -1.7274131774902344, "logps/rejected": -3.434593677520752, "loss": 0.5267, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7274131774902344, "rewards/margins": 1.7071806192398071, "rewards/rejected": -3.434593677520752, "sft_loss": 1.786871314048767, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 10.839498118605151, "learning_rate": 6.612622913301961e-07, "logits/chosen": -0.12224410474300385, "logits/rejected": -0.11186661571264267, "logps/chosen": -1.654087781906128, "logps/rejected": -2.9117302894592285, "loss": 0.5797, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.654087781906128, "rewards/margins": 1.257642149925232, "rewards/rejected": -2.9117302894592285, "sft_loss": 1.757502794265747, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 20.997272379592218, "learning_rate": 6.573928218377243e-07, "logits/chosen": -0.1622733771800995, "logits/rejected": -0.13422775268554688, "logps/chosen": -1.6278194189071655, "logps/rejected": -3.17354154586792, "loss": 0.5248, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.6278194189071655, "rewards/margins": 1.5457221269607544, "rewards/rejected": -3.17354154586792, "sft_loss": 1.6407041549682617, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 11.928275021931617, "learning_rate": 6.5353152732379e-07, "logits/chosen": -0.12512122094631195, "logits/rejected": 0.0744471549987793, "logps/chosen": -1.7665351629257202, "logps/rejected": -3.165983200073242, "loss": 0.5881, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7665351629257202, "rewards/margins": 1.3994477987289429, "rewards/rejected": -3.165983200073242, "sft_loss": 1.835588812828064, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 15.500424986031952, "learning_rate": 6.496784452506907e-07, "logits/chosen": -0.2709681987762451, "logits/rejected": -0.051780473440885544, "logps/chosen": -1.868085503578186, "logps/rejected": -3.219057083129883, "loss": 0.6104, "rewards/accuracies": 0.8125, "rewards/chosen": -1.868085503578186, "rewards/margins": 1.3509716987609863, "rewards/rejected": -3.219057083129883, "sft_loss": 1.995994210243225, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 12.45975798521115, "learning_rate": 6.458336130010442e-07, "logits/chosen": -0.08536555618047714, "logits/rejected": 0.01686207577586174, "logps/chosen": -1.7968616485595703, "logps/rejected": -2.991499662399292, "loss": 0.5666, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7968616485595703, "rewards/margins": 1.1946380138397217, "rewards/rejected": -2.991499662399292, "sft_loss": 1.8622022867202759, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 10.50043219283165, "learning_rate": 6.419970678774312e-07, "logits/chosen": -0.09494493901729584, "logits/rejected": 0.17411960661411285, "logps/chosen": -1.7018855810165405, "logps/rejected": -3.1860663890838623, "loss": 0.5551, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7018855810165405, "rewards/margins": 1.4841810464859009, "rewards/rejected": -3.1860663890838623, "sft_loss": 1.834429383277893, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 13.040927123294718, "learning_rate": 6.381688471020282e-07, "logits/chosen": -0.1964532434940338, "logits/rejected": -0.075645811855793, "logps/chosen": -1.725067138671875, "logps/rejected": -3.4409821033477783, "loss": 0.5163, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.725067138671875, "rewards/margins": 1.7159149646759033, "rewards/rejected": -3.4409821033477783, "sft_loss": 1.7923214435577393, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 10.862096898834187, "learning_rate": 6.34348987816251e-07, "logits/chosen": -0.09906373918056488, "logits/rejected": 0.2573404610157013, "logps/chosen": -1.6807048320770264, "logps/rejected": -3.358582019805908, "loss": 0.5648, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6807048320770264, "rewards/margins": 1.6778767108917236, "rewards/rejected": -3.358582019805908, "sft_loss": 1.8171613216400146, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 9.631800757457958, "learning_rate": 6.3053752708039e-07, "logits/chosen": -0.15538927912712097, "logits/rejected": 0.13674817979335785, "logps/chosen": -1.7523047924041748, "logps/rejected": -3.158905506134033, "loss": 0.5498, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7523047924041748, "rewards/margins": 1.4066009521484375, "rewards/rejected": -3.158905506134033, "sft_loss": 1.7659852504730225, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 10.829533507448382, "learning_rate": 6.267345018732552e-07, "logits/chosen": -0.17978882789611816, "logits/rejected": 0.055534325540065765, "logps/chosen": -1.9052941799163818, "logps/rejected": -3.4997200965881348, "loss": 0.5773, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9052941799163818, "rewards/margins": 1.5944254398345947, "rewards/rejected": -3.4997200965881348, "sft_loss": 1.9153798818588257, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 10.35769049326784, "learning_rate": 6.229399490918126e-07, "logits/chosen": -0.05187790468335152, "logits/rejected": 0.01336099486798048, "logps/chosen": -1.7664661407470703, "logps/rejected": -3.2258377075195312, "loss": 0.538, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7664661407470703, "rewards/margins": 1.459371566772461, "rewards/rejected": -3.2258377075195312, "sft_loss": 1.7747611999511719, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 13.643349299342804, "learning_rate": 6.19153905550831e-07, "logits/chosen": -0.3395062983036041, "logits/rejected": -0.01758493110537529, "logps/chosen": -1.7421958446502686, "logps/rejected": -3.3181235790252686, "loss": 0.5333, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7421958446502686, "rewards/margins": 1.575927972793579, "rewards/rejected": -3.3181235790252686, "sft_loss": 1.83111572265625, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 12.224809135626256, "learning_rate": 6.153764079825211e-07, "logits/chosen": -0.2562378942966461, "logits/rejected": -0.11864246428012848, "logps/chosen": -1.9074604511260986, "logps/rejected": -3.3651721477508545, "loss": 0.5714, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9074604511260986, "rewards/margins": 1.4577118158340454, "rewards/rejected": -3.3651721477508545, "sft_loss": 1.9094829559326172, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 13.238715949649285, "learning_rate": 6.116074930361803e-07, "logits/chosen": -0.1316947191953659, "logits/rejected": 0.1430431455373764, "logps/chosen": -1.772538185119629, "logps/rejected": -3.622473955154419, "loss": 0.4966, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.772538185119629, "rewards/margins": 1.849935531616211, "rewards/rejected": -3.622473955154419, "sft_loss": 1.9160066843032837, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 13.230339307295868, "learning_rate": 6.078471972778388e-07, "logits/chosen": -0.1815367043018341, "logits/rejected": 0.12857958674430847, "logps/chosen": -2.000633716583252, "logps/rejected": -3.649420976638794, "loss": 0.5479, "rewards/accuracies": 0.84375, "rewards/chosen": -2.000633716583252, "rewards/margins": 1.648787260055542, "rewards/rejected": -3.649420976638794, "sft_loss": 1.9365339279174805, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 23.65904937913438, "learning_rate": 6.040955571899018e-07, "logits/chosen": -0.1651362031698227, "logits/rejected": 0.1419135481119156, "logps/chosen": -1.855017066001892, "logps/rejected": -3.6009979248046875, "loss": 0.5504, "rewards/accuracies": 0.84375, "rewards/chosen": -1.855017066001892, "rewards/margins": 1.7459806203842163, "rewards/rejected": -3.6009979248046875, "sft_loss": 1.9201616048812866, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 12.137879991792099, "learning_rate": 6.003526091707986e-07, "logits/chosen": -0.10553675889968872, "logits/rejected": 0.04338879883289337, "logps/chosen": -1.8180103302001953, "logps/rejected": -3.356022596359253, "loss": 0.5322, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.8180103302001953, "rewards/margins": 1.538012146949768, "rewards/rejected": -3.356022596359253, "sft_loss": 1.8594329357147217, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 10.662598877065893, "learning_rate": 5.966183895346264e-07, "logits/chosen": -0.17935439944267273, "logits/rejected": 0.005321676842868328, "logps/chosen": -1.747471809387207, "logps/rejected": -3.3066420555114746, "loss": 0.5446, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.747471809387207, "rewards/margins": 1.5591704845428467, "rewards/rejected": -3.3066420555114746, "sft_loss": 1.8495490550994873, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 13.506934757683247, "learning_rate": 5.928929345108015e-07, "logits/chosen": -0.24749357998371124, "logits/rejected": 0.08834774792194366, "logps/chosen": -1.7615807056427002, "logps/rejected": -3.527405261993408, "loss": 0.5179, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7615807056427002, "rewards/margins": 1.765824556350708, "rewards/rejected": -3.527405261993408, "sft_loss": 1.8211628198623657, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 11.390020408641043, "learning_rate": 5.891762802437039e-07, "logits/chosen": -0.07019664347171783, "logits/rejected": 0.07711862027645111, "logps/chosen": -1.8021306991577148, "logps/rejected": -3.505963087081909, "loss": 0.5491, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8021306991577148, "rewards/margins": 1.7038323879241943, "rewards/rejected": -3.505963087081909, "sft_loss": 1.889589548110962, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 9.145040462123765, "learning_rate": 5.854684627923306e-07, "logits/chosen": -0.009453767910599709, "logits/rejected": -0.058390479534864426, "logps/chosen": -1.8435354232788086, "logps/rejected": -3.6739730834960938, "loss": 0.587, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8435354232788086, "rewards/margins": 1.830437421798706, "rewards/rejected": -3.6739730834960938, "sft_loss": 1.9032466411590576, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 11.307495342011872, "learning_rate": 5.817695181299418e-07, "logits/chosen": -0.3465738594532013, "logits/rejected": -0.2500944435596466, "logps/chosen": -1.727786660194397, "logps/rejected": -3.165914535522461, "loss": 0.5609, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.727786660194397, "rewards/margins": 1.4381277561187744, "rewards/rejected": -3.165914535522461, "sft_loss": 1.7594287395477295, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 14.285651119113195, "learning_rate": 5.780794821437158e-07, "logits/chosen": 0.0007110525039024651, "logits/rejected": 0.23650093376636505, "logps/chosen": -1.7984707355499268, "logps/rejected": -3.303368330001831, "loss": 0.549, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7984707355499268, "rewards/margins": 1.5048975944519043, "rewards/rejected": -3.303368330001831, "sft_loss": 1.8620703220367432, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 9.87744380658177, "learning_rate": 5.743983906343969e-07, "logits/chosen": -0.16267967224121094, "logits/rejected": 0.022295860573649406, "logps/chosen": -1.6229957342147827, "logps/rejected": -3.110755205154419, "loss": 0.5272, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.6229957342147827, "rewards/margins": 1.4877592325210571, "rewards/rejected": -3.110755205154419, "sft_loss": 1.7183735370635986, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 11.17666651918986, "learning_rate": 5.707262793159521e-07, "logits/chosen": -0.07476671040058136, "logits/rejected": -0.0714937299489975, "logps/chosen": -1.7283849716186523, "logps/rejected": -3.067058563232422, "loss": 0.5599, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7283849716186523, "rewards/margins": 1.3386733531951904, "rewards/rejected": -3.067058563232422, "sft_loss": 1.735907793045044, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 12.837943811048008, "learning_rate": 5.670631838152204e-07, "logits/chosen": -0.15067186951637268, "logits/rejected": 0.055976539850234985, "logps/chosen": -1.7886425256729126, "logps/rejected": -3.221102237701416, "loss": 0.5309, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7886425256729126, "rewards/margins": 1.432459831237793, "rewards/rejected": -3.221102237701416, "sft_loss": 1.831111192703247, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 11.290213521591763, "learning_rate": 5.634091396715716e-07, "logits/chosen": -0.14092954993247986, "logits/rejected": 0.06754940748214722, "logps/chosen": -1.7412573099136353, "logps/rejected": -3.4222846031188965, "loss": 0.5348, "rewards/accuracies": 0.875, "rewards/chosen": -1.7412573099136353, "rewards/margins": 1.6810274124145508, "rewards/rejected": -3.4222846031188965, "sft_loss": 1.834660530090332, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 14.537318801359874, "learning_rate": 5.59764182336557e-07, "logits/chosen": 0.06082998961210251, "logits/rejected": 0.15074776113033295, "logps/chosen": -1.8112719058990479, "logps/rejected": -3.479727268218994, "loss": 0.5349, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8112719058990479, "rewards/margins": 1.668454885482788, "rewards/rejected": -3.479727268218994, "sft_loss": 1.9149051904678345, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 13.515429797572333, "learning_rate": 5.561283471735695e-07, "logits/chosen": -0.1406007707118988, "logits/rejected": -0.0066071366891264915, "logps/chosen": -1.7152063846588135, "logps/rejected": -3.051826000213623, "loss": 0.555, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7152063846588135, "rewards/margins": 1.3366196155548096, "rewards/rejected": -3.051826000213623, "sft_loss": 1.7660102844238281, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 8.484953749577219, "learning_rate": 5.52501669457497e-07, "logits/chosen": -0.23328259587287903, "logits/rejected": 0.12310652434825897, "logps/chosen": -1.715920090675354, "logps/rejected": -3.359726667404175, "loss": 0.5203, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.715920090675354, "rewards/margins": 1.6438068151474, "rewards/rejected": -3.359726667404175, "sft_loss": 1.79128897190094, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 12.362665149402407, "learning_rate": 5.488841843743833e-07, "logits/chosen": -0.16568274796009064, "logits/rejected": -0.1361156404018402, "logps/chosen": -1.6876897811889648, "logps/rejected": -3.312028408050537, "loss": 0.5408, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6876897811889648, "rewards/margins": 1.6243385076522827, "rewards/rejected": -3.312028408050537, "sft_loss": 1.7390552759170532, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 10.878629371047102, "learning_rate": 5.452759270210839e-07, "logits/chosen": 0.02488498017191887, "logits/rejected": 0.15020744502544403, "logps/chosen": -1.7068469524383545, "logps/rejected": -3.4873459339141846, "loss": 0.5362, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7068469524383545, "rewards/margins": 1.7804988622665405, "rewards/rejected": -3.4873459339141846, "sft_loss": 1.7590528726577759, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 12.157426108313896, "learning_rate": 5.416769324049282e-07, "logits/chosen": -0.2870050072669983, "logits/rejected": -0.06136869266629219, "logps/chosen": -1.7316116094589233, "logps/rejected": -3.0164847373962402, "loss": 0.5823, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7316116094589233, "rewards/margins": 1.2848728895187378, "rewards/rejected": -3.0164847373962402, "sft_loss": 1.808619737625122, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 8.79018380961107, "learning_rate": 5.38087235443377e-07, "logits/chosen": 0.06400427967309952, "logits/rejected": 0.05108444765210152, "logps/chosen": -1.8447471857070923, "logps/rejected": -3.415586471557617, "loss": 0.5767, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8447471857070923, "rewards/margins": 1.570839285850525, "rewards/rejected": -3.415586471557617, "sft_loss": 1.8895288705825806, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 14.225038986307883, "learning_rate": 5.345068709636866e-07, "logits/chosen": -0.19258277118206024, "logits/rejected": -0.09260249137878418, "logps/chosen": -1.691738486289978, "logps/rejected": -3.1312484741210938, "loss": 0.5474, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.691738486289978, "rewards/margins": 1.4395099878311157, "rewards/rejected": -3.1312484741210938, "sft_loss": 1.6973931789398193, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 13.30953173328577, "learning_rate": 5.309358737025682e-07, "logits/chosen": -0.1457730382680893, "logits/rejected": 0.04253808781504631, "logps/chosen": -1.786547064781189, "logps/rejected": -3.726893186569214, "loss": 0.5525, "rewards/accuracies": 0.8125, "rewards/chosen": -1.786547064781189, "rewards/margins": 1.9403464794158936, "rewards/rejected": -3.726893186569214, "sft_loss": 1.837989091873169, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 12.139194174292625, "learning_rate": 5.273742783058537e-07, "logits/chosen": -0.09968717396259308, "logits/rejected": 0.1227853074669838, "logps/chosen": -1.7654327154159546, "logps/rejected": -3.4704456329345703, "loss": 0.5415, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7654327154159546, "rewards/margins": 1.7050129175186157, "rewards/rejected": -3.4704456329345703, "sft_loss": 1.7903417348861694, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 9.577959973939867, "learning_rate": 5.23822119328157e-07, "logits/chosen": -0.2234119474887848, "logits/rejected": 0.1251845508813858, "logps/chosen": -1.6994167566299438, "logps/rejected": -3.445323944091797, "loss": 0.5213, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6994167566299438, "rewards/margins": 1.7459074258804321, "rewards/rejected": -3.445323944091797, "sft_loss": 1.7618118524551392, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 8.35456484773565, "learning_rate": 5.202794312325399e-07, "logits/chosen": -0.18785110116004944, "logits/rejected": 0.2039102017879486, "logps/chosen": -1.8394941091537476, "logps/rejected": -3.561155319213867, "loss": 0.5432, "rewards/accuracies": 0.875, "rewards/chosen": -1.8394941091537476, "rewards/margins": 1.7216612100601196, "rewards/rejected": -3.561155319213867, "sft_loss": 1.8366172313690186, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 12.0073634491718, "learning_rate": 5.167462483901773e-07, "logits/chosen": -0.16625425219535828, "logits/rejected": -0.0023239790461957455, "logps/chosen": -1.8147470951080322, "logps/rejected": -3.441155195236206, "loss": 0.5651, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8147470951080322, "rewards/margins": 1.626408576965332, "rewards/rejected": -3.441155195236206, "sft_loss": 1.7933012247085571, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 9.280989355670226, "learning_rate": 5.132226050800256e-07, "logits/chosen": -0.10036615282297134, "logits/rejected": 0.023008223623037338, "logps/chosen": -1.7863056659698486, "logps/rejected": -3.2340240478515625, "loss": 0.5709, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7863056659698486, "rewards/margins": 1.447718620300293, "rewards/rejected": -3.2340240478515625, "sft_loss": 1.8817899227142334, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 9.99760092205392, "learning_rate": 5.097085354884869e-07, "logits/chosen": -0.10582532733678818, "logits/rejected": 0.061595212668180466, "logps/chosen": -1.7237050533294678, "logps/rejected": -3.2690975666046143, "loss": 0.5349, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7237050533294678, "rewards/margins": 1.5453920364379883, "rewards/rejected": -3.2690975666046143, "sft_loss": 1.8347012996673584, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 16.739494504756493, "learning_rate": 5.062040737090806e-07, "logits/chosen": -0.2041359394788742, "logits/rejected": 0.05440496653318405, "logps/chosen": -1.8450266122817993, "logps/rejected": -3.412508487701416, "loss": 0.5655, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8450266122817993, "rewards/margins": 1.5674818754196167, "rewards/rejected": -3.412508487701416, "sft_loss": 1.886196494102478, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 11.652650195235836, "learning_rate": 5.027092537421091e-07, "logits/chosen": -0.13598737120628357, "logits/rejected": 0.16030022501945496, "logps/chosen": -1.831965684890747, "logps/rejected": -3.417522430419922, "loss": 0.5562, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.831965684890747, "rewards/margins": 1.5855567455291748, "rewards/rejected": -3.417522430419922, "sft_loss": 1.8175779581069946, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 17.4581711422509, "learning_rate": 4.992241094943326e-07, "logits/chosen": -0.1738833487033844, "logits/rejected": 0.28060057759284973, "logps/chosen": -1.8021186590194702, "logps/rejected": -3.683699131011963, "loss": 0.5179, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8021186590194702, "rewards/margins": 1.8815805912017822, "rewards/rejected": -3.683699131011963, "sft_loss": 1.8290512561798096, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 13.679028455286472, "learning_rate": 4.957486747786342e-07, "logits/chosen": -0.05840907618403435, "logits/rejected": 0.07495447248220444, "logps/chosen": -1.7225723266601562, "logps/rejected": -3.1784443855285645, "loss": 0.5278, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7225723266601562, "rewards/margins": 1.4558724164962769, "rewards/rejected": -3.1784443855285645, "sft_loss": 1.7190653085708618, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 10.263336372814239, "learning_rate": 4.922829833136984e-07, "logits/chosen": -0.2786995768547058, "logits/rejected": 0.02108999527990818, "logps/chosen": -1.7660636901855469, "logps/rejected": -3.4693591594696045, "loss": 0.5368, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7660636901855469, "rewards/margins": 1.7032954692840576, "rewards/rejected": -3.4693591594696045, "sft_loss": 1.8371864557266235, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 14.78777525659592, "learning_rate": 4.888270687236773e-07, "logits/chosen": -0.0698157548904419, "logits/rejected": 0.33996957540512085, "logps/chosen": -1.844203233718872, "logps/rejected": -3.4948344230651855, "loss": 0.5632, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.844203233718872, "rewards/margins": 1.650631308555603, "rewards/rejected": -3.4948344230651855, "sft_loss": 1.8448089361190796, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 11.605848690657226, "learning_rate": 4.853809645378709e-07, "logits/chosen": -0.12247265875339508, "logits/rejected": 0.07382510602474213, "logps/chosen": -1.9160645008087158, "logps/rejected": -3.572585344314575, "loss": 0.5848, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9160645008087158, "rewards/margins": 1.656521201133728, "rewards/rejected": -3.572585344314575, "sft_loss": 2.0068259239196777, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 11.465961025206745, "learning_rate": 4.81944704190396e-07, "logits/chosen": -0.19016000628471375, "logits/rejected": -0.025381360203027725, "logps/chosen": -1.7586004734039307, "logps/rejected": -3.3733391761779785, "loss": 0.5428, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7586004734039307, "rewards/margins": 1.6147382259368896, "rewards/rejected": -3.3733391761779785, "sft_loss": 1.859588623046875, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 13.012619295863841, "learning_rate": 4.785183210198667e-07, "logits/chosen": -0.0383647158741951, "logits/rejected": -0.06396958976984024, "logps/chosen": -1.7566505670547485, "logps/rejected": -3.5018608570098877, "loss": 0.5177, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7566505670547485, "rewards/margins": 1.7452104091644287, "rewards/rejected": -3.5018608570098877, "sft_loss": 1.855015754699707, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 13.414407789673984, "learning_rate": 4.7510184826906626e-07, "logits/chosen": -0.22793325781822205, "logits/rejected": 0.05166008323431015, "logps/chosen": -1.9015623331069946, "logps/rejected": -3.5119667053222656, "loss": 0.5761, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.9015623331069946, "rewards/margins": 1.6104040145874023, "rewards/rejected": -3.5119667053222656, "sft_loss": 1.9202884435653687, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 12.77803177041485, "learning_rate": 4.7169531908462953e-07, "logits/chosen": -0.1987219750881195, "logits/rejected": -0.06591467559337616, "logps/chosen": -1.8806768655776978, "logps/rejected": -3.3210761547088623, "loss": 0.569, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8806768655776978, "rewards/margins": 1.4403992891311646, "rewards/rejected": -3.3210761547088623, "sft_loss": 1.8865550756454468, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 8.365346335648512, "learning_rate": 4.6829876651671636e-07, "logits/chosen": -0.047934405505657196, "logits/rejected": 0.14092543721199036, "logps/chosen": -1.7616697549819946, "logps/rejected": -3.305771589279175, "loss": 0.5396, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7616697549819946, "rewards/margins": 1.5441023111343384, "rewards/rejected": -3.305771589279175, "sft_loss": 1.7759618759155273, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 16.408527609658467, "learning_rate": 4.64912223518696e-07, "logits/chosen": -0.17129859328269958, "logits/rejected": 0.026264000684022903, "logps/chosen": -1.8189198970794678, "logps/rejected": -3.6464335918426514, "loss": 0.5141, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8189198970794678, "rewards/margins": 1.8275134563446045, "rewards/rejected": -3.6464335918426514, "sft_loss": 1.9214423894882202, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 8.565563478898516, "learning_rate": 4.615357229468221e-07, "logits/chosen": -0.17534136772155762, "logits/rejected": 0.1626088172197342, "logps/chosen": -1.7512375116348267, "logps/rejected": -3.546313762664795, "loss": 0.5015, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7512375116348267, "rewards/margins": 1.7950763702392578, "rewards/rejected": -3.546313762664795, "sft_loss": 1.7669951915740967, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 12.406748421040726, "learning_rate": 4.581692975599192e-07, "logits/chosen": -0.14899741113185883, "logits/rejected": 0.1268022507429123, "logps/chosen": -1.8081623315811157, "logps/rejected": -3.2024803161621094, "loss": 0.5774, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8081623315811157, "rewards/margins": 1.3943179845809937, "rewards/rejected": -3.2024803161621094, "sft_loss": 1.8954441547393799, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 7.905745882689756, "learning_rate": 4.548129800190603e-07, "logits/chosen": -0.19442422688007355, "logits/rejected": 0.04682071506977081, "logps/chosen": -1.7260282039642334, "logps/rejected": -3.445497989654541, "loss": 0.5062, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7260282039642334, "rewards/margins": 1.719469428062439, "rewards/rejected": -3.445497989654541, "sft_loss": 1.78412663936615, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 14.716758462611983, "learning_rate": 4.5146680288725367e-07, "logits/chosen": -0.17263540625572205, "logits/rejected": 0.0990188717842102, "logps/chosen": -1.7226759195327759, "logps/rejected": -3.3513336181640625, "loss": 0.5579, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7226759195327759, "rewards/margins": 1.6286576986312866, "rewards/rejected": -3.3513336181640625, "sft_loss": 1.8069121837615967, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 9.244420649045196, "learning_rate": 4.481307986291237e-07, "logits/chosen": -0.1909504383802414, "logits/rejected": 0.000193992251297459, "logps/chosen": -1.854528784751892, "logps/rejected": -3.418144941329956, "loss": 0.5682, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.854528784751892, "rewards/margins": 1.5636165142059326, "rewards/rejected": -3.418144941329956, "sft_loss": 1.8442004919052124, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 12.451343095718101, "learning_rate": 4.4480499961059915e-07, "logits/chosen": -0.13839785754680634, "logits/rejected": -0.007095733191817999, "logps/chosen": -1.7823936939239502, "logps/rejected": -3.1268227100372314, "loss": 0.5734, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7823936939239502, "rewards/margins": 1.3444294929504395, "rewards/rejected": -3.1268227100372314, "sft_loss": 1.7421305179595947, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 9.502520310548556, "learning_rate": 4.414894380985959e-07, "logits/chosen": -0.2586430013179779, "logits/rejected": 0.06767354905605316, "logps/chosen": -1.6843284368515015, "logps/rejected": -3.4498672485351562, "loss": 0.5072, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.6843284368515015, "rewards/margins": 1.7655388116836548, "rewards/rejected": -3.4498672485351562, "sft_loss": 1.7884721755981445, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 13.702985197452257, "learning_rate": 4.3818414626070703e-07, "logits/chosen": -0.1704205572605133, "logits/rejected": -0.08272019028663635, "logps/chosen": -1.8510723114013672, "logps/rejected": -3.326447010040283, "loss": 0.5778, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8510723114013672, "rewards/margins": 1.475374460220337, "rewards/rejected": -3.326447010040283, "sft_loss": 1.8802316188812256, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 15.894625661985877, "learning_rate": 4.3488915616488757e-07, "logits/chosen": -0.094416543841362, "logits/rejected": -0.005819836165755987, "logps/chosen": -1.8883235454559326, "logps/rejected": -3.532397747039795, "loss": 0.5487, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8883235454559326, "rewards/margins": 1.6440744400024414, "rewards/rejected": -3.532397747039795, "sft_loss": 1.9055078029632568, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 10.532115045678573, "learning_rate": 4.316044997791469e-07, "logits/chosen": -0.25973159074783325, "logits/rejected": -0.03989795595407486, "logps/chosen": -1.8471174240112305, "logps/rejected": -3.4026947021484375, "loss": 0.5316, "rewards/accuracies": 0.90625, "rewards/chosen": -1.8471174240112305, "rewards/margins": 1.5555775165557861, "rewards/rejected": -3.4026947021484375, "sft_loss": 1.9095427989959717, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 9.625215379597115, "learning_rate": 4.283302089712348e-07, "logits/chosen": -0.23575392365455627, "logits/rejected": 0.14210402965545654, "logps/chosen": -1.812212586402893, "logps/rejected": -3.431943416595459, "loss": 0.5148, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.812212586402893, "rewards/margins": 1.6197311878204346, "rewards/rejected": -3.431943416595459, "sft_loss": 1.8462364673614502, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 11.218224313231298, "learning_rate": 4.250663155083357e-07, "logits/chosen": -0.07904136180877686, "logits/rejected": -0.06815527379512787, "logps/chosen": -1.7699768543243408, "logps/rejected": -3.287299394607544, "loss": 0.5626, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7699768543243408, "rewards/margins": 1.5173224210739136, "rewards/rejected": -3.287299394607544, "sft_loss": 1.7975574731826782, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 14.30035500062535, "learning_rate": 4.218128510567578e-07, "logits/chosen": -0.1746416985988617, "logits/rejected": 0.035987790673971176, "logps/chosen": -1.700383186340332, "logps/rejected": -3.543325424194336, "loss": 0.4811, "rewards/accuracies": 0.875, "rewards/chosen": -1.700383186340332, "rewards/margins": 1.842942476272583, "rewards/rejected": -3.543325424194336, "sft_loss": 1.7460041046142578, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 14.162568631836546, "learning_rate": 4.185698471816279e-07, "logits/chosen": -0.2638034522533417, "logits/rejected": 0.07215817272663116, "logps/chosen": -1.8092349767684937, "logps/rejected": -3.4781856536865234, "loss": 0.5587, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8092349767684937, "rewards/margins": 1.6689503192901611, "rewards/rejected": -3.4781856536865234, "sft_loss": 1.8944950103759766, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 8.184554287863792, "learning_rate": 4.1533733534658326e-07, "logits/chosen": -0.20982761681079865, "logits/rejected": 0.12584097683429718, "logps/chosen": -1.7665157318115234, "logps/rejected": -3.4161884784698486, "loss": 0.5444, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7665157318115234, "rewards/margins": 1.6496728658676147, "rewards/rejected": -3.4161884784698486, "sft_loss": 1.8240835666656494, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 10.047872500559155, "learning_rate": 4.121153469134686e-07, "logits/chosen": -0.1830652505159378, "logits/rejected": 0.015270600095391273, "logps/chosen": -1.7029507160186768, "logps/rejected": -3.1996734142303467, "loss": 0.5528, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7029507160186768, "rewards/margins": 1.4967228174209595, "rewards/rejected": -3.1996734142303467, "sft_loss": 1.7361400127410889, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 8.511297364088247, "learning_rate": 4.089039131420292e-07, "logits/chosen": -0.19872130453586578, "logits/rejected": -0.007671922445297241, "logps/chosen": -1.7023446559906006, "logps/rejected": -3.085238456726074, "loss": 0.5746, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7023446559906006, "rewards/margins": 1.3828939199447632, "rewards/rejected": -3.085238456726074, "sft_loss": 1.770695447921753, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 11.969921138929104, "learning_rate": 4.0570306518961027e-07, "logits/chosen": -0.1552981734275818, "logits/rejected": 0.1038542240858078, "logps/chosen": -1.7612078189849854, "logps/rejected": -3.5550270080566406, "loss": 0.5463, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7612078189849854, "rewards/margins": 1.7938191890716553, "rewards/rejected": -3.5550270080566406, "sft_loss": 1.792492151260376, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 9.20119844833594, "learning_rate": 4.025128341108517e-07, "logits/chosen": -0.2136857956647873, "logits/rejected": 0.02675015665590763, "logps/chosen": -1.8333107233047485, "logps/rejected": -3.263059616088867, "loss": 0.5733, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8333107233047485, "rewards/margins": 1.429748773574829, "rewards/rejected": -3.263059616088867, "sft_loss": 1.905210256576538, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.44999179244041443, "eval_logits/rejected": 0.5968734622001648, "eval_logps/chosen": -2.064203977584839, "eval_logps/rejected": -3.1107640266418457, "eval_loss": 0.7454193830490112, "eval_rewards/accuracies": 0.7054896354675293, "eval_rewards/chosen": -2.064203977584839, "eval_rewards/margins": 1.0465601682662964, "eval_rewards/rejected": -3.1107640266418457, "eval_runtime": 46.4553, "eval_samples_per_second": 28.953, "eval_sft_loss": 1.9592351913452148, "eval_steps_per_second": 7.254, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 17.643542932785405, "learning_rate": 3.9933325085739047e-07, "logits/chosen": -0.2071973830461502, "logits/rejected": -0.1868351399898529, "logps/chosen": -1.6352430582046509, "logps/rejected": -3.1291136741638184, "loss": 0.508, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.6352430582046509, "rewards/margins": 1.4938703775405884, "rewards/rejected": -3.1291136741638184, "sft_loss": 1.6966040134429932, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 10.286590852030196, "learning_rate": 3.9616434627755624e-07, "logits/chosen": -0.1404072493314743, "logits/rejected": -0.02690928615629673, "logps/chosen": -1.9162696599960327, "logps/rejected": -3.735090970993042, "loss": 0.5324, "rewards/accuracies": 0.875, "rewards/chosen": -1.9162696599960327, "rewards/margins": 1.818821668624878, "rewards/rejected": -3.735090970993042, "sft_loss": 1.9391191005706787, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 9.090805142604072, "learning_rate": 3.930061511160762e-07, "logits/chosen": -0.13879060745239258, "logits/rejected": 0.14609341323375702, "logps/chosen": -1.764460563659668, "logps/rejected": -3.3435351848602295, "loss": 0.5527, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.764460563659668, "rewards/margins": 1.579074501991272, "rewards/rejected": -3.3435351848602295, "sft_loss": 1.7850478887557983, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 14.746188885325457, "learning_rate": 3.898586960137726e-07, "logits/chosen": -0.1557859629392624, "logits/rejected": -0.026083847507834435, "logps/chosen": -1.7631231546401978, "logps/rejected": -3.1784684658050537, "loss": 0.5437, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7631231546401978, "rewards/margins": 1.4153454303741455, "rewards/rejected": -3.1784684658050537, "sft_loss": 1.7551311254501343, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 7.0098680012744685, "learning_rate": 3.867220115072696e-07, "logits/chosen": -0.16939975321292877, "logits/rejected": -0.06898584216833115, "logps/chosen": -1.5736382007598877, "logps/rejected": -3.04780912399292, "loss": 0.5003, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.5736382007598877, "rewards/margins": 1.4741708040237427, "rewards/rejected": -3.04780912399292, "sft_loss": 1.6943461894989014, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 9.608896827672329, "learning_rate": 3.8359612802869367e-07, "logits/chosen": -0.2276763916015625, "logits/rejected": 0.05887297913432121, "logps/chosen": -1.7581098079681396, "logps/rejected": -3.37579607963562, "loss": 0.5385, "rewards/accuracies": 0.875, "rewards/chosen": -1.7581098079681396, "rewards/margins": 1.6176865100860596, "rewards/rejected": -3.37579607963562, "sft_loss": 1.784156084060669, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 11.627245261504164, "learning_rate": 3.8048107590537987e-07, "logits/chosen": -0.22332949936389923, "logits/rejected": 0.1258862465620041, "logps/chosen": -1.797519326210022, "logps/rejected": -3.420186996459961, "loss": 0.5251, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.797519326210022, "rewards/margins": 1.622667670249939, "rewards/rejected": -3.420186996459961, "sft_loss": 1.8621435165405273, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 10.671438733993377, "learning_rate": 3.773768853595774e-07, "logits/chosen": -0.30236560106277466, "logits/rejected": 0.11213777214288712, "logps/chosen": -1.7498910427093506, "logps/rejected": -3.347071409225464, "loss": 0.5433, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7498910427093506, "rewards/margins": 1.5971804857254028, "rewards/rejected": -3.347071409225464, "sft_loss": 1.8056799173355103, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 11.9378726825724, "learning_rate": 3.7428358650815706e-07, "logits/chosen": -0.20918190479278564, "logits/rejected": 0.14711831510066986, "logps/chosen": -1.8133995532989502, "logps/rejected": -3.165956735610962, "loss": 0.6067, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8133995532989502, "rewards/margins": 1.3525573015213013, "rewards/rejected": -3.165956735610962, "sft_loss": 1.847616195678711, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 18.760705928158977, "learning_rate": 3.712012093623172e-07, "logits/chosen": -0.13888953626155853, "logits/rejected": 0.09737597405910492, "logps/chosen": -1.838853120803833, "logps/rejected": -3.607891082763672, "loss": 0.546, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.838853120803833, "rewards/margins": 1.7690378427505493, "rewards/rejected": -3.607891082763672, "sft_loss": 1.9086978435516357, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 16.21585305836798, "learning_rate": 3.6812978382729524e-07, "logits/chosen": -0.27212634682655334, "logits/rejected": -0.08878588676452637, "logps/chosen": -1.7782297134399414, "logps/rejected": -3.4442741870880127, "loss": 0.5404, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7782297134399414, "rewards/margins": 1.6660445928573608, "rewards/rejected": -3.4442741870880127, "sft_loss": 1.8388874530792236, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 10.750367317481379, "learning_rate": 3.650693397020744e-07, "logits/chosen": -0.26451048254966736, "logits/rejected": 0.0667090192437172, "logps/chosen": -1.7671161890029907, "logps/rejected": -3.529853343963623, "loss": 0.5453, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7671161890029907, "rewards/margins": 1.7627372741699219, "rewards/rejected": -3.529853343963623, "sft_loss": 1.8724162578582764, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 12.298892852396053, "learning_rate": 3.6201990667909774e-07, "logits/chosen": -0.26518934965133667, "logits/rejected": 0.03896629437804222, "logps/chosen": -1.8639923334121704, "logps/rejected": -3.3952205181121826, "loss": 0.5865, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8639923334121704, "rewards/margins": 1.5312283039093018, "rewards/rejected": -3.3952205181121826, "sft_loss": 1.8920514583587646, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 10.507472476225365, "learning_rate": 3.589815143439772e-07, "logits/chosen": -0.09764394164085388, "logits/rejected": 0.029933521524071693, "logps/chosen": -1.705524206161499, "logps/rejected": -3.198183536529541, "loss": 0.5626, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.705524206161499, "rewards/margins": 1.492659330368042, "rewards/rejected": -3.198183536529541, "sft_loss": 1.768334984779358, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 14.25602671123383, "learning_rate": 3.559541921752091e-07, "logits/chosen": -0.214193195104599, "logits/rejected": 0.11325450241565704, "logps/chosen": -1.8728907108306885, "logps/rejected": -3.4027392864227295, "loss": 0.5658, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8728907108306885, "rewards/margins": 1.5298488140106201, "rewards/rejected": -3.4027392864227295, "sft_loss": 1.9027717113494873, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 13.716539716632326, "learning_rate": 3.5293796954388565e-07, "logits/chosen": -0.29858314990997314, "logits/rejected": -0.0883362740278244, "logps/chosen": -1.6380106210708618, "logps/rejected": -2.9990859031677246, "loss": 0.562, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6380106210708618, "rewards/margins": 1.3610751628875732, "rewards/rejected": -2.9990859031677246, "sft_loss": 1.7269871234893799, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 12.231728070967494, "learning_rate": 3.499328757134129e-07, "logits/chosen": -0.08925610780715942, "logits/rejected": 0.019029032438993454, "logps/chosen": -1.823622465133667, "logps/rejected": -3.4843475818634033, "loss": 0.5306, "rewards/accuracies": 0.875, "rewards/chosen": -1.823622465133667, "rewards/margins": 1.6607252359390259, "rewards/rejected": -3.4843475818634033, "sft_loss": 1.8130333423614502, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 12.250659890107046, "learning_rate": 3.469389398392237e-07, "logits/chosen": -0.24888677895069122, "logits/rejected": 0.078799307346344, "logps/chosen": -1.7709062099456787, "logps/rejected": -3.557257890701294, "loss": 0.5098, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7709062099456787, "rewards/margins": 1.7863517999649048, "rewards/rejected": -3.557257890701294, "sft_loss": 1.8200101852416992, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 12.730678276563896, "learning_rate": 3.4395619096849764e-07, "logits/chosen": -0.305867075920105, "logits/rejected": 0.02817920409142971, "logps/chosen": -1.8299095630645752, "logps/rejected": -3.3529536724090576, "loss": 0.5612, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8299095630645752, "rewards/margins": 1.5230443477630615, "rewards/rejected": -3.3529536724090576, "sft_loss": 1.9097721576690674, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 12.339190127873952, "learning_rate": 3.409846580398766e-07, "logits/chosen": -0.12763968110084534, "logits/rejected": -0.09702740609645844, "logps/chosen": -1.7271099090576172, "logps/rejected": -3.2270724773406982, "loss": 0.5438, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7271099090576172, "rewards/margins": 1.4999626874923706, "rewards/rejected": -3.2270724773406982, "sft_loss": 1.781528115272522, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 17.315883991753594, "learning_rate": 3.380243698831869e-07, "logits/chosen": -0.25092118978500366, "logits/rejected": 0.0684286579489708, "logps/chosen": -1.7663710117340088, "logps/rejected": -3.2613213062286377, "loss": 0.5525, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7663710117340088, "rewards/margins": 1.494950532913208, "rewards/rejected": -3.2613213062286377, "sft_loss": 1.8157711029052734, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 14.133904776725748, "learning_rate": 3.350753552191563e-07, "logits/chosen": -0.24138808250427246, "logits/rejected": 0.011843997053802013, "logps/chosen": -1.7978967428207397, "logps/rejected": -3.3991634845733643, "loss": 0.5408, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7978967428207397, "rewards/margins": 1.601266860961914, "rewards/rejected": -3.3991634845733643, "sft_loss": 1.801038384437561, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 11.188254604136892, "learning_rate": 3.3213764265913915e-07, "logits/chosen": -0.15768598020076752, "logits/rejected": -0.040329623967409134, "logps/chosen": -1.6997654438018799, "logps/rejected": -3.045098066329956, "loss": 0.563, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6997654438018799, "rewards/margins": 1.3453330993652344, "rewards/rejected": -3.045098066329956, "sft_loss": 1.768566370010376, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 10.000101912949614, "learning_rate": 3.292112607048343e-07, "logits/chosen": -0.20942172408103943, "logits/rejected": -0.018673386424779892, "logps/chosen": -1.7404597997665405, "logps/rejected": -3.327239513397217, "loss": 0.5168, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.7404597997665405, "rewards/margins": 1.5867795944213867, "rewards/rejected": -3.327239513397217, "sft_loss": 1.7671947479248047, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 14.781120515094285, "learning_rate": 3.262962377480136e-07, "logits/chosen": -0.27843308448791504, "logits/rejected": 0.012016067281365395, "logps/chosen": -1.7933326959609985, "logps/rejected": -3.5190646648406982, "loss": 0.5038, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7933326959609985, "rewards/margins": 1.7257320880889893, "rewards/rejected": -3.5190646648406982, "sft_loss": 1.8551466464996338, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 17.379751864511018, "learning_rate": 3.233926020702414e-07, "logits/chosen": -0.2567359507083893, "logits/rejected": -0.10769607126712799, "logps/chosen": -1.7908289432525635, "logps/rejected": -2.9884819984436035, "loss": 0.6069, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7908289432525635, "rewards/margins": 1.19765305519104, "rewards/rejected": -2.9884819984436035, "sft_loss": 1.7830078601837158, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 9.92014656389634, "learning_rate": 3.205003818426047e-07, "logits/chosen": -0.10162917524576187, "logits/rejected": 0.063094362616539, "logps/chosen": -1.7472941875457764, "logps/rejected": -3.338911533355713, "loss": 0.5585, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7472941875457764, "rewards/margins": 1.5916169881820679, "rewards/rejected": -3.338911533355713, "sft_loss": 1.8631603717803955, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 15.432061735209999, "learning_rate": 3.1761960512543627e-07, "logits/chosen": -0.1377786248922348, "logits/rejected": 0.0018278755014762282, "logps/chosen": -1.71317458152771, "logps/rejected": -3.239314556121826, "loss": 0.5645, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.71317458152771, "rewards/margins": 1.5261398553848267, "rewards/rejected": -3.239314556121826, "sft_loss": 1.7747024297714233, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 10.205720482051749, "learning_rate": 3.147502998680447e-07, "logits/chosen": -0.13008762896060944, "logits/rejected": 0.029557768255472183, "logps/chosen": -1.7398678064346313, "logps/rejected": -3.333784818649292, "loss": 0.5602, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7398678064346313, "rewards/margins": 1.593916893005371, "rewards/rejected": -3.333784818649292, "sft_loss": 1.7981176376342773, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 11.087517393724978, "learning_rate": 3.11892493908442e-07, "logits/chosen": -0.22432152926921844, "logits/rejected": -0.08385895192623138, "logps/chosen": -1.6868541240692139, "logps/rejected": -3.2441565990448, "loss": 0.5468, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6868541240692139, "rewards/margins": 1.5573023557662964, "rewards/rejected": -3.2441565990448, "sft_loss": 1.6810327768325806, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 9.065330947296532, "learning_rate": 3.0904621497307437e-07, "logits/chosen": -0.18360400199890137, "logits/rejected": -0.06750941276550293, "logps/chosen": -1.8119789361953735, "logps/rejected": -3.222032070159912, "loss": 0.5997, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8119789361953735, "rewards/margins": 1.4100534915924072, "rewards/rejected": -3.222032070159912, "sft_loss": 1.913190245628357, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 12.271788183456376, "learning_rate": 3.062114906765522e-07, "logits/chosen": -0.3010988235473633, "logits/rejected": 0.06047710031270981, "logps/chosen": -1.761810302734375, "logps/rejected": -3.507551670074463, "loss": 0.5416, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.761810302734375, "rewards/margins": 1.745741605758667, "rewards/rejected": -3.507551670074463, "sft_loss": 1.7494663000106812, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 14.433516942336327, "learning_rate": 3.0338834852138346e-07, "logits/chosen": -0.15207983553409576, "logits/rejected": 0.00596159091219306, "logps/chosen": -1.8344218730926514, "logps/rejected": -3.4356167316436768, "loss": 0.5405, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8344218730926514, "rewards/margins": 1.6011947393417358, "rewards/rejected": -3.4356167316436768, "sft_loss": 1.7789312601089478, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 13.482372037361836, "learning_rate": 3.0057681589770526e-07, "logits/chosen": -0.16331283748149872, "logits/rejected": 0.10025990009307861, "logps/chosen": -1.831169843673706, "logps/rejected": -3.5069286823272705, "loss": 0.5554, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.831169843673706, "rewards/margins": 1.675758719444275, "rewards/rejected": -3.5069286823272705, "sft_loss": 1.8912452459335327, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 12.934023132184983, "learning_rate": 2.9777692008301993e-07, "logits/chosen": -0.06551705300807953, "logits/rejected": 0.02953300252556801, "logps/chosen": -1.7379716634750366, "logps/rejected": -3.3578503131866455, "loss": 0.5195, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.7379716634750366, "rewards/margins": 1.6198784112930298, "rewards/rejected": -3.3578503131866455, "sft_loss": 1.787366271018982, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 15.069769362589748, "learning_rate": 2.949886882419284e-07, "logits/chosen": -0.1420452892780304, "logits/rejected": -0.06317319720983505, "logps/chosen": -1.7161645889282227, "logps/rejected": -3.232700824737549, "loss": 0.5422, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7161645889282227, "rewards/margins": 1.5165363550186157, "rewards/rejected": -3.232700824737549, "sft_loss": 1.7947757244110107, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 11.340771528217207, "learning_rate": 2.92212147425869e-07, "logits/chosen": -0.14552152156829834, "logits/rejected": 0.0813545510172844, "logps/chosen": -1.8014802932739258, "logps/rejected": -3.518508195877075, "loss": 0.5528, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8014802932739258, "rewards/margins": 1.717028260231018, "rewards/rejected": -3.518508195877075, "sft_loss": 1.9085992574691772, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 12.494784258442676, "learning_rate": 2.894473245728518e-07, "logits/chosen": -0.2370075285434723, "logits/rejected": -0.006445932202041149, "logps/chosen": -1.718737244606018, "logps/rejected": -3.3767809867858887, "loss": 0.5538, "rewards/accuracies": 0.8125, "rewards/chosen": -1.718737244606018, "rewards/margins": 1.6580438613891602, "rewards/rejected": -3.3767809867858887, "sft_loss": 1.814737319946289, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 21.774382492352622, "learning_rate": 2.866942465072014e-07, "logits/chosen": -0.2321983128786087, "logits/rejected": -0.03561241179704666, "logps/chosen": -1.802708387374878, "logps/rejected": -3.5632636547088623, "loss": 0.5672, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.802708387374878, "rewards/margins": 1.7605549097061157, "rewards/rejected": -3.5632636547088623, "sft_loss": 1.8140722513198853, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 10.109533599048547, "learning_rate": 2.839529399392924e-07, "logits/chosen": -0.22016914188861847, "logits/rejected": 0.14298439025878906, "logps/chosen": -1.8716926574707031, "logps/rejected": -3.670722484588623, "loss": 0.5386, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8716926574707031, "rewards/margins": 1.799030065536499, "rewards/rejected": -3.670722484588623, "sft_loss": 1.953478217124939, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 9.919476961497255, "learning_rate": 2.812234314652937e-07, "logits/chosen": -0.18086175620555878, "logits/rejected": 0.08491306006908417, "logps/chosen": -1.823604941368103, "logps/rejected": -3.513345241546631, "loss": 0.5581, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.823604941368103, "rewards/margins": 1.689740538597107, "rewards/rejected": -3.513345241546631, "sft_loss": 1.879625678062439, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 13.542513430482426, "learning_rate": 2.785057475669084e-07, "logits/chosen": -0.2250836342573166, "logits/rejected": 0.0073054940439760685, "logps/chosen": -1.771472692489624, "logps/rejected": -3.573692798614502, "loss": 0.5247, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.771472692489624, "rewards/margins": 1.802220106124878, "rewards/rejected": -3.573692798614502, "sft_loss": 1.8163973093032837, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 13.091639134361028, "learning_rate": 2.75799914611117e-07, "logits/chosen": -0.16010549664497375, "logits/rejected": 0.07752474397420883, "logps/chosen": -1.8313506841659546, "logps/rejected": -3.6223537921905518, "loss": 0.5446, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8313506841659546, "rewards/margins": 1.7910032272338867, "rewards/rejected": -3.6223537921905518, "sft_loss": 1.8761584758758545, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 9.93265718865091, "learning_rate": 2.7310595884992354e-07, "logits/chosen": -0.12645861506462097, "logits/rejected": 0.18549086153507233, "logps/chosen": -1.6416807174682617, "logps/rejected": -3.2978649139404297, "loss": 0.5107, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6416807174682617, "rewards/margins": 1.656184196472168, "rewards/rejected": -3.2978649139404297, "sft_loss": 1.7991886138916016, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 11.044243573023872, "learning_rate": 2.7042390642009805e-07, "logits/chosen": -0.22041960060596466, "logits/rejected": -0.17886695265769958, "logps/chosen": -1.7223641872406006, "logps/rejected": -3.300487995147705, "loss": 0.5636, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7223641872406006, "rewards/margins": 1.578123927116394, "rewards/rejected": -3.300487995147705, "sft_loss": 1.7671864032745361, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 12.385348094564502, "learning_rate": 2.6775378334292543e-07, "logits/chosen": -0.054296769201755524, "logits/rejected": 0.0722646489739418, "logps/chosen": -1.7313636541366577, "logps/rejected": -3.252239942550659, "loss": 0.5358, "rewards/accuracies": 0.90625, "rewards/chosen": -1.7313636541366577, "rewards/margins": 1.520876169204712, "rewards/rejected": -3.252239942550659, "sft_loss": 1.797353982925415, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 14.735766602782922, "learning_rate": 2.650956155239512e-07, "logits/chosen": -0.10592559725046158, "logits/rejected": 0.16242524981498718, "logps/chosen": -1.76133131980896, "logps/rejected": -3.5244193077087402, "loss": 0.5086, "rewards/accuracies": 0.84375, "rewards/chosen": -1.76133131980896, "rewards/margins": 1.7630879878997803, "rewards/rejected": -3.5244193077087402, "sft_loss": 1.7872645854949951, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 15.34427977343139, "learning_rate": 2.6244942875273093e-07, "logits/chosen": -0.06889469921588898, "logits/rejected": 0.09271882474422455, "logps/chosen": -1.869799017906189, "logps/rejected": -3.4388561248779297, "loss": 0.5349, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.869799017906189, "rewards/margins": 1.5690571069717407, "rewards/rejected": -3.4388561248779297, "sft_loss": 1.8109409809112549, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 10.461582072034766, "learning_rate": 2.59815248702581e-07, "logits/chosen": -0.1550874561071396, "logits/rejected": 0.06648962199687958, "logps/chosen": -1.7301225662231445, "logps/rejected": -3.288156509399414, "loss": 0.5342, "rewards/accuracies": 0.875, "rewards/chosen": -1.7301225662231445, "rewards/margins": 1.5580341815948486, "rewards/rejected": -3.288156509399414, "sft_loss": 1.8114830255508423, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 9.686256783274853, "learning_rate": 2.5719310093032695e-07, "logits/chosen": -0.25174325704574585, "logits/rejected": 0.12775930762290955, "logps/chosen": -1.801027536392212, "logps/rejected": -3.4213192462921143, "loss": 0.5418, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.801027536392212, "rewards/margins": 1.6202919483184814, "rewards/rejected": -3.4213192462921143, "sft_loss": 1.7672901153564453, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 10.986970716143642, "learning_rate": 2.5458301087605876e-07, "logits/chosen": -0.19087447226047516, "logits/rejected": 0.0413995161652565, "logps/chosen": -1.8017337322235107, "logps/rejected": -3.272310256958008, "loss": 0.5838, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8017337322235107, "rewards/margins": 1.470576286315918, "rewards/rejected": -3.272310256958008, "sft_loss": 1.8942054510116577, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 10.46111832932533, "learning_rate": 2.5198500386288083e-07, "logits/chosen": -0.12169595062732697, "logits/rejected": 0.045823872089385986, "logps/chosen": -1.7901885509490967, "logps/rejected": -3.4956653118133545, "loss": 0.5057, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7901885509490967, "rewards/margins": 1.705476999282837, "rewards/rejected": -3.4956653118133545, "sft_loss": 1.8139406442642212, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 15.651885381161886, "learning_rate": 2.493991050966694e-07, "logits/chosen": -0.18022406101226807, "logits/rejected": -0.04743504524230957, "logps/chosen": -1.8339494466781616, "logps/rejected": -3.3705623149871826, "loss": 0.5623, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8339494466781616, "rewards/margins": 1.5366131067276, "rewards/rejected": -3.3705623149871826, "sft_loss": 1.8706283569335938, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 14.293280222901767, "learning_rate": 2.4682533966582494e-07, "logits/chosen": -0.18216542899608612, "logits/rejected": 0.03202268108725548, "logps/chosen": -1.7299884557724, "logps/rejected": -3.0353915691375732, "loss": 0.5769, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7299884557724, "rewards/margins": 1.3054029941558838, "rewards/rejected": -3.0353915691375732, "sft_loss": 1.789258599281311, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 12.750199691611652, "learning_rate": 2.442637325410316e-07, "logits/chosen": -0.057832587510347366, "logits/rejected": 0.2335827797651291, "logps/chosen": -1.7248417139053345, "logps/rejected": -3.3812613487243652, "loss": 0.5565, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7248417139053345, "rewards/margins": 1.6564195156097412, "rewards/rejected": -3.3812613487243652, "sft_loss": 1.746360421180725, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 18.29241678857937, "learning_rate": 2.417143085750122e-07, "logits/chosen": -0.0375925675034523, "logits/rejected": 0.11547158658504486, "logps/chosen": -1.7524213790893555, "logps/rejected": -3.4146087169647217, "loss": 0.5355, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7524213790893555, "rewards/margins": 1.6621872186660767, "rewards/rejected": -3.4146087169647217, "sft_loss": 1.798270583152771, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 10.692978286152742, "learning_rate": 2.3917709250228994e-07, "logits/chosen": -0.1646510511636734, "logits/rejected": 0.18285346031188965, "logps/chosen": -1.7572791576385498, "logps/rejected": -3.271498203277588, "loss": 0.534, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7572791576385498, "rewards/margins": 1.5142189264297485, "rewards/rejected": -3.271498203277588, "sft_loss": 1.7719749212265015, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 14.719809421351123, "learning_rate": 2.3665210893894557e-07, "logits/chosen": -0.039440952241420746, "logits/rejected": 0.04918104037642479, "logps/chosen": -1.725327491760254, "logps/rejected": -3.278855085372925, "loss": 0.5536, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.725327491760254, "rewards/margins": 1.5535273551940918, "rewards/rejected": -3.278855085372925, "sft_loss": 1.7282207012176514, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 10.885379905130568, "learning_rate": 2.3413938238238157e-07, "logits/chosen": -0.09351601451635361, "logits/rejected": 0.19373974204063416, "logps/chosen": -1.8117258548736572, "logps/rejected": -3.4790234565734863, "loss": 0.556, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8117258548736572, "rewards/margins": 1.6672979593276978, "rewards/rejected": -3.4790234565734863, "sft_loss": 1.8795970678329468, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 10.498041294231749, "learning_rate": 2.316389372110812e-07, "logits/chosen": -0.2118692845106125, "logits/rejected": 0.007659897208213806, "logps/chosen": -1.747856855392456, "logps/rejected": -3.2259974479675293, "loss": 0.5612, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.747856855392456, "rewards/margins": 1.4781407117843628, "rewards/rejected": -3.2259974479675293, "sft_loss": 1.8180797100067139, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 14.091805082180102, "learning_rate": 2.2915079768437514e-07, "logits/chosen": -0.06196912005543709, "logits/rejected": 0.025976702570915222, "logps/chosen": -1.8319848775863647, "logps/rejected": -3.4811851978302, "loss": 0.54, "rewards/accuracies": 0.875, "rewards/chosen": -1.8319848775863647, "rewards/margins": 1.649200201034546, "rewards/rejected": -3.4811851978302, "sft_loss": 1.812852144241333, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 14.430961778590698, "learning_rate": 2.2667498794220326e-07, "logits/chosen": -0.17927943170070648, "logits/rejected": 0.1337457150220871, "logps/chosen": -1.8328558206558228, "logps/rejected": -3.450277328491211, "loss": 0.5479, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8328558206558228, "rewards/margins": 1.6174217462539673, "rewards/rejected": -3.450277328491211, "sft_loss": 1.8390781879425049, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 11.75329047632989, "learning_rate": 2.2421153200488332e-07, "logits/chosen": -0.080213263630867, "logits/rejected": -0.05093265697360039, "logps/chosen": -1.8332271575927734, "logps/rejected": -3.5902411937713623, "loss": 0.5071, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8332271575927734, "rewards/margins": 1.7570136785507202, "rewards/rejected": -3.5902411937713623, "sft_loss": 1.8736753463745117, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 13.470083247872166, "learning_rate": 2.217604537728749e-07, "logits/chosen": -0.1351408064365387, "logits/rejected": 0.045610688626766205, "logps/chosen": -1.6412384510040283, "logps/rejected": -3.2661595344543457, "loss": 0.4879, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.6412384510040283, "rewards/margins": 1.6249208450317383, "rewards/rejected": -3.2661595344543457, "sft_loss": 1.7293357849121094, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 9.222599257137244, "learning_rate": 2.1932177702655053e-07, "logits/chosen": -0.2029998004436493, "logits/rejected": -0.11901791393756866, "logps/chosen": -1.8168365955352783, "logps/rejected": -3.4140625, "loss": 0.5476, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8168365955352783, "rewards/margins": 1.5972263813018799, "rewards/rejected": -3.4140625, "sft_loss": 1.860943078994751, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 11.158699914225073, "learning_rate": 2.1689552542596232e-07, "logits/chosen": -0.08220269531011581, "logits/rejected": 0.16050629317760468, "logps/chosen": -1.742254614830017, "logps/rejected": -3.5307483673095703, "loss": 0.4996, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.742254614830017, "rewards/margins": 1.7884941101074219, "rewards/rejected": -3.5307483673095703, "sft_loss": 1.8253253698349, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 10.093803537845512, "learning_rate": 2.1448172251061338e-07, "logits/chosen": 0.013688882812857628, "logits/rejected": -0.09023362398147583, "logps/chosen": -1.7495476007461548, "logps/rejected": -3.1490283012390137, "loss": 0.5315, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7495476007461548, "rewards/margins": 1.3994805812835693, "rewards/rejected": -3.1490283012390137, "sft_loss": 1.7908941507339478, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 11.184649941495632, "learning_rate": 2.1208039169923122e-07, "logits/chosen": -0.17796917259693146, "logits/rejected": 0.08611693233251572, "logps/chosen": -1.880004644393921, "logps/rejected": -3.462754487991333, "loss": 0.5436, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.880004644393921, "rewards/margins": 1.582749605178833, "rewards/rejected": -3.462754487991333, "sft_loss": 1.9490007162094116, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 13.925243624028903, "learning_rate": 2.096915562895369e-07, "logits/chosen": -0.12639853358268738, "logits/rejected": -0.05879003927111626, "logps/chosen": -1.8781654834747314, "logps/rejected": -3.5193824768066406, "loss": 0.5751, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8781654834747314, "rewards/margins": 1.6412169933319092, "rewards/rejected": -3.5193824768066406, "sft_loss": 2.001239061355591, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 11.151390897624402, "learning_rate": 2.07315239458023e-07, "logits/chosen": -0.11989846080541611, "logits/rejected": 0.3027260899543762, "logps/chosen": -1.857408881187439, "logps/rejected": -3.7410550117492676, "loss": 0.4824, "rewards/accuracies": 0.9375, "rewards/chosen": -1.857408881187439, "rewards/margins": 1.88364577293396, "rewards/rejected": -3.7410550117492676, "sft_loss": 1.8817813396453857, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 12.167077440955572, "learning_rate": 2.0495146425972487e-07, "logits/chosen": -0.22354164719581604, "logits/rejected": 0.07373027503490448, "logps/chosen": -1.765608549118042, "logps/rejected": -3.5321717262268066, "loss": 0.5464, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.765608549118042, "rewards/margins": 1.766563057899475, "rewards/rejected": -3.5321717262268066, "sft_loss": 1.8596305847167969, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 13.297410085708984, "learning_rate": 2.0260025362800078e-07, "logits/chosen": -0.2584924101829529, "logits/rejected": -0.12692752480506897, "logps/chosen": -1.812424898147583, "logps/rejected": -3.632875442504883, "loss": 0.4927, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.812424898147583, "rewards/margins": 1.820450782775879, "rewards/rejected": -3.632875442504883, "sft_loss": 1.8927667140960693, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 14.149462733267194, "learning_rate": 2.002616303743059e-07, "logits/chosen": -0.23086294531822205, "logits/rejected": 0.05471482127904892, "logps/chosen": -1.988516092300415, "logps/rejected": -3.7733561992645264, "loss": 0.5631, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.988516092300415, "rewards/margins": 1.7848399877548218, "rewards/rejected": -3.7733561992645264, "sft_loss": 2.062723398208618, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 20.085394294617718, "learning_rate": 1.979356171879738e-07, "logits/chosen": -0.13205042481422424, "logits/rejected": 0.0894799530506134, "logps/chosen": -1.877374291419983, "logps/rejected": -3.731405735015869, "loss": 0.5138, "rewards/accuracies": 0.875, "rewards/chosen": -1.877374291419983, "rewards/margins": 1.8540315628051758, "rewards/rejected": -3.731405735015869, "sft_loss": 1.92086923122406, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 16.78769911346286, "learning_rate": 1.9562223663599399e-07, "logits/chosen": -0.07414297759532928, "logits/rejected": 0.09883741289377213, "logps/chosen": -1.8357845544815063, "logps/rejected": -3.586291790008545, "loss": 0.5352, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8357845544815063, "rewards/margins": 1.7505077123641968, "rewards/rejected": -3.586291790008545, "sft_loss": 1.860817313194275, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 12.369865830612776, "learning_rate": 1.9332151116279557e-07, "logits/chosen": -0.18610504269599915, "logits/rejected": -0.0502205491065979, "logps/chosen": -1.7904930114746094, "logps/rejected": -3.3864052295684814, "loss": 0.5387, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7904930114746094, "rewards/margins": 1.5959125757217407, "rewards/rejected": -3.3864052295684814, "sft_loss": 1.866620659828186, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 14.31104044382298, "learning_rate": 1.9103346309002623e-07, "logits/chosen": -0.15936878323554993, "logits/rejected": -0.06936420500278473, "logps/chosen": -1.8094561100006104, "logps/rejected": -3.239793062210083, "loss": 0.5843, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8094561100006104, "rewards/margins": 1.4303371906280518, "rewards/rejected": -3.239793062210083, "sft_loss": 1.8103708028793335, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 11.573837494960328, "learning_rate": 1.887581146163394e-07, "logits/chosen": -0.21960671246051788, "logits/rejected": -0.011276873759925365, "logps/chosen": -1.8357484340667725, "logps/rejected": -3.542323350906372, "loss": 0.5809, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8357484340667725, "rewards/margins": 1.7065750360488892, "rewards/rejected": -3.542323350906372, "sft_loss": 1.8496434688568115, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 13.238059687502597, "learning_rate": 1.8649548781717506e-07, "logits/chosen": -0.09515713155269623, "logits/rejected": 0.08226939290761948, "logps/chosen": -1.8203575611114502, "logps/rejected": -3.3756203651428223, "loss": 0.5381, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8203575611114502, "rewards/margins": 1.555262804031372, "rewards/rejected": -3.3756203651428223, "sft_loss": 1.8084625005722046, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 9.594684997285926, "learning_rate": 1.8424560464454891e-07, "logits/chosen": -0.2160584032535553, "logits/rejected": -0.02173597738146782, "logps/chosen": -1.7369788885116577, "logps/rejected": -3.13492488861084, "loss": 0.5581, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7369788885116577, "rewards/margins": 1.3979461193084717, "rewards/rejected": -3.13492488861084, "sft_loss": 1.8426496982574463, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.5046150088310242, "eval_logits/rejected": 0.6573245525360107, "eval_logps/chosen": -2.0441954135894775, "eval_logps/rejected": -3.071851968765259, "eval_loss": 0.7417393922805786, "eval_rewards/accuracies": 0.7017804384231567, "eval_rewards/chosen": -2.0441954135894775, "eval_rewards/margins": 1.0276561975479126, "eval_rewards/rejected": -3.071851968765259, "eval_runtime": 47.8029, "eval_samples_per_second": 28.136, "eval_sft_loss": 1.9637134075164795, "eval_steps_per_second": 7.05, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 12.447563754874988, "learning_rate": 1.820084869268369e-07, "logits/chosen": -0.21850493550300598, "logits/rejected": -0.04778672754764557, "logps/chosen": -1.7985130548477173, "logps/rejected": -3.3076224327087402, "loss": 0.562, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7985130548477173, "rewards/margins": 1.5091097354888916, "rewards/rejected": -3.3076224327087402, "sft_loss": 1.8247474431991577, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 13.0918948619155, "learning_rate": 1.7978415636856571e-07, "logits/chosen": -0.12160871177911758, "logits/rejected": 0.03960564360022545, "logps/chosen": -1.7965052127838135, "logps/rejected": -3.361992359161377, "loss": 0.5683, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7965052127838135, "rewards/margins": 1.5654871463775635, "rewards/rejected": -3.361992359161377, "sft_loss": 1.8337568044662476, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 13.7797932165533, "learning_rate": 1.7757263455019906e-07, "logits/chosen": -0.17800372838974, "logits/rejected": 0.06227899715304375, "logps/chosen": -1.6353908777236938, "logps/rejected": -3.2979979515075684, "loss": 0.5476, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6353908777236938, "rewards/margins": 1.6626074314117432, "rewards/rejected": -3.2979979515075684, "sft_loss": 1.723488211631775, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 18.837965669479228, "learning_rate": 1.7537394292793245e-07, "logits/chosen": -0.09702011197805405, "logits/rejected": 0.04160100966691971, "logps/chosen": -1.8251625299453735, "logps/rejected": -3.1852543354034424, "loss": 0.5801, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8251625299453735, "rewards/margins": 1.3600914478302002, "rewards/rejected": -3.1852543354034424, "sft_loss": 1.8348585367202759, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 10.661240642290903, "learning_rate": 1.731881028334808e-07, "logits/chosen": -0.14085690677165985, "logits/rejected": 0.08010586351156235, "logps/chosen": -1.7059447765350342, "logps/rejected": -3.2035980224609375, "loss": 0.5339, "rewards/accuracies": 0.875, "rewards/chosen": -1.7059447765350342, "rewards/margins": 1.4976532459259033, "rewards/rejected": -3.2035980224609375, "sft_loss": 1.7304388284683228, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 9.89564955553788, "learning_rate": 1.7101513547387487e-07, "logits/chosen": -0.19714686274528503, "logits/rejected": 0.04681064933538437, "logps/chosen": -1.7345222234725952, "logps/rejected": -3.2551662921905518, "loss": 0.5284, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7345222234725952, "rewards/margins": 1.5206438302993774, "rewards/rejected": -3.2551662921905518, "sft_loss": 1.7617794275283813, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 20.774074596573787, "learning_rate": 1.6885506193125306e-07, "logits/chosen": -0.309567391872406, "logits/rejected": -0.00635856669396162, "logps/chosen": -1.7985738515853882, "logps/rejected": -3.5609755516052246, "loss": 0.5236, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7985738515853882, "rewards/margins": 1.7624021768569946, "rewards/rejected": -3.5609755516052246, "sft_loss": 1.8604736328125, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 13.297055407552948, "learning_rate": 1.667079031626591e-07, "logits/chosen": -0.22957918047904968, "logits/rejected": 0.125446155667305, "logps/chosen": -1.7423770427703857, "logps/rejected": -3.4894371032714844, "loss": 0.51, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7423770427703857, "rewards/margins": 1.7470604181289673, "rewards/rejected": -3.4894371032714844, "sft_loss": 1.779083013534546, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 11.427636309390813, "learning_rate": 1.6457367999983568e-07, "logits/chosen": -0.15671579539775848, "logits/rejected": -0.06081641837954521, "logps/chosen": -1.7776477336883545, "logps/rejected": -3.3175289630889893, "loss": 0.5544, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7776477336883545, "rewards/margins": 1.5398811101913452, "rewards/rejected": -3.3175289630889893, "sft_loss": 1.8607282638549805, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 12.595993856207159, "learning_rate": 1.6245241314902604e-07, "logits/chosen": -0.32584601640701294, "logits/rejected": -0.05513492971658707, "logps/chosen": -1.7751858234405518, "logps/rejected": -3.486859083175659, "loss": 0.5335, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7751858234405518, "rewards/margins": 1.7116730213165283, "rewards/rejected": -3.486859083175659, "sft_loss": 1.793172836303711, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 12.650737116268983, "learning_rate": 1.6034412319077008e-07, "logits/chosen": -0.08883605897426605, "logits/rejected": 0.16650724411010742, "logps/chosen": -1.7191474437713623, "logps/rejected": -3.4780402183532715, "loss": 0.5493, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7191474437713623, "rewards/margins": 1.7588927745819092, "rewards/rejected": -3.4780402183532715, "sft_loss": 1.8166354894638062, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 10.588617379104743, "learning_rate": 1.582488305797068e-07, "logits/chosen": -0.13550075888633728, "logits/rejected": 0.012206335552036762, "logps/chosen": -1.6531245708465576, "logps/rejected": -3.258253574371338, "loss": 0.5075, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.6531245708465576, "rewards/margins": 1.6051290035247803, "rewards/rejected": -3.258253574371338, "sft_loss": 1.7562519311904907, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 9.513595547076909, "learning_rate": 1.5616655564437354e-07, "logits/chosen": -0.2985331416130066, "logits/rejected": -0.12871314585208893, "logps/chosen": -1.783439040184021, "logps/rejected": -3.5234837532043457, "loss": 0.5195, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.783439040184021, "rewards/margins": 1.7400442361831665, "rewards/rejected": -3.5234837532043457, "sft_loss": 1.803736925125122, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 10.722593488383868, "learning_rate": 1.5409731858701154e-07, "logits/chosen": -0.08731357753276825, "logits/rejected": 0.08228771388530731, "logps/chosen": -1.6720569133758545, "logps/rejected": -3.4969024658203125, "loss": 0.4825, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.6720569133758545, "rewards/margins": 1.8248456716537476, "rewards/rejected": -3.4969024658203125, "sft_loss": 1.6960939168930054, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 14.57535968621479, "learning_rate": 1.5204113948336717e-07, "logits/chosen": 0.012193548493087292, "logits/rejected": 0.1711004674434662, "logps/chosen": -1.70675528049469, "logps/rejected": -3.585901975631714, "loss": 0.51, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.70675528049469, "rewards/margins": 1.879146933555603, "rewards/rejected": -3.585901975631714, "sft_loss": 1.8017257452011108, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 9.169941429507046, "learning_rate": 1.499980382824997e-07, "logits/chosen": -0.07882848381996155, "logits/rejected": 0.1298879235982895, "logps/chosen": -1.7168487310409546, "logps/rejected": -3.5460994243621826, "loss": 0.5374, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7168487310409546, "rewards/margins": 1.8292505741119385, "rewards/rejected": -3.5460994243621826, "sft_loss": 1.80367910861969, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 13.128022621245504, "learning_rate": 1.479680348065855e-07, "logits/chosen": -0.06857812404632568, "logits/rejected": 0.012163696810603142, "logps/chosen": -1.852543592453003, "logps/rejected": -3.7211203575134277, "loss": 0.554, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.852543592453003, "rewards/margins": 1.868577241897583, "rewards/rejected": -3.7211203575134277, "sft_loss": 1.9871208667755127, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 11.92261447993522, "learning_rate": 1.4595114875072762e-07, "logits/chosen": -0.3111642003059387, "logits/rejected": -0.004455783870071173, "logps/chosen": -1.769470453262329, "logps/rejected": -3.4464504718780518, "loss": 0.554, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.769470453262329, "rewards/margins": 1.6769797801971436, "rewards/rejected": -3.4464504718780518, "sft_loss": 1.8530012369155884, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 12.377104674604613, "learning_rate": 1.4394739968276293e-07, "logits/chosen": -0.17811310291290283, "logits/rejected": -0.06515610218048096, "logps/chosen": -1.8002650737762451, "logps/rejected": -3.0800435543060303, "loss": 0.6032, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8002650737762451, "rewards/margins": 1.2797784805297852, "rewards/rejected": -3.0800435543060303, "sft_loss": 1.8826320171356201, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 9.965069281860176, "learning_rate": 1.4195680704307405e-07, "logits/chosen": -0.06343531608581543, "logits/rejected": 0.16841797530651093, "logps/chosen": -1.6909034252166748, "logps/rejected": -3.36090087890625, "loss": 0.5128, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.6909034252166748, "rewards/margins": 1.6699978113174438, "rewards/rejected": -3.36090087890625, "sft_loss": 1.759202241897583, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 11.377090498412477, "learning_rate": 1.3997939014439926e-07, "logits/chosen": -0.0794590562582016, "logits/rejected": 0.16901133954524994, "logps/chosen": -1.8307081460952759, "logps/rejected": -3.487328052520752, "loss": 0.5235, "rewards/accuracies": 0.875, "rewards/chosen": -1.8307081460952759, "rewards/margins": 1.6566200256347656, "rewards/rejected": -3.487328052520752, "sft_loss": 1.9024460315704346, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 11.126550537311987, "learning_rate": 1.380151681716465e-07, "logits/chosen": -0.10807999223470688, "logits/rejected": -0.16313369572162628, "logps/chosen": -1.82060968875885, "logps/rejected": -3.688711166381836, "loss": 0.5498, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.82060968875885, "rewards/margins": 1.868101716041565, "rewards/rejected": -3.688711166381836, "sft_loss": 1.8633701801300049, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 13.851309715159937, "learning_rate": 1.3606416018170502e-07, "logits/chosen": -0.1272238790988922, "logits/rejected": 0.0939817801117897, "logps/chosen": -1.6587791442871094, "logps/rejected": -3.3085055351257324, "loss": 0.5353, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6587791442871094, "rewards/margins": 1.6497268676757812, "rewards/rejected": -3.3085055351257324, "sft_loss": 1.749477744102478, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 11.026766281995744, "learning_rate": 1.3412638510326397e-07, "logits/chosen": -0.12887075543403625, "logits/rejected": 0.02745388075709343, "logps/chosen": -1.7454307079315186, "logps/rejected": -3.394977569580078, "loss": 0.5568, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7454307079315186, "rewards/margins": 1.6495468616485596, "rewards/rejected": -3.394977569580078, "sft_loss": 1.8268448114395142, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 14.997638937730077, "learning_rate": 1.3220186173662462e-07, "logits/chosen": -0.3330609202384949, "logits/rejected": 0.06650768965482712, "logps/chosen": -1.735339879989624, "logps/rejected": -3.4855971336364746, "loss": 0.537, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.735339879989624, "rewards/margins": 1.7502572536468506, "rewards/rejected": -3.4855971336364746, "sft_loss": 1.8361396789550781, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 13.814316314857964, "learning_rate": 1.30290608753522e-07, "logits/chosen": -0.1492258906364441, "logits/rejected": 0.14451850950717926, "logps/chosen": -1.8341033458709717, "logps/rejected": -3.687492847442627, "loss": 0.5149, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8341033458709717, "rewards/margins": 1.8533893823623657, "rewards/rejected": -3.687492847442627, "sft_loss": 1.835182547569275, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 17.351310846159475, "learning_rate": 1.2839264469694039e-07, "logits/chosen": -0.22598882019519806, "logits/rejected": 0.0706649050116539, "logps/chosen": -1.8030054569244385, "logps/rejected": -3.439702272415161, "loss": 0.5703, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8030054569244385, "rewards/margins": 1.6366968154907227, "rewards/rejected": -3.439702272415161, "sft_loss": 1.8270823955535889, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 13.03043262410308, "learning_rate": 1.2650798798093577e-07, "logits/chosen": -0.17963027954101562, "logits/rejected": -0.021970821544528008, "logps/chosen": -1.7476119995117188, "logps/rejected": -3.096475601196289, "loss": 0.5663, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7476119995117188, "rewards/margins": 1.3488636016845703, "rewards/rejected": -3.096475601196289, "sft_loss": 1.794965147972107, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 10.868616941883515, "learning_rate": 1.2463665689045533e-07, "logits/chosen": -0.15986979007720947, "logits/rejected": 0.1507270336151123, "logps/chosen": -1.740025281906128, "logps/rejected": -3.535895586013794, "loss": 0.5162, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.740025281906128, "rewards/margins": 1.7958704233169556, "rewards/rejected": -3.535895586013794, "sft_loss": 1.7941840887069702, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 12.772663086577158, "learning_rate": 1.2277866958116207e-07, "logits/chosen": -0.1350887268781662, "logits/rejected": 0.13923409581184387, "logps/chosen": -1.8032842874526978, "logps/rejected": -3.202432632446289, "loss": 0.5713, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8032842874526978, "rewards/margins": 1.3991484642028809, "rewards/rejected": -3.202432632446289, "sft_loss": 1.805890679359436, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 10.774743614579105, "learning_rate": 1.2093404407925668e-07, "logits/chosen": -0.1534949690103531, "logits/rejected": -0.10646752268075943, "logps/chosen": -1.8330596685409546, "logps/rejected": -3.3806934356689453, "loss": 0.5521, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8330596685409546, "rewards/margins": 1.5476337671279907, "rewards/rejected": -3.3806934356689453, "sft_loss": 1.9304492473602295, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 12.074809040302313, "learning_rate": 1.1910279828130405e-07, "logits/chosen": -0.09574166685342789, "logits/rejected": 0.05776409059762955, "logps/chosen": -1.7001692056655884, "logps/rejected": -3.1842257976531982, "loss": 0.5503, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7001692056655884, "rewards/margins": 1.484057068824768, "rewards/rejected": -3.1842257976531982, "sft_loss": 1.759682059288025, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 16.580193681475954, "learning_rate": 1.1728494995405876e-07, "logits/chosen": -0.22148558497428894, "logits/rejected": 0.0023852705489844084, "logps/chosen": -1.6515419483184814, "logps/rejected": -3.407543659210205, "loss": 0.5244, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6515419483184814, "rewards/margins": 1.7560014724731445, "rewards/rejected": -3.407543659210205, "sft_loss": 1.7370100021362305, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 18.67151226763764, "learning_rate": 1.1548051673429366e-07, "logits/chosen": -0.07205932587385178, "logits/rejected": 0.022912006825208664, "logps/chosen": -1.6140819787979126, "logps/rejected": -3.376591444015503, "loss": 0.5139, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6140819787979126, "rewards/margins": 1.7625093460083008, "rewards/rejected": -3.376591444015503, "sft_loss": 1.646588683128357, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 11.77285577417867, "learning_rate": 1.136895161286271e-07, "logits/chosen": -0.09294477105140686, "logits/rejected": -0.0116586210206151, "logps/chosen": -1.8401886224746704, "logps/rejected": -3.378535509109497, "loss": 0.5324, "rewards/accuracies": 0.875, "rewards/chosen": -1.8401886224746704, "rewards/margins": 1.538346767425537, "rewards/rejected": -3.378535509109497, "sft_loss": 1.8359954357147217, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 12.273981942089009, "learning_rate": 1.1191196551335547e-07, "logits/chosen": 0.015403158962726593, "logits/rejected": 0.08249323815107346, "logps/chosen": -1.9144912958145142, "logps/rejected": -3.440058946609497, "loss": 0.5829, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9144912958145142, "rewards/margins": 1.525567650794983, "rewards/rejected": -3.440058946609497, "sft_loss": 1.866431474685669, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 13.011751079955705, "learning_rate": 1.1014788213428206e-07, "logits/chosen": -0.10495848953723907, "logits/rejected": 0.18491685390472412, "logps/chosen": -1.7197036743164062, "logps/rejected": -3.4352753162384033, "loss": 0.5328, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7197036743164062, "rewards/margins": 1.715571641921997, "rewards/rejected": -3.4352753162384033, "sft_loss": 1.7532308101654053, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 13.754293706238059, "learning_rate": 1.08397283106552e-07, "logits/chosen": -0.29204261302948, "logits/rejected": -0.018084803596138954, "logps/chosen": -1.6970350742340088, "logps/rejected": -3.4211604595184326, "loss": 0.5056, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6970350742340088, "rewards/margins": 1.7241252660751343, "rewards/rejected": -3.4211604595184326, "sft_loss": 1.7624849081039429, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 10.523455836760576, "learning_rate": 1.0666018541448442e-07, "logits/chosen": -0.19858674705028534, "logits/rejected": -0.20855844020843506, "logps/chosen": -1.7655675411224365, "logps/rejected": -3.197678804397583, "loss": 0.561, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7655675411224365, "rewards/margins": 1.4321115016937256, "rewards/rejected": -3.197678804397583, "sft_loss": 1.834460973739624, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 11.925629650215713, "learning_rate": 1.0493660591140919e-07, "logits/chosen": -0.15942314267158508, "logits/rejected": -0.09754550457000732, "logps/chosen": -1.831282615661621, "logps/rejected": -3.3784000873565674, "loss": 0.5714, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.831282615661621, "rewards/margins": 1.5471172332763672, "rewards/rejected": -3.3784000873565674, "sft_loss": 1.872881293296814, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 11.29527891765967, "learning_rate": 1.0322656131950165e-07, "logits/chosen": -0.05539718270301819, "logits/rejected": 0.03757456690073013, "logps/chosen": -1.790489912033081, "logps/rejected": -3.222160816192627, "loss": 0.5455, "rewards/accuracies": 0.84375, "rewards/chosen": -1.790489912033081, "rewards/margins": 1.431671142578125, "rewards/rejected": -3.222160816192627, "sft_loss": 1.7970340251922607, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 14.081814143998702, "learning_rate": 1.0153006822962246e-07, "logits/chosen": -0.029426341876387596, "logits/rejected": 0.08542537689208984, "logps/chosen": -1.8488479852676392, "logps/rejected": -3.433168888092041, "loss": 0.5694, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8488479852676392, "rewards/margins": 1.5843212604522705, "rewards/rejected": -3.433168888092041, "sft_loss": 1.8777506351470947, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 15.286550642554765, "learning_rate": 9.984714310115434e-08, "logits/chosen": -0.17160436511039734, "logits/rejected": -0.05323296785354614, "logps/chosen": -1.9492241144180298, "logps/rejected": -3.5698063373565674, "loss": 0.5472, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9492241144180298, "rewards/margins": 1.6205823421478271, "rewards/rejected": -3.5698063373565674, "sft_loss": 1.795431137084961, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 19.699631445509663, "learning_rate": 9.817780226184509e-08, "logits/chosen": -0.21670648455619812, "logits/rejected": 0.16964387893676758, "logps/chosen": -1.7380412817001343, "logps/rejected": -3.4298624992370605, "loss": 0.5178, "rewards/accuracies": 0.875, "rewards/chosen": -1.7380412817001343, "rewards/margins": 1.6918213367462158, "rewards/rejected": -3.4298624992370605, "sft_loss": 1.7912161350250244, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 9.77781571435363, "learning_rate": 9.652206190764611e-08, "logits/chosen": -0.2179642617702484, "logits/rejected": 0.004683566279709339, "logps/chosen": -1.7119777202606201, "logps/rejected": -3.185180187225342, "loss": 0.5484, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7119777202606201, "rewards/margins": 1.4732027053833008, "rewards/rejected": -3.185180187225342, "sft_loss": 1.7350578308105469, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 13.703589343704314, "learning_rate": 9.487993810255823e-08, "logits/chosen": -0.1822083592414856, "logits/rejected": -0.05674133449792862, "logps/chosen": -1.7465932369232178, "logps/rejected": -3.5221877098083496, "loss": 0.5369, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7465932369232178, "rewards/margins": 1.775594711303711, "rewards/rejected": -3.5221877098083496, "sft_loss": 1.764044165611267, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 16.015438120699894, "learning_rate": 9.325144677847325e-08, "logits/chosen": -0.1478969156742096, "logits/rejected": 0.017214369028806686, "logps/chosen": -1.8455041646957397, "logps/rejected": -3.473567247390747, "loss": 0.5391, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8455041646957397, "rewards/margins": 1.6280628442764282, "rewards/rejected": -3.473567247390747, "sft_loss": 1.917488694190979, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 13.2177412605901, "learning_rate": 9.163660373502158e-08, "logits/chosen": 0.07100075483322144, "logits/rejected": -0.0008317396277561784, "logps/chosen": -1.8566402196884155, "logps/rejected": -3.364086866378784, "loss": 0.5781, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8566402196884155, "rewards/margins": 1.5074464082717896, "rewards/rejected": -3.364086866378784, "sft_loss": 1.8414785861968994, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 26.07751623268177, "learning_rate": 9.003542463941711e-08, "logits/chosen": -0.010251370258629322, "logits/rejected": -0.03044726513326168, "logps/chosen": -1.6916160583496094, "logps/rejected": -3.302567720413208, "loss": 0.5536, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6916160583496094, "rewards/margins": 1.6109517812728882, "rewards/rejected": -3.302567720413208, "sft_loss": 1.7145153284072876, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 8.841371119449688, "learning_rate": 8.844792502630705e-08, "logits/chosen": -0.12310369312763214, "logits/rejected": -0.0008263051277026534, "logps/chosen": -1.614935278892517, "logps/rejected": -3.2148826122283936, "loss": 0.5031, "rewards/accuracies": 0.875, "rewards/chosen": -1.614935278892517, "rewards/margins": 1.599947214126587, "rewards/rejected": -3.2148826122283936, "sft_loss": 1.6776469945907593, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 9.37571104363395, "learning_rate": 8.687412029761866e-08, "logits/chosen": -0.2811339497566223, "logits/rejected": -0.1394878774881363, "logps/chosen": -1.6294755935668945, "logps/rejected": -3.3096911907196045, "loss": 0.5056, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6294755935668945, "rewards/margins": 1.680215835571289, "rewards/rejected": -3.3096911907196045, "sft_loss": 1.667616605758667, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 12.865506560573404, "learning_rate": 8.531402572241325e-08, "logits/chosen": -0.09491724520921707, "logits/rejected": 0.00872437097132206, "logps/chosen": -1.6955277919769287, "logps/rejected": -3.08903169631958, "loss": 0.5858, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6955277919769287, "rewards/margins": 1.3935037851333618, "rewards/rejected": -3.08903169631958, "sft_loss": 1.7838642597198486, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 16.708463944884052, "learning_rate": 8.376765643673462e-08, "logits/chosen": -0.15276072919368744, "logits/rejected": 0.24526353180408478, "logps/chosen": -1.7361557483673096, "logps/rejected": -3.1856894493103027, "loss": 0.5386, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7361557483673096, "rewards/margins": 1.4495340585708618, "rewards/rejected": -3.1856894493103027, "sft_loss": 1.7658565044403076, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 9.915045609735312, "learning_rate": 8.223502744346484e-08, "logits/chosen": -0.04342980682849884, "logits/rejected": 0.1702241152524948, "logps/chosen": -1.6896997690200806, "logps/rejected": -3.071929454803467, "loss": 0.5622, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6896997690200806, "rewards/margins": 1.3822300434112549, "rewards/rejected": -3.071929454803467, "sft_loss": 1.7686045169830322, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 17.72415181893815, "learning_rate": 8.071615361217648e-08, "logits/chosen": -0.13096554577350616, "logits/rejected": -0.029258519411087036, "logps/chosen": -1.664790391921997, "logps/rejected": -2.8216605186462402, "loss": 0.6285, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.664790391921997, "rewards/margins": 1.1568701267242432, "rewards/rejected": -2.8216605186462402, "sft_loss": 1.7525379657745361, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 11.68405514912924, "learning_rate": 7.92110496789909e-08, "logits/chosen": -0.2031552791595459, "logits/rejected": 0.032610904425382614, "logps/chosen": -1.7019561529159546, "logps/rejected": -3.2062244415283203, "loss": 0.5326, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7019561529159546, "rewards/margins": 1.5042685270309448, "rewards/rejected": -3.2062244415283203, "sft_loss": 1.7459650039672852, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 15.34951904415885, "learning_rate": 7.771973024643241e-08, "logits/chosen": -0.2016465663909912, "logits/rejected": -0.010524836368858814, "logps/chosen": -1.6697314977645874, "logps/rejected": -3.583134174346924, "loss": 0.4503, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.6697314977645874, "rewards/margins": 1.913402795791626, "rewards/rejected": -3.583134174346924, "sft_loss": 1.6881519556045532, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 14.828298746181426, "learning_rate": 7.624220978328905e-08, "logits/chosen": -0.3352576792240143, "logits/rejected": -0.045596785843372345, "logps/chosen": -1.755860686302185, "logps/rejected": -3.416865825653076, "loss": 0.5331, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.755860686302185, "rewards/margins": 1.6610050201416016, "rewards/rejected": -3.416865825653076, "sft_loss": 1.8021215200424194, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 11.251154843255891, "learning_rate": 7.477850262447056e-08, "logits/chosen": -0.30058687925338745, "logits/rejected": 0.04834729805588722, "logps/chosen": -1.6716951131820679, "logps/rejected": -3.47686505317688, "loss": 0.5118, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.6716951131820679, "rewards/margins": 1.8051700592041016, "rewards/rejected": -3.47686505317688, "sft_loss": 1.7458328008651733, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 11.258024242328093, "learning_rate": 7.332862297087073e-08, "logits/chosen": -0.08056751638650894, "logits/rejected": 0.16363118588924408, "logps/chosen": -1.72183358669281, "logps/rejected": -3.6885628700256348, "loss": 0.5159, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.72183358669281, "rewards/margins": 1.9667285680770874, "rewards/rejected": -3.6885628700256348, "sft_loss": 1.7538646459579468, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 15.774492797225584, "learning_rate": 7.189258488922768e-08, "logits/chosen": -0.0924944058060646, "logits/rejected": 0.16677220165729523, "logps/chosen": -1.7519547939300537, "logps/rejected": -3.342869997024536, "loss": 0.5233, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7519547939300537, "rewards/margins": 1.590915322303772, "rewards/rejected": -3.342869997024536, "sft_loss": 1.78385329246521, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 14.513797741987641, "learning_rate": 7.047040231198959e-08, "logits/chosen": -0.16937026381492615, "logits/rejected": 0.014137727208435535, "logps/chosen": -1.7294788360595703, "logps/rejected": -3.3136649131774902, "loss": 0.5617, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7294788360595703, "rewards/margins": 1.5841859579086304, "rewards/rejected": -3.3136649131774902, "sft_loss": 1.752410650253296, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 13.259953642172649, "learning_rate": 6.906208903717787e-08, "logits/chosen": -0.27336111664772034, "logits/rejected": 0.07937004417181015, "logps/chosen": -1.7311985492706299, "logps/rejected": -3.4800212383270264, "loss": 0.5154, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7311985492706299, "rewards/margins": 1.7488229274749756, "rewards/rejected": -3.4800212383270264, "sft_loss": 1.7407493591308594, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 17.246375245141014, "learning_rate": 6.76676587282542e-08, "logits/chosen": -0.18381229043006897, "logits/rejected": -0.07914048433303833, "logps/chosen": -1.8589649200439453, "logps/rejected": -3.4577198028564453, "loss": 0.5432, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8589649200439453, "rewards/margins": 1.598755121231079, "rewards/rejected": -3.4577198028564453, "sft_loss": 1.891333818435669, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 16.4748277084828, "learning_rate": 6.628712491398736e-08, "logits/chosen": -0.33703577518463135, "logits/rejected": 0.010515051893889904, "logps/chosen": -1.694253921508789, "logps/rejected": -3.272106170654297, "loss": 0.5357, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.694253921508789, "rewards/margins": 1.5778522491455078, "rewards/rejected": -3.272106170654297, "sft_loss": 1.8197418451309204, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 13.145713348716752, "learning_rate": 6.492050098832281e-08, "logits/chosen": -0.33598920702934265, "logits/rejected": 0.005089169833809137, "logps/chosen": -1.788678765296936, "logps/rejected": -3.5194950103759766, "loss": 0.5391, "rewards/accuracies": 0.84375, "rewards/chosen": -1.788678765296936, "rewards/margins": 1.7308164834976196, "rewards/rejected": -3.5194950103759766, "sft_loss": 1.8638890981674194, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 15.523976451818084, "learning_rate": 6.356780021025161e-08, "logits/chosen": -0.018414665013551712, "logits/rejected": 0.07146365940570831, "logps/chosen": -1.743440866470337, "logps/rejected": -3.2033050060272217, "loss": 0.5697, "rewards/accuracies": 0.84375, "rewards/chosen": -1.743440866470337, "rewards/margins": 1.4598641395568848, "rewards/rejected": -3.2033050060272217, "sft_loss": 1.8230106830596924, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 8.28765342796906, "learning_rate": 6.222903570368288e-08, "logits/chosen": -0.09992040693759918, "logits/rejected": 0.11682212352752686, "logps/chosen": -1.8085787296295166, "logps/rejected": -3.3187828063964844, "loss": 0.5627, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8085787296295166, "rewards/margins": 1.5102041959762573, "rewards/rejected": -3.3187828063964844, "sft_loss": 1.8448479175567627, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 12.553216743920622, "learning_rate": 6.090422045731525e-08, "logits/chosen": -0.11003341525793076, "logits/rejected": 0.10952264070510864, "logps/chosen": -1.7515653371810913, "logps/rejected": -3.2995238304138184, "loss": 0.5639, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7515653371810913, "rewards/margins": 1.5479586124420166, "rewards/rejected": -3.2995238304138184, "sft_loss": 1.832606315612793, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 21.07056459916169, "learning_rate": 5.9593367324512593e-08, "logits/chosen": -0.203813835978508, "logits/rejected": -0.0018882930744439363, "logps/chosen": -1.6920894384384155, "logps/rejected": -3.2762725353240967, "loss": 0.5429, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6920894384384155, "rewards/margins": 1.5841830968856812, "rewards/rejected": -3.2762725353240967, "sft_loss": 1.742953896522522, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 11.540131753363577, "learning_rate": 5.8296489023177305e-08, "logits/chosen": -0.22834794223308563, "logits/rejected": -0.0977824255824089, "logps/chosen": -1.820873498916626, "logps/rejected": -3.3625118732452393, "loss": 0.5374, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.820873498916626, "rewards/margins": 1.5416386127471924, "rewards/rejected": -3.3625118732452393, "sft_loss": 1.9106184244155884, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 6.092889140852963, "learning_rate": 5.7013598135628895e-08, "logits/chosen": -0.09903047978878021, "logits/rejected": -0.05335770174860954, "logps/chosen": -1.6918855905532837, "logps/rejected": -3.4326744079589844, "loss": 0.5241, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6918855905532837, "rewards/margins": 1.740788459777832, "rewards/rejected": -3.4326744079589844, "sft_loss": 1.7823829650878906, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 11.178889716782178, "learning_rate": 5.5744707108479784e-08, "logits/chosen": -0.15962809324264526, "logits/rejected": 0.14945785701274872, "logps/chosen": -1.6994130611419678, "logps/rejected": -3.3238296508789062, "loss": 0.5323, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6994130611419678, "rewards/margins": 1.6244163513183594, "rewards/rejected": -3.3238296508789062, "sft_loss": 1.7236995697021484, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 12.037210771580455, "learning_rate": 5.448982825251686e-08, "logits/chosen": -0.16476576030254364, "logits/rejected": 0.023226696997880936, "logps/chosen": -1.7827539443969727, "logps/rejected": -3.496924638748169, "loss": 0.5224, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7827539443969727, "rewards/margins": 1.7141706943511963, "rewards/rejected": -3.496924638748169, "sft_loss": 1.888169527053833, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 13.048210566572077, "learning_rate": 5.324897374257959e-08, "logits/chosen": -0.12402637302875519, "logits/rejected": -0.0067833187058568, "logps/chosen": -1.8013607263565063, "logps/rejected": -3.5576987266540527, "loss": 0.5134, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.8013607263565063, "rewards/margins": 1.756338119506836, "rewards/rejected": -3.5576987266540527, "sft_loss": 1.7914527654647827, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 13.314492118569552, "learning_rate": 5.202215561744461e-08, "logits/chosen": -0.056083958595991135, "logits/rejected": 0.0518193356692791, "logps/chosen": -1.8286384344100952, "logps/rejected": -3.307530164718628, "loss": 0.5716, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8286384344100952, "rewards/margins": 1.478891372680664, "rewards/rejected": -3.307530164718628, "sft_loss": 1.9314041137695312, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 9.782592586506926, "learning_rate": 5.080938577970617e-08, "logits/chosen": -0.1079416424036026, "logits/rejected": 0.12869928777217865, "logps/chosen": -1.6902978420257568, "logps/rejected": -3.489121913909912, "loss": 0.5593, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6902978420257568, "rewards/margins": 1.7988240718841553, "rewards/rejected": -3.489121913909912, "sft_loss": 1.781044602394104, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 16.772587561429965, "learning_rate": 4.961067599566305e-08, "logits/chosen": -0.26666679978370667, "logits/rejected": 0.0012994721764698625, "logps/chosen": -1.7255357503890991, "logps/rejected": -3.4891388416290283, "loss": 0.5316, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7255357503890991, "rewards/margins": 1.7636029720306396, "rewards/rejected": -3.4891388416290283, "sft_loss": 1.8183187246322632, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 24.35972300356556, "learning_rate": 4.8426037895202277e-08, "logits/chosen": -0.10224726051092148, "logits/rejected": 0.12815634906291962, "logps/chosen": -1.7485454082489014, "logps/rejected": -3.513535976409912, "loss": 0.502, "rewards/accuracies": 0.875, "rewards/chosen": -1.7485454082489014, "rewards/margins": 1.7649905681610107, "rewards/rejected": -3.513535976409912, "sft_loss": 1.8304626941680908, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 16.647665933842195, "learning_rate": 4.725548297168847e-08, "logits/chosen": -0.2706056237220764, "logits/rejected": -0.015289786271750927, "logps/chosen": -1.7003743648529053, "logps/rejected": -3.4611706733703613, "loss": 0.5281, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7003743648529053, "rewards/margins": 1.7607961893081665, "rewards/rejected": -3.4611706733703613, "sft_loss": 1.7916524410247803, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.4523681402206421, "eval_logits/rejected": 0.6008379459381104, "eval_logps/chosen": -2.066589593887329, "eval_logps/rejected": -3.1211626529693604, "eval_loss": 0.7446607351303101, "eval_rewards/accuracies": 0.7032641172409058, "eval_rewards/chosen": -2.066589593887329, "eval_rewards/margins": 1.0545730590820312, "eval_rewards/rejected": -3.1211626529693604, "eval_runtime": 48.5224, "eval_samples_per_second": 27.719, "eval_sft_loss": 1.9814122915267944, "eval_steps_per_second": 6.945, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 9.222960461789413, "learning_rate": 4.609902258185017e-08, "logits/chosen": -0.09446149319410324, "logits/rejected": -0.028069287538528442, "logps/chosen": -1.7660013437271118, "logps/rejected": -3.2970519065856934, "loss": 0.5409, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7660013437271118, "rewards/margins": 1.531050682067871, "rewards/rejected": -3.2970519065856934, "sft_loss": 1.7716779708862305, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 9.94103612508856, "learning_rate": 4.4956667945671496e-08, "logits/chosen": -0.17274287343025208, "logits/rejected": 0.008575853891670704, "logps/chosen": -1.743222951889038, "logps/rejected": -3.646514415740967, "loss": 0.4995, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.743222951889038, "rewards/margins": 1.9032917022705078, "rewards/rejected": -3.646514415740967, "sft_loss": 1.7744395732879639, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 12.056605720430364, "learning_rate": 4.382843014628168e-08, "logits/chosen": -0.15395765006542206, "logits/rejected": -0.012604189105331898, "logps/chosen": -1.720470666885376, "logps/rejected": -3.3299014568328857, "loss": 0.5348, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.720470666885376, "rewards/margins": 1.6094309091567993, "rewards/rejected": -3.3299014568328857, "sft_loss": 1.7798278331756592, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 10.60887867025444, "learning_rate": 4.271432012984938e-08, "logits/chosen": -0.1706741452217102, "logits/rejected": -0.04382907226681709, "logps/chosen": -1.7484376430511475, "logps/rejected": -3.6875336170196533, "loss": 0.4884, "rewards/accuracies": 0.875, "rewards/chosen": -1.7484376430511475, "rewards/margins": 1.9390960931777954, "rewards/rejected": -3.6875336170196533, "sft_loss": 1.8463385105133057, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 12.901398096306242, "learning_rate": 4.1614348705474534e-08, "logits/chosen": -0.08455139398574829, "logits/rejected": 0.15452969074249268, "logps/chosen": -1.8069339990615845, "logps/rejected": -3.600315570831299, "loss": 0.5342, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8069339990615845, "rewards/margins": 1.793381690979004, "rewards/rejected": -3.600315570831299, "sft_loss": 1.8492975234985352, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 9.586087613823814, "learning_rate": 4.052852654508482e-08, "logits/chosen": -0.30744099617004395, "logits/rejected": -0.060971058905124664, "logps/chosen": -1.7557405233383179, "logps/rejected": -3.377889633178711, "loss": 0.5243, "rewards/accuracies": 0.875, "rewards/chosen": -1.7557405233383179, "rewards/margins": 1.6221487522125244, "rewards/rejected": -3.377889633178711, "sft_loss": 1.7627824544906616, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 13.932972673755787, "learning_rate": 3.9456864183331557e-08, "logits/chosen": -0.23899254202842712, "logits/rejected": -0.01942940428853035, "logps/chosen": -1.7661815881729126, "logps/rejected": -3.4561429023742676, "loss": 0.5015, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7661815881729126, "rewards/margins": 1.6899614334106445, "rewards/rejected": -3.4561429023742676, "sft_loss": 1.7765686511993408, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 18.164615491069522, "learning_rate": 3.839937201748744e-08, "logits/chosen": -0.2424912005662918, "logits/rejected": 0.09279437363147736, "logps/chosen": -1.8523527383804321, "logps/rejected": -3.6534671783447266, "loss": 0.5522, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8523527383804321, "rewards/margins": 1.8011143207550049, "rewards/rejected": -3.6534671783447266, "sft_loss": 1.8589980602264404, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 11.459943642977123, "learning_rate": 3.735606030734651e-08, "logits/chosen": -0.16552621126174927, "logits/rejected": -0.05069739744067192, "logps/chosen": -1.7162492275238037, "logps/rejected": -3.163618564605713, "loss": 0.5858, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7162492275238037, "rewards/margins": 1.4473693370819092, "rewards/rejected": -3.163618564605713, "sft_loss": 1.7754487991333008, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 20.628405186287445, "learning_rate": 3.632693917512331e-08, "logits/chosen": -0.25471025705337524, "logits/rejected": -0.04566922411322594, "logps/chosen": -1.8308833837509155, "logps/rejected": -3.5067691802978516, "loss": 0.5753, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8308833837509155, "rewards/margins": 1.6758854389190674, "rewards/rejected": -3.5067691802978516, "sft_loss": 1.892290711402893, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 11.163614574089873, "learning_rate": 3.531201860535588e-08, "logits/chosen": -0.23860302567481995, "logits/rejected": 0.1082988828420639, "logps/chosen": -1.8234989643096924, "logps/rejected": -3.4302475452423096, "loss": 0.5458, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8234989643096924, "rewards/margins": 1.6067485809326172, "rewards/rejected": -3.4302475452423096, "sft_loss": 1.834429144859314, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 15.540488575948466, "learning_rate": 3.431130844480762e-08, "logits/chosen": -0.11743469536304474, "logits/rejected": -0.007621115539222956, "logps/chosen": -1.7462657690048218, "logps/rejected": -3.396251678466797, "loss": 0.5565, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7462657690048218, "rewards/margins": 1.6499862670898438, "rewards/rejected": -3.396251678466797, "sft_loss": 1.870222806930542, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 10.605155490489334, "learning_rate": 3.332481840237306e-08, "logits/chosen": -0.3517284691333771, "logits/rejected": 0.02356250211596489, "logps/chosen": -1.9717661142349243, "logps/rejected": -3.6357693672180176, "loss": 0.579, "rewards/accuracies": 0.84375, "rewards/chosen": -1.9717661142349243, "rewards/margins": 1.6640033721923828, "rewards/rejected": -3.6357693672180176, "sft_loss": 2.0109448432922363, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 13.903346355540794, "learning_rate": 3.235255804898307e-08, "logits/chosen": -0.12415225803852081, "logits/rejected": 0.09575439989566803, "logps/chosen": -1.6956020593643188, "logps/rejected": -3.378117799758911, "loss": 0.4932, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6956020593643188, "rewards/margins": 1.6825157403945923, "rewards/rejected": -3.378117799758911, "sft_loss": 1.7468980550765991, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 13.282523255632926, "learning_rate": 3.1394536817511475e-08, "logits/chosen": -0.13596105575561523, "logits/rejected": 0.08177070319652557, "logps/chosen": -1.8514257669448853, "logps/rejected": -3.500671863555908, "loss": 0.5372, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.8514257669448853, "rewards/margins": 1.6492455005645752, "rewards/rejected": -3.500671863555908, "sft_loss": 1.8731330633163452, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 11.71948389715706, "learning_rate": 3.0450764002684926e-08, "logits/chosen": -0.16972532868385315, "logits/rejected": 0.15765061974525452, "logps/chosen": -1.8777910470962524, "logps/rejected": -3.807753086090088, "loss": 0.496, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8777910470962524, "rewards/margins": 1.929961919784546, "rewards/rejected": -3.807753086090088, "sft_loss": 1.8987195491790771, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 13.747807261958386, "learning_rate": 2.9521248760991158e-08, "logits/chosen": -0.21182194352149963, "logits/rejected": -0.03415367752313614, "logps/chosen": -1.7448492050170898, "logps/rejected": -3.524975538253784, "loss": 0.523, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7448492050170898, "rewards/margins": 1.7801263332366943, "rewards/rejected": -3.524975538253784, "sft_loss": 1.7703666687011719, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 8.872612705144913, "learning_rate": 2.8606000110591224e-08, "logits/chosen": -0.1471521556377411, "logits/rejected": 0.08794516324996948, "logps/chosen": -1.7258751392364502, "logps/rejected": -3.201476573944092, "loss": 0.5474, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7258751392364502, "rewards/margins": 1.4756014347076416, "rewards/rejected": -3.201476573944092, "sft_loss": 1.788822889328003, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 8.382828109806677, "learning_rate": 2.770502693123139e-08, "logits/chosen": -0.27462801337242126, "logits/rejected": 0.04605941101908684, "logps/chosen": -1.8597522974014282, "logps/rejected": -3.6685824394226074, "loss": 0.5173, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8597522974014282, "rewards/margins": 1.8088302612304688, "rewards/rejected": -3.6685824394226074, "sft_loss": 1.9247268438339233, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 14.595090230431929, "learning_rate": 2.6818337964157726e-08, "logits/chosen": -0.13967612385749817, "logits/rejected": -0.043948955833911896, "logps/chosen": -1.7874114513397217, "logps/rejected": -3.5716145038604736, "loss": 0.4983, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7874114513397217, "rewards/margins": 1.7842029333114624, "rewards/rejected": -3.5716145038604736, "sft_loss": 1.790167212486267, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 12.277514913585133, "learning_rate": 2.5945941812029973e-08, "logits/chosen": -0.1475212275981903, "logits/rejected": 0.059523582458496094, "logps/chosen": -1.814934492111206, "logps/rejected": -3.316126585006714, "loss": 0.574, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.814934492111206, "rewards/margins": 1.5011920928955078, "rewards/rejected": -3.316126585006714, "sft_loss": 1.8950258493423462, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 10.411890019919085, "learning_rate": 2.5087846938839976e-08, "logits/chosen": -0.38515740633010864, "logits/rejected": 0.014054941944777966, "logps/chosen": -1.7699428796768188, "logps/rejected": -3.6067757606506348, "loss": 0.538, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7699428796768188, "rewards/margins": 1.8368332386016846, "rewards/rejected": -3.6067757606506348, "sft_loss": 1.8175370693206787, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 15.027736125517917, "learning_rate": 2.42440616698274e-08, "logits/chosen": -0.10710807144641876, "logits/rejected": 0.15287144482135773, "logps/chosen": -1.7904659509658813, "logps/rejected": -3.299647569656372, "loss": 0.5349, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7904659509658813, "rewards/margins": 1.5091816186904907, "rewards/rejected": -3.299647569656372, "sft_loss": 1.8567641973495483, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 12.107039402674326, "learning_rate": 2.3414594191401128e-08, "logits/chosen": -0.10171730816364288, "logits/rejected": -0.007555614225566387, "logps/chosen": -1.7247684001922607, "logps/rejected": -3.2738890647888184, "loss": 0.5393, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7247684001922607, "rewards/margins": 1.5491206645965576, "rewards/rejected": -3.2738890647888184, "sft_loss": 1.7477623224258423, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 8.953171514879182, "learning_rate": 2.2599452551057998e-08, "logits/chosen": -0.12288618087768555, "logits/rejected": 0.1192425936460495, "logps/chosen": -1.8108208179473877, "logps/rejected": -3.600740432739258, "loss": 0.4982, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8108208179473877, "rewards/margins": 1.7899196147918701, "rewards/rejected": -3.600740432739258, "sft_loss": 1.8616046905517578, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 17.200615482379387, "learning_rate": 2.1798644657305857e-08, "logits/chosen": -0.02967996522784233, "logits/rejected": 0.12160022556781769, "logps/chosen": -1.6938245296478271, "logps/rejected": -3.458918333053589, "loss": 0.5273, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6938245296478271, "rewards/margins": 1.7650935649871826, "rewards/rejected": -3.458918333053589, "sft_loss": 1.7733080387115479, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 10.512460460524174, "learning_rate": 2.1012178279586293e-08, "logits/chosen": 0.0147629976272583, "logits/rejected": -0.05085957795381546, "logps/chosen": -1.692972183227539, "logps/rejected": -3.099750518798828, "loss": 0.5882, "rewards/accuracies": 0.8125, "rewards/chosen": -1.692972183227539, "rewards/margins": 1.406778335571289, "rewards/rejected": -3.099750518798828, "sft_loss": 1.752136468887329, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 18.087881816117683, "learning_rate": 2.02400610481997e-08, "logits/chosen": -0.0952359288930893, "logits/rejected": -0.03134078532457352, "logps/chosen": -1.7446550130844116, "logps/rejected": -3.2378089427948, "loss": 0.5408, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7446550130844116, "rewards/margins": 1.4931542873382568, "rewards/rejected": -3.2378089427948, "sft_loss": 1.7278072834014893, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 13.153300138418738, "learning_rate": 1.948230045423083e-08, "logits/chosen": -0.27102628350257874, "logits/rejected": 0.03965846449136734, "logps/chosen": -1.6783840656280518, "logps/rejected": -3.341630220413208, "loss": 0.5003, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.6783840656280518, "rewards/margins": 1.6632461547851562, "rewards/rejected": -3.341630220413208, "sft_loss": 1.7147563695907593, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 13.974310829234309, "learning_rate": 1.8738903849476186e-08, "logits/chosen": -0.037626512348651886, "logits/rejected": -0.05883268639445305, "logps/chosen": -1.848806381225586, "logps/rejected": -3.4039673805236816, "loss": 0.571, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.848806381225586, "rewards/margins": 1.5551612377166748, "rewards/rejected": -3.4039673805236816, "sft_loss": 1.8302112817764282, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 11.59311245393821, "learning_rate": 1.8009878446373083e-08, "logits/chosen": -0.17010925710201263, "logits/rejected": -0.07235264778137207, "logps/chosen": -1.8091201782226562, "logps/rejected": -3.298564910888672, "loss": 0.5749, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8091201782226562, "rewards/margins": 1.4894448518753052, "rewards/rejected": -3.298564910888672, "sft_loss": 1.8607683181762695, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 13.035037630640272, "learning_rate": 1.729523131792887e-08, "logits/chosen": -0.21425171196460724, "logits/rejected": 0.10696268081665039, "logps/chosen": -1.7733795642852783, "logps/rejected": -3.222611904144287, "loss": 0.5998, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7733795642852783, "rewards/margins": 1.4492326974868774, "rewards/rejected": -3.222611904144287, "sft_loss": 1.8633842468261719, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 9.402984291729082, "learning_rate": 1.6594969397653316e-08, "logits/chosen": -0.2268737107515335, "logits/rejected": 0.0147031145170331, "logps/chosen": -1.7411689758300781, "logps/rejected": -3.4823131561279297, "loss": 0.5154, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7411689758300781, "rewards/margins": 1.7411444187164307, "rewards/rejected": -3.4823131561279297, "sft_loss": 1.8098264932632446, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 15.404025802231084, "learning_rate": 1.5909099479490653e-08, "logits/chosen": -0.043330464512109756, "logits/rejected": -0.0020337612368166447, "logps/chosen": -1.748051643371582, "logps/rejected": -3.0739541053771973, "loss": 0.5648, "rewards/accuracies": 0.84375, "rewards/chosen": -1.748051643371582, "rewards/margins": 1.3259022235870361, "rewards/rejected": -3.0739541053771973, "sft_loss": 1.7951023578643799, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 16.918432357629232, "learning_rate": 1.5237628217753818e-08, "logits/chosen": -0.12749573588371277, "logits/rejected": -0.018352797254920006, "logps/chosen": -1.6635348796844482, "logps/rejected": -3.588423252105713, "loss": 0.519, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6635348796844482, "rewards/margins": 1.924888253211975, "rewards/rejected": -3.588423252105713, "sft_loss": 1.7674754858016968, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 11.488033708406595, "learning_rate": 1.4580562127059994e-08, "logits/chosen": -0.24728181958198547, "logits/rejected": 0.15568208694458008, "logps/chosen": -1.9034550189971924, "logps/rejected": -3.6897387504577637, "loss": 0.5484, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.9034550189971924, "rewards/margins": 1.7862837314605713, "rewards/rejected": -3.6897387504577637, "sft_loss": 1.9504950046539307, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 8.791458916777316, "learning_rate": 1.3937907582267151e-08, "logits/chosen": -0.05900830030441284, "logits/rejected": 0.0781247466802597, "logps/chosen": -1.6860978603363037, "logps/rejected": -3.2836456298828125, "loss": 0.5213, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.6860978603363037, "rewards/margins": 1.5975478887557983, "rewards/rejected": -3.2836456298828125, "sft_loss": 1.7643423080444336, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 13.08532454428947, "learning_rate": 1.3309670818412446e-08, "logits/chosen": -0.1450091302394867, "logits/rejected": 0.0737738236784935, "logps/chosen": -1.845995545387268, "logps/rejected": -3.287126064300537, "loss": 0.5868, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.845995545387268, "rewards/margins": 1.4411306381225586, "rewards/rejected": -3.287126064300537, "sft_loss": 1.9166057109832764, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 11.68997742530587, "learning_rate": 1.2695857930651921e-08, "logits/chosen": -0.3569498658180237, "logits/rejected": 0.0017510965699329972, "logps/chosen": -1.696563959121704, "logps/rejected": -3.3799500465393066, "loss": 0.4984, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.696563959121704, "rewards/margins": 1.683386206626892, "rewards/rejected": -3.3799500465393066, "sft_loss": 1.765641450881958, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 11.348280682159059, "learning_rate": 1.2096474874200735e-08, "logits/chosen": -0.2573717534542084, "logits/rejected": 0.13519421219825745, "logps/chosen": -1.754091501235962, "logps/rejected": -3.702437162399292, "loss": 0.4887, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.754091501235962, "rewards/margins": 1.9483455419540405, "rewards/rejected": -3.702437162399292, "sft_loss": 1.761106252670288, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 14.581538154666081, "learning_rate": 1.1511527464276194e-08, "logits/chosen": -0.04870340973138809, "logits/rejected": 0.03849010914564133, "logps/chosen": -1.8830690383911133, "logps/rejected": -3.5184459686279297, "loss": 0.5275, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8830690383911133, "rewards/margins": 1.6353766918182373, "rewards/rejected": -3.5184459686279297, "sft_loss": 1.9118648767471313, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 8.768613582240272, "learning_rate": 1.0941021376040305e-08, "logits/chosen": -0.12008903920650482, "logits/rejected": 0.060319773852825165, "logps/chosen": -1.724844217300415, "logps/rejected": -3.5657737255096436, "loss": 0.5389, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.724844217300415, "rewards/margins": 1.8409297466278076, "rewards/rejected": -3.5657737255096436, "sft_loss": 1.7985265254974365, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 11.409485948184829, "learning_rate": 1.0384962144545818e-08, "logits/chosen": -0.20349383354187012, "logits/rejected": 0.09238765388727188, "logps/chosen": -1.806945562362671, "logps/rejected": -3.303162097930908, "loss": 0.5579, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.806945562362671, "rewards/margins": 1.4962167739868164, "rewards/rejected": -3.303162097930908, "sft_loss": 1.8973405361175537, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 10.82064851844795, "learning_rate": 9.843355164681767e-09, "logits/chosen": -0.12326931953430176, "logits/rejected": -0.015838781371712685, "logps/chosen": -1.7402493953704834, "logps/rejected": -3.36765718460083, "loss": 0.5728, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7402493953704834, "rewards/margins": 1.627407431602478, "rewards/rejected": -3.36765718460083, "sft_loss": 1.8054853677749634, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 12.522878903507149, "learning_rate": 9.316205691121515e-09, "logits/chosen": -0.125594824552536, "logits/rejected": 0.11583630740642548, "logps/chosen": -1.8031196594238281, "logps/rejected": -3.636901378631592, "loss": 0.4952, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.8031196594238281, "rewards/margins": 1.8337819576263428, "rewards/rejected": -3.636901378631592, "sft_loss": 1.8276653289794922, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 11.717915538770692, "learning_rate": 8.803518838271463e-09, "logits/chosen": -0.19515851140022278, "logits/rejected": 0.07911201566457748, "logps/chosen": -1.7605504989624023, "logps/rejected": -3.460129976272583, "loss": 0.4958, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.7605504989624023, "rewards/margins": 1.6995792388916016, "rewards/rejected": -3.460129976272583, "sft_loss": 1.8053300380706787, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 10.57249049296401, "learning_rate": 8.305299580221748e-09, "logits/chosen": -0.23829343914985657, "logits/rejected": -0.08533582836389542, "logps/chosen": -1.7042344808578491, "logps/rejected": -3.46440052986145, "loss": 0.518, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7042344808578491, "rewards/margins": 1.7601664066314697, "rewards/rejected": -3.46440052986145, "sft_loss": 1.8231385946273804, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 8.628673797910038, "learning_rate": 7.821552750697958e-09, "logits/chosen": -0.2571510374546051, "logits/rejected": -0.014047443866729736, "logps/chosen": -1.7087123394012451, "logps/rejected": -3.186110496520996, "loss": 0.5563, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7087123394012451, "rewards/margins": 1.4773979187011719, "rewards/rejected": -3.186110496520996, "sft_loss": 1.7801802158355713, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 12.093261360034742, "learning_rate": 7.3522830430136635e-09, "logits/chosen": 0.09196645021438599, "logits/rejected": 0.17079466581344604, "logps/chosen": -1.8431581258773804, "logps/rejected": -3.837364673614502, "loss": 0.5346, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8431581258773804, "rewards/margins": 1.994206190109253, "rewards/rejected": -3.837364673614502, "sft_loss": 1.8825957775115967, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 20.621534055002574, "learning_rate": 6.897495010025956e-09, "logits/chosen": 0.05238935351371765, "logits/rejected": 0.17671489715576172, "logps/chosen": -1.8023525476455688, "logps/rejected": -3.4923388957977295, "loss": 0.5297, "rewards/accuracies": 0.875, "rewards/chosen": -1.8023525476455688, "rewards/margins": 1.689986228942871, "rewards/rejected": -3.4923388957977295, "sft_loss": 1.8439724445343018, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 12.591348257745087, "learning_rate": 6.4571930640899835e-09, "logits/chosen": -0.2012520730495453, "logits/rejected": 0.06406156718730927, "logps/chosen": -1.826565146446228, "logps/rejected": -3.2655797004699707, "loss": 0.5732, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.826565146446228, "rewards/margins": 1.4390145540237427, "rewards/rejected": -3.2655797004699707, "sft_loss": 1.8569958209991455, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 11.981205556368744, "learning_rate": 6.0313814770174836e-09, "logits/chosen": -0.14899098873138428, "logits/rejected": 0.07124531269073486, "logps/chosen": -1.7477327585220337, "logps/rejected": -3.3878090381622314, "loss": 0.5441, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7477327585220337, "rewards/margins": 1.6400762796401978, "rewards/rejected": -3.3878090381622314, "sft_loss": 1.8257734775543213, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 9.461939009323746, "learning_rate": 5.620064380033985e-09, "logits/chosen": -0.2407839298248291, "logits/rejected": 0.11912363767623901, "logps/chosen": -1.8589773178100586, "logps/rejected": -3.3650786876678467, "loss": 0.5388, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8589773178100586, "rewards/margins": 1.5061014890670776, "rewards/rejected": -3.3650786876678467, "sft_loss": 1.8304802179336548, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 15.330362075232408, "learning_rate": 5.22324576374017e-09, "logits/chosen": -0.14247074723243713, "logits/rejected": -0.023439304903149605, "logps/chosen": -1.7721531391143799, "logps/rejected": -3.2598586082458496, "loss": 0.5588, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7721531391143799, "rewards/margins": 1.4877058267593384, "rewards/rejected": -3.2598586082458496, "sft_loss": 1.8422420024871826, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 15.33512833982946, "learning_rate": 4.840929478071576e-09, "logits/chosen": -0.08151860535144806, "logits/rejected": -0.15125760436058044, "logps/chosen": -1.6786878108978271, "logps/rejected": -3.1958765983581543, "loss": 0.5362, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6786878108978271, "rewards/margins": 1.5171887874603271, "rewards/rejected": -3.1958765983581543, "sft_loss": 1.7440593242645264, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 17.120913980937942, "learning_rate": 4.47311923226279e-09, "logits/chosen": -0.14595165848731995, "logits/rejected": 0.05748923867940903, "logps/chosen": -1.7532291412353516, "logps/rejected": -3.318749189376831, "loss": 0.568, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7532291412353516, "rewards/margins": 1.5655204057693481, "rewards/rejected": -3.318749189376831, "sft_loss": 1.8259124755859375, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 10.292380804604246, "learning_rate": 4.119818594810476e-09, "logits/chosen": -0.04360206052660942, "logits/rejected": 0.24839358031749725, "logps/chosen": -1.686971664428711, "logps/rejected": -3.217381238937378, "loss": 0.5255, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.686971664428711, "rewards/margins": 1.530409574508667, "rewards/rejected": -3.217381238937378, "sft_loss": 1.7612766027450562, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 13.509652282634574, "learning_rate": 3.781030993438573e-09, "logits/chosen": -0.1148734912276268, "logits/rejected": -0.03746723383665085, "logps/chosen": -1.7142328023910522, "logps/rejected": -3.359126329421997, "loss": 0.5359, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7142328023910522, "rewards/margins": 1.6448932886123657, "rewards/rejected": -3.359126329421997, "sft_loss": 1.8104727268218994, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 12.38245292114047, "learning_rate": 3.4567597150663155e-09, "logits/chosen": -0.2548621594905853, "logits/rejected": 0.08878061920404434, "logps/chosen": -1.7587592601776123, "logps/rejected": -3.5690054893493652, "loss": 0.5159, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7587592601776123, "rewards/margins": 1.810246467590332, "rewards/rejected": -3.5690054893493652, "sft_loss": 1.8457529544830322, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 9.47868750971961, "learning_rate": 3.147007905774768e-09, "logits/chosen": -0.04488161578774452, "logits/rejected": 0.1181039810180664, "logps/chosen": -1.8180328607559204, "logps/rejected": -3.4446120262145996, "loss": 0.5414, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8180328607559204, "rewards/margins": 1.6265792846679688, "rewards/rejected": -3.4446120262145996, "sft_loss": 1.8337246179580688, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 14.651113259446056, "learning_rate": 2.851778570777508e-09, "logits/chosen": -0.05829663202166557, "logits/rejected": -0.03663646802306175, "logps/chosen": -1.7952572107315063, "logps/rejected": -3.320112705230713, "loss": 0.5477, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7952572107315063, "rewards/margins": 1.524855375289917, "rewards/rejected": -3.320112705230713, "sft_loss": 1.843465805053711, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 9.673833307967234, "learning_rate": 2.5710745743908192e-09, "logits/chosen": -0.15737508237361908, "logits/rejected": 0.07274798303842545, "logps/chosen": -1.8004143238067627, "logps/rejected": -3.758871078491211, "loss": 0.521, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8004143238067627, "rewards/margins": 1.9584567546844482, "rewards/rejected": -3.758871078491211, "sft_loss": 1.8147557973861694, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 15.459388243890075, "learning_rate": 2.304898640006048e-09, "logits/chosen": -0.2569485306739807, "logits/rejected": -0.02802509441971779, "logps/chosen": -1.707179307937622, "logps/rejected": -3.340653896331787, "loss": 0.5397, "rewards/accuracies": 0.84375, "rewards/chosen": -1.707179307937622, "rewards/margins": 1.633474588394165, "rewards/rejected": -3.340653896331787, "sft_loss": 1.824716329574585, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 16.619593710988614, "learning_rate": 2.0532533500631225e-09, "logits/chosen": -0.1367146372795105, "logits/rejected": -0.025258731096982956, "logps/chosen": -1.7326462268829346, "logps/rejected": -3.215005874633789, "loss": 0.5669, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7326462268829346, "rewards/margins": 1.482359528541565, "rewards/rejected": -3.215005874633789, "sft_loss": 1.795987844467163, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 18.319373164749003, "learning_rate": 1.8161411460262401e-09, "logits/chosen": -0.12124613672494888, "logits/rejected": 0.10767862945795059, "logps/chosen": -1.8736505508422852, "logps/rejected": -3.7418556213378906, "loss": 0.5365, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8736505508422852, "rewards/margins": 1.8682053089141846, "rewards/rejected": -3.7418556213378906, "sft_loss": 1.906241774559021, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 12.831447954821886, "learning_rate": 1.5935643283585545e-09, "logits/chosen": -0.2294524610042572, "logits/rejected": 0.14085766673088074, "logps/chosen": -1.854288101196289, "logps/rejected": -3.3342108726501465, "loss": 0.546, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.854288101196289, "rewards/margins": 1.4799230098724365, "rewards/rejected": -3.3342108726501465, "sft_loss": 1.9192848205566406, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 16.18105063589966, "learning_rate": 1.3855250565015244e-09, "logits/chosen": -0.09225011616945267, "logits/rejected": -0.02986457571387291, "logps/chosen": -1.7413854598999023, "logps/rejected": -3.3444411754608154, "loss": 0.5748, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7413854598999023, "rewards/margins": 1.6030553579330444, "rewards/rejected": -3.3444411754608154, "sft_loss": 1.8075097799301147, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 12.385490252476076, "learning_rate": 1.1920253488530986e-09, "logits/chosen": -0.29104679822921753, "logits/rejected": -0.04165268689393997, "logps/chosen": -1.7731249332427979, "logps/rejected": -3.362682819366455, "loss": 0.5416, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7731249332427979, "rewards/margins": 1.5895576477050781, "rewards/rejected": -3.362682819366455, "sft_loss": 1.7579162120819092, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 11.035865192919763, "learning_rate": 1.0130670827482314e-09, "logits/chosen": -0.1392376571893692, "logits/rejected": 0.03444616496562958, "logps/chosen": -1.7362483739852905, "logps/rejected": -3.165205478668213, "loss": 0.5506, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7362483739852905, "rewards/margins": 1.4289571046829224, "rewards/rejected": -3.165205478668213, "sft_loss": 1.7826095819473267, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 13.187865717975045, "learning_rate": 8.4865199444073e-10, "logits/chosen": -0.028293948620557785, "logits/rejected": 0.07021278142929077, "logps/chosen": -1.8036301136016846, "logps/rejected": -3.4158120155334473, "loss": 0.5414, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8036301136016846, "rewards/margins": 1.6121822595596313, "rewards/rejected": -3.4158120155334473, "sft_loss": 1.844257116317749, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 16.444463729797935, "learning_rate": 6.987816790866019e-10, "logits/chosen": -0.12553277611732483, "logits/rejected": 0.1887044906616211, "logps/chosen": -1.8647096157073975, "logps/rejected": -3.710162401199341, "loss": 0.5544, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8647096157073975, "rewards/margins": 1.845452904701233, "rewards/rejected": -3.710162401199341, "sft_loss": 1.8833541870117188, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 10.646863416876881, "learning_rate": 5.634575907284001e-10, "logits/chosen": -0.027017977088689804, "logits/rejected": -0.0031074334401637316, "logps/chosen": -1.7922757863998413, "logps/rejected": -3.2982144355773926, "loss": 0.5833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7922757863998413, "rewards/margins": 1.5059386491775513, "rewards/rejected": -3.2982144355773926, "sft_loss": 1.8631671667099, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 10.595165009454474, "learning_rate": 4.426810422809013e-10, "logits/chosen": -0.19915366172790527, "logits/rejected": -0.05446736887097359, "logps/chosen": -1.6954681873321533, "logps/rejected": -3.2648377418518066, "loss": 0.5377, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6954681873321533, "rewards/margins": 1.5693693161010742, "rewards/rejected": -3.2648377418518066, "sft_loss": 1.7231022119522095, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 10.660523203834975, "learning_rate": 3.36453205518783e-10, "logits/chosen": -0.14437521994113922, "logits/rejected": 0.060533732175827026, "logps/chosen": -1.7375590801239014, "logps/rejected": -3.7755045890808105, "loss": 0.4866, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7375590801239014, "rewards/margins": 2.0379457473754883, "rewards/rejected": -3.7755045890808105, "sft_loss": 1.7783511877059937, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 14.052826810699122, "learning_rate": 2.447751110647989e-10, "logits/chosen": -0.13428905606269836, "logits/rejected": 0.12192927300930023, "logps/chosen": -1.6841440200805664, "logps/rejected": -3.4735703468322754, "loss": 0.5293, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6841440200805664, "rewards/margins": 1.7894260883331299, "rewards/rejected": -3.4735703468322754, "sft_loss": 1.753156304359436, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 9.401172317085365, "learning_rate": 1.6764764838045342e-10, "logits/chosen": -0.278870165348053, "logits/rejected": 0.15991750359535217, "logps/chosen": -1.7664949893951416, "logps/rejected": -3.2208919525146484, "loss": 0.5423, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7664949893951416, "rewards/margins": 1.4543970823287964, "rewards/rejected": -3.2208919525146484, "sft_loss": 1.7974853515625, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 10.847683056633393, "learning_rate": 1.0507156575650934e-10, "logits/chosen": -0.20414504408836365, "logits/rejected": 0.07850085198879242, "logps/chosen": -1.7775157690048218, "logps/rejected": -3.567974090576172, "loss": 0.5171, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7775157690048218, "rewards/margins": 1.790458083152771, "rewards/rejected": -3.567974090576172, "sft_loss": 1.896284818649292, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 9.824306185085138, "learning_rate": 5.7047470306659246e-11, "logits/chosen": -0.09952554851770401, "logits/rejected": -0.00851814728230238, "logps/chosen": -1.907403588294983, "logps/rejected": -3.7734904289245605, "loss": 0.554, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.907403588294983, "rewards/margins": 1.8660869598388672, "rewards/rejected": -3.7734904289245605, "sft_loss": 1.86178457736969, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 12.186981043972446, "learning_rate": 2.3575827960697906e-11, "logits/chosen": -0.13862931728363037, "logits/rejected": 0.1427539885044098, "logps/chosen": -1.7564480304718018, "logps/rejected": -3.5415992736816406, "loss": 0.5121, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.7564480304718018, "rewards/margins": 1.7851512432098389, "rewards/rejected": -3.5415992736816406, "sft_loss": 1.837898850440979, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 11.285735169160725, "learning_rate": 4.656963460691888e-12, "logits/chosen": -0.1521228700876236, "logits/rejected": 0.05595237761735916, "logps/chosen": -1.8109420537948608, "logps/rejected": -3.670588731765747, "loss": 0.5395, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8109420537948608, "rewards/margins": 1.8596467971801758, "rewards/rejected": -3.670588731765747, "sft_loss": 1.8834893703460693, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.4339616000652313, "eval_logits/rejected": 0.5812661647796631, "eval_logps/chosen": -2.0735538005828857, "eval_logps/rejected": -3.1333560943603516, "eval_loss": 0.7447991967201233, "eval_rewards/accuracies": 0.7069733142852783, "eval_rewards/chosen": -2.0735538005828857, "eval_rewards/margins": 1.0598026514053345, "eval_rewards/rejected": -3.1333560943603516, "eval_runtime": 47.8747, "eval_samples_per_second": 28.094, "eval_sft_loss": 1.9881486892700195, "eval_steps_per_second": 7.039, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.6688217556451066, "train_runtime": 34631.8364, "train_samples_per_second": 5.179, "train_steps_per_second": 0.162 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }