{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 4164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007204610951008645, "grad_norm": 2.336021900177002, "learning_rate": 1.199040767386091e-10, "logits/chosen": -1.3860063552856445, "logits/rejected": -1.3949532508850098, "logps/chosen": -34.621925354003906, "logps/rejected": -37.30891418457031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.007204610951008645, "grad_norm": 2.7957100868225098, "learning_rate": 1.199040767386091e-09, "logits/chosen": -1.5467724800109863, "logits/rejected": -1.5282496213912964, "logps/chosen": -42.52306365966797, "logps/rejected": -44.5566520690918, "loss": 0.6932, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": -9.029280045069754e-05, "rewards/margins": -2.0939776732120663e-05, "rewards/rejected": -6.935300189070404e-05, "step": 10 }, { "epoch": 0.01440922190201729, "grad_norm": 2.9324934482574463, "learning_rate": 2.398081534772182e-09, "logits/chosen": -1.555262804031372, "logits/rejected": -1.5412877798080444, "logps/chosen": -44.08427810668945, "logps/rejected": -46.5708122253418, "loss": 0.6933, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 1.1827131856989581e-05, "rewards/margins": -0.00023911210882943124, "rewards/rejected": 0.0002509392215870321, "step": 20 }, { "epoch": 0.021613832853025938, "grad_norm": 3.497398614883423, "learning_rate": 3.597122302158273e-09, "logits/chosen": -1.5116827487945557, "logits/rejected": -1.50448739528656, "logps/chosen": -47.85178756713867, "logps/rejected": -50.80080795288086, "loss": 0.6931, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -2.9822525903000496e-05, "rewards/margins": 6.526964716613293e-05, "rewards/rejected": -9.509220399195328e-05, "step": 30 }, { "epoch": 0.02881844380403458, "grad_norm": 2.5853052139282227, "learning_rate": 4.796163069544364e-09, "logits/chosen": -1.5581772327423096, "logits/rejected": -1.5541572570800781, "logps/chosen": -43.06446838378906, "logps/rejected": -45.565834045410156, "loss": 0.6931, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.00012712908210232854, "rewards/margins": 5.798434722237289e-05, "rewards/rejected": -0.00018511342932470143, "step": 40 }, { "epoch": 0.03602305475504323, "grad_norm": 2.652137041091919, "learning_rate": 5.995203836930456e-09, "logits/chosen": -1.4693658351898193, "logits/rejected": -1.4685413837432861, "logps/chosen": -43.009254455566406, "logps/rejected": -44.814476013183594, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -2.008357478189282e-05, "rewards/margins": -0.0001420602493453771, "rewards/rejected": 0.00012197670002933592, "step": 50 }, { "epoch": 0.043227665706051875, "grad_norm": 3.9296295642852783, "learning_rate": 7.194244604316546e-09, "logits/chosen": -1.5675886869430542, "logits/rejected": -1.5608971118927002, "logps/chosen": -50.68689727783203, "logps/rejected": -52.0194206237793, "loss": 0.6932, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -2.8326398933131713e-06, "rewards/margins": -2.4851691705407575e-05, "rewards/rejected": 2.201905044785235e-05, "step": 60 }, { "epoch": 0.05043227665706052, "grad_norm": 2.2988357543945312, "learning_rate": 8.393285371702639e-09, "logits/chosen": -1.5360424518585205, "logits/rejected": -1.5283145904541016, "logps/chosen": -50.06494903564453, "logps/rejected": -52.77583694458008, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 3.0180139219737612e-05, "rewards/margins": 4.403649290907197e-05, "rewards/rejected": -1.3856357327313162e-05, "step": 70 }, { "epoch": 0.05763688760806916, "grad_norm": 3.470907211303711, "learning_rate": 9.592326139088728e-09, "logits/chosen": -1.5697886943817139, "logits/rejected": -1.5619677305221558, "logps/chosen": -51.1032600402832, "logps/rejected": -52.691810607910156, "loss": 0.6931, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 8.321718632942066e-05, "rewards/margins": 0.0001748651557136327, "rewards/rejected": -9.164798393612728e-05, "step": 80 }, { "epoch": 0.06484149855907781, "grad_norm": 2.783578872680664, "learning_rate": 1.0791366906474819e-08, "logits/chosen": -1.5033096075057983, "logits/rejected": -1.5004870891571045, "logps/chosen": -49.00882339477539, "logps/rejected": -51.163333892822266, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 7.983654359122738e-05, "rewards/margins": 7.151851605158299e-06, "rewards/rejected": 7.268470653798431e-05, "step": 90 }, { "epoch": 0.07204610951008646, "grad_norm": 3.002304792404175, "learning_rate": 1.1990407673860912e-08, "logits/chosen": -1.5840990543365479, "logits/rejected": -1.573439598083496, "logps/chosen": -45.69633483886719, "logps/rejected": -48.739601135253906, "loss": 0.6933, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -5.749738193117082e-05, "rewards/margins": -0.0002472072374075651, "rewards/rejected": 0.0001897098554763943, "step": 100 }, { "epoch": 0.0792507204610951, "grad_norm": 2.3070425987243652, "learning_rate": 1.3189448441247003e-08, "logits/chosen": -1.4549312591552734, "logits/rejected": -1.4311813116073608, "logps/chosen": -48.96226119995117, "logps/rejected": -51.16582489013672, "loss": 0.6933, "rewards/accuracies": 0.40625, "rewards/chosen": -0.00029989113681949675, "rewards/margins": -0.0003156957100145519, "rewards/rejected": 1.5804591384949163e-05, "step": 110 }, { "epoch": 0.08645533141210375, "grad_norm": 2.3011035919189453, "learning_rate": 1.4388489208633092e-08, "logits/chosen": -1.483244776725769, "logits/rejected": -1.4799872636795044, "logps/chosen": -44.255714416503906, "logps/rejected": -46.613468170166016, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.3762720957165584e-05, "rewards/margins": 0.00010847167868632823, "rewards/rejected": -0.0001322343887295574, "step": 120 }, { "epoch": 0.0936599423631124, "grad_norm": 3.2029104232788086, "learning_rate": 1.5587529976019183e-08, "logits/chosen": -1.5763423442840576, "logits/rejected": -1.5709102153778076, "logps/chosen": -49.423423767089844, "logps/rejected": -51.306007385253906, "loss": 0.6932, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.0001741496816975996, "rewards/margins": -0.00012099805462639779, "rewards/rejected": -5.31516270712018e-05, "step": 130 }, { "epoch": 0.10086455331412104, "grad_norm": 2.831598997116089, "learning_rate": 1.6786570743405277e-08, "logits/chosen": -1.4523359537124634, "logits/rejected": -1.4428811073303223, "logps/chosen": -45.845314025878906, "logps/rejected": -50.251155853271484, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 1.4822484445176087e-05, "rewards/margins": 5.557260010391474e-05, "rewards/rejected": -4.075012475368567e-05, "step": 140 }, { "epoch": 0.10806916426512968, "grad_norm": 3.5805459022521973, "learning_rate": 1.7985611510791365e-08, "logits/chosen": -1.4715862274169922, "logits/rejected": -1.464839220046997, "logps/chosen": -48.3131103515625, "logps/rejected": -51.4343147277832, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0001291931257583201, "rewards/margins": -0.00011216111306566745, "rewards/rejected": -1.7032030882546678e-05, "step": 150 }, { "epoch": 0.11527377521613832, "grad_norm": 2.45888090133667, "learning_rate": 1.9184652278177456e-08, "logits/chosen": -1.5041993856430054, "logits/rejected": -1.4858150482177734, "logps/chosen": -41.26018524169922, "logps/rejected": -44.56896209716797, "loss": 0.6933, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -5.096942186355591e-05, "rewards/margins": -0.00033451110357418656, "rewards/rejected": 0.0002835417108144611, "step": 160 }, { "epoch": 0.12247838616714697, "grad_norm": 3.1718838214874268, "learning_rate": 2.038369304556355e-08, "logits/chosen": -1.5168631076812744, "logits/rejected": -1.4979842901229858, "logps/chosen": -44.85700225830078, "logps/rejected": -46.86701583862305, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -3.738096711458638e-05, "rewards/margins": -0.00010323604510631412, "rewards/rejected": 6.585508526768535e-05, "step": 170 }, { "epoch": 0.12968299711815562, "grad_norm": 2.619306802749634, "learning_rate": 2.1582733812949638e-08, "logits/chosen": -1.5800260305404663, "logits/rejected": -1.5681655406951904, "logps/chosen": -45.09278106689453, "logps/rejected": -46.82966995239258, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -7.130586709536146e-06, "rewards/margins": -0.00016551685985177755, "rewards/rejected": 0.00015838624676689506, "step": 180 }, { "epoch": 0.13688760806916425, "grad_norm": 2.9155962467193604, "learning_rate": 2.278177458033573e-08, "logits/chosen": -1.5894930362701416, "logits/rejected": -1.5857088565826416, "logps/chosen": -42.25670623779297, "logps/rejected": -45.39856719970703, "loss": 0.6932, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 5.098475230624899e-05, "rewards/margins": -8.445042112725787e-06, "rewards/rejected": 5.9429770772112533e-05, "step": 190 }, { "epoch": 0.1440922190201729, "grad_norm": 3.554685354232788, "learning_rate": 2.3980815347721823e-08, "logits/chosen": -1.5361117124557495, "logits/rejected": -1.5294668674468994, "logps/chosen": -43.47795486450195, "logps/rejected": -47.10270309448242, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": 5.860309465788305e-05, "rewards/margins": -0.0001218240795424208, "rewards/rejected": 0.00018042718875221908, "step": 200 }, { "epoch": 0.15129682997118155, "grad_norm": 3.053459405899048, "learning_rate": 2.517985611510791e-08, "logits/chosen": -1.565314531326294, "logits/rejected": -1.554024338722229, "logps/chosen": -43.0426025390625, "logps/rejected": -43.413352966308594, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -5.537036486202851e-05, "rewards/margins": 6.648890121141449e-05, "rewards/rejected": -0.00012185927334940061, "step": 210 }, { "epoch": 0.1585014409221902, "grad_norm": 2.835756301879883, "learning_rate": 2.6378896882494006e-08, "logits/chosen": -1.4806112051010132, "logits/rejected": -1.473852515220642, "logps/chosen": -47.3744010925293, "logps/rejected": -52.52124786376953, "loss": 0.6933, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.00017219179426319897, "rewards/margins": -0.0003234629984945059, "rewards/rejected": 0.00015127118967939168, "step": 220 }, { "epoch": 0.16570605187319884, "grad_norm": 2.5750908851623535, "learning_rate": 2.7577937649880097e-08, "logits/chosen": -1.5319396257400513, "logits/rejected": -1.530428171157837, "logps/chosen": -44.463134765625, "logps/rejected": -48.242122650146484, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 6.716211646562442e-05, "rewards/margins": -5.54590940282651e-07, "rewards/rejected": 6.771668267901987e-05, "step": 230 }, { "epoch": 0.1729106628242075, "grad_norm": 3.010782480239868, "learning_rate": 2.8776978417266184e-08, "logits/chosen": -1.5795023441314697, "logits/rejected": -1.5688108205795288, "logps/chosen": -49.142967224121094, "logps/rejected": -51.20656204223633, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -1.059528767655138e-05, "rewards/margins": 1.4627486052631866e-05, "rewards/rejected": -2.5222787371603772e-05, "step": 240 }, { "epoch": 0.18011527377521613, "grad_norm": 4.015756130218506, "learning_rate": 2.997601918465228e-08, "logits/chosen": -1.4548447132110596, "logits/rejected": -1.4441945552825928, "logps/chosen": -49.71647644042969, "logps/rejected": -50.771690368652344, "loss": 0.6931, "rewards/accuracies": 0.46875, "rewards/chosen": -8.51520017022267e-05, "rewards/margins": 1.9873681594617665e-05, "rewards/rejected": -0.00010502567602088675, "step": 250 }, { "epoch": 0.1873198847262248, "grad_norm": 3.1907923221588135, "learning_rate": 3.1175059952038366e-08, "logits/chosen": -1.4959896802902222, "logits/rejected": -1.4902995824813843, "logps/chosen": -49.98474884033203, "logps/rejected": -51.45148468017578, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0002439332165522501, "rewards/margins": 0.00025814835680648685, "rewards/rejected": -1.4215113878890406e-05, "step": 260 }, { "epoch": 0.19452449567723343, "grad_norm": 3.32533860206604, "learning_rate": 3.237410071942446e-08, "logits/chosen": -1.5843312740325928, "logits/rejected": -1.5691900253295898, "logps/chosen": -46.75273132324219, "logps/rejected": -49.31193542480469, "loss": 0.6931, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.00015570121468044817, "rewards/margins": 3.353106149006635e-05, "rewards/rejected": 0.00012217015319038182, "step": 270 }, { "epoch": 0.2017291066282421, "grad_norm": 2.922327756881714, "learning_rate": 3.3573141486810555e-08, "logits/chosen": -1.5491392612457275, "logits/rejected": -1.5333701372146606, "logps/chosen": -42.477779388427734, "logps/rejected": -43.71261978149414, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -7.194602221716195e-05, "rewards/margins": -0.00013697049871552736, "rewards/rejected": 6.502445467049256e-05, "step": 280 }, { "epoch": 0.20893371757925072, "grad_norm": 2.781327724456787, "learning_rate": 3.477218225419664e-08, "logits/chosen": -1.5796802043914795, "logits/rejected": -1.5691970586776733, "logps/chosen": -44.49600601196289, "logps/rejected": -45.78491973876953, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00016616811626590788, "rewards/margins": 0.0001632832718314603, "rewards/rejected": 2.884840114347753e-06, "step": 290 }, { "epoch": 0.21613832853025935, "grad_norm": 3.15329909324646, "learning_rate": 3.597122302158273e-08, "logits/chosen": -1.5556309223175049, "logits/rejected": -1.5389394760131836, "logps/chosen": -48.224769592285156, "logps/rejected": -50.68939971923828, "loss": 0.6933, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.0002144512691302225, "rewards/margins": -0.00030510686337947845, "rewards/rejected": 9.065552876563743e-05, "step": 300 }, { "epoch": 0.22334293948126802, "grad_norm": 3.011691093444824, "learning_rate": 3.717026378896883e-08, "logits/chosen": -1.4414619207382202, "logits/rejected": -1.4318530559539795, "logps/chosen": -48.020721435546875, "logps/rejected": -50.055450439453125, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00020195532124489546, "rewards/margins": 0.00012811608030460775, "rewards/rejected": 7.383924094028771e-05, "step": 310 }, { "epoch": 0.23054755043227665, "grad_norm": 2.5438578128814697, "learning_rate": 3.836930455635491e-08, "logits/chosen": -1.5534722805023193, "logits/rejected": -1.5385301113128662, "logps/chosen": -47.08538055419922, "logps/rejected": -51.54567337036133, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0002954248629976064, "rewards/margins": 9.292210597777739e-05, "rewards/rejected": 0.0002025027060881257, "step": 320 }, { "epoch": 0.2377521613832853, "grad_norm": 2.316570520401001, "learning_rate": 3.9568345323741003e-08, "logits/chosen": -1.5250051021575928, "logits/rejected": -1.5185617208480835, "logps/chosen": -50.739192962646484, "logps/rejected": -49.54676055908203, "loss": 0.6931, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.00017470329476054758, "rewards/margins": -3.749446477741003e-07, "rewards/rejected": 0.00017507823940832168, "step": 330 }, { "epoch": 0.24495677233429394, "grad_norm": 2.6795296669006348, "learning_rate": 4.07673860911271e-08, "logits/chosen": -1.5784777402877808, "logits/rejected": -1.5679104328155518, "logps/chosen": -51.091636657714844, "logps/rejected": -52.1263542175293, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 5.405420597526245e-05, "rewards/margins": 0.00031650811433792114, "rewards/rejected": -0.0002624538610689342, "step": 340 }, { "epoch": 0.2521613832853026, "grad_norm": 3.6704699993133545, "learning_rate": 4.1966426858513185e-08, "logits/chosen": -1.5095831155776978, "logits/rejected": -1.504861831665039, "logps/chosen": -45.61798858642578, "logps/rejected": -48.589141845703125, "loss": 0.6929, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.000504065363202244, "rewards/margins": 0.00041280948789790273, "rewards/rejected": 9.125589713221416e-05, "step": 350 }, { "epoch": 0.25936599423631124, "grad_norm": 3.4911813735961914, "learning_rate": 4.3165467625899276e-08, "logits/chosen": -1.501556634902954, "logits/rejected": -1.4911236763000488, "logps/chosen": -53.90102005004883, "logps/rejected": -56.54648971557617, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.00031928817043080926, "rewards/margins": 0.00022553373128175735, "rewards/rejected": 9.375448280479759e-05, "step": 360 }, { "epoch": 0.2665706051873199, "grad_norm": 3.746173143386841, "learning_rate": 4.4364508393285374e-08, "logits/chosen": -1.482246994972229, "logits/rejected": -1.4804664850234985, "logps/chosen": -48.317405700683594, "logps/rejected": -53.011993408203125, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00026067602448165417, "rewards/margins": 0.0001698151318123564, "rewards/rejected": 9.08608635654673e-05, "step": 370 }, { "epoch": 0.2737752161383285, "grad_norm": 2.481205701828003, "learning_rate": 4.556354916067146e-08, "logits/chosen": -1.5724612474441528, "logits/rejected": -1.5653808116912842, "logps/chosen": -47.00031661987305, "logps/rejected": -48.018943786621094, "loss": 0.6933, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 3.1971517273632344e-06, "rewards/margins": -0.0002721514902077615, "rewards/rejected": 0.0002753486332949251, "step": 380 }, { "epoch": 0.28097982708933716, "grad_norm": 2.9210126399993896, "learning_rate": 4.676258992805755e-08, "logits/chosen": -1.54493248462677, "logits/rejected": -1.5351022481918335, "logps/chosen": -48.16374969482422, "logps/rejected": -51.50679397583008, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.000324452732456848, "rewards/margins": 0.0003632038424257189, "rewards/rejected": -3.8751088140998036e-05, "step": 390 }, { "epoch": 0.2881844380403458, "grad_norm": 3.4040896892547607, "learning_rate": 4.796163069544365e-08, "logits/chosen": -1.5583593845367432, "logits/rejected": -1.5532950162887573, "logps/chosen": -44.89096450805664, "logps/rejected": -46.113468170166016, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0001240587153006345, "rewards/margins": -4.035348320030607e-05, "rewards/rejected": 0.00016441218031104654, "step": 400 }, { "epoch": 0.2953890489913545, "grad_norm": 3.631049871444702, "learning_rate": 4.916067146282973e-08, "logits/chosen": -1.5089349746704102, "logits/rejected": -1.5059764385223389, "logps/chosen": -47.52130889892578, "logps/rejected": -49.62239456176758, "loss": 0.6932, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -2.1968793589621782e-05, "rewards/margins": -3.3516105759190395e-05, "rewards/rejected": 1.1547293979674578e-05, "step": 410 }, { "epoch": 0.3025936599423631, "grad_norm": 2.4457123279571533, "learning_rate": 4.999992091672379e-08, "logits/chosen": -1.4693686962127686, "logits/rejected": -1.4796515703201294, "logps/chosen": -45.61933135986328, "logps/rejected": -48.9610710144043, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0004673867952078581, "rewards/margins": 0.000354716379661113, "rewards/rejected": 0.0001126704373746179, "step": 420 }, { "epoch": 0.30979827089337175, "grad_norm": 2.2836077213287354, "learning_rate": 4.999851500573209e-08, "logits/chosen": -1.4973236322402954, "logits/rejected": -1.4976922273635864, "logps/chosen": -46.055450439453125, "logps/rejected": -46.19012451171875, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00042625871719792485, "rewards/margins": 0.0001353234110865742, "rewards/rejected": 0.00029093524790368974, "step": 430 }, { "epoch": 0.3170028818443804, "grad_norm": 2.460236072540283, "learning_rate": 4.999535180235972e-08, "logits/chosen": -1.498327612876892, "logits/rejected": -1.4901635646820068, "logps/chosen": -46.00019454956055, "logps/rejected": -49.427894592285156, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00046957674203440547, "rewards/margins": 0.0002196793502662331, "rewards/rejected": 0.00024989742087200284, "step": 440 }, { "epoch": 0.3242074927953891, "grad_norm": 3.161519765853882, "learning_rate": 4.9990431528966836e-08, "logits/chosen": -1.5111868381500244, "logits/rejected": -1.4901098012924194, "logps/chosen": -53.20442581176758, "logps/rejected": -51.29274368286133, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00018602630007080734, "rewards/margins": 0.0001877523900475353, "rewards/rejected": -1.7260724689549534e-06, "step": 450 }, { "epoch": 0.3314121037463977, "grad_norm": 3.7941031455993652, "learning_rate": 4.9983754531428326e-08, "logits/chosen": -1.5165042877197266, "logits/rejected": -1.4989763498306274, "logps/chosen": -53.773658752441406, "logps/rejected": -55.66585159301758, "loss": 0.6928, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0006648501148447394, "rewards/margins": 0.0006117599550634623, "rewards/rejected": 5.309013067744672e-05, "step": 460 }, { "epoch": 0.33861671469740634, "grad_norm": 3.734304189682007, "learning_rate": 4.997532127910954e-08, "logits/chosen": -1.5783549547195435, "logits/rejected": -1.5489213466644287, "logps/chosen": -52.63933563232422, "logps/rejected": -53.201080322265625, "loss": 0.693, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0004433690046425909, "rewards/margins": 0.0003627253754530102, "rewards/rejected": 8.064367284532636e-05, "step": 470 }, { "epoch": 0.345821325648415, "grad_norm": 3.6867148876190186, "learning_rate": 4.996513236483331e-08, "logits/chosen": -1.6470205783843994, "logits/rejected": -1.6330623626708984, "logps/chosen": -42.520477294921875, "logps/rejected": -45.42660903930664, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00032508961157873273, "rewards/margins": 0.00028424913762137294, "rewards/rejected": 4.084048487129621e-05, "step": 480 }, { "epoch": 0.3530259365994236, "grad_norm": 4.27908992767334, "learning_rate": 4.9953188504838225e-08, "logits/chosen": -1.5245378017425537, "logits/rejected": -1.5128180980682373, "logps/chosen": -46.4404411315918, "logps/rejected": -49.49384689331055, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0004937785561196506, "rewards/margins": 1.8969347365782596e-05, "rewards/rejected": 0.0004748092032968998, "step": 490 }, { "epoch": 0.36023054755043227, "grad_norm": 2.822087049484253, "learning_rate": 4.993949053872834e-08, "logits/chosen": -1.5284955501556396, "logits/rejected": -1.5052189826965332, "logps/chosen": -42.6441535949707, "logps/rejected": -45.895713806152344, "loss": 0.6929, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0006938868900761008, "rewards/margins": 0.000535003375262022, "rewards/rejected": 0.00015888357302173972, "step": 500 }, { "epoch": 0.36743515850144093, "grad_norm": 2.8799259662628174, "learning_rate": 4.9924039429414086e-08, "logits/chosen": -1.639021873474121, "logits/rejected": -1.6214443445205688, "logps/chosen": -46.00009536743164, "logps/rejected": -47.9588508605957, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0007414157735183835, "rewards/margins": 0.00037929159589111805, "rewards/rejected": 0.00036212411941960454, "step": 510 }, { "epoch": 0.3746397694524496, "grad_norm": 3.5472185611724854, "learning_rate": 4.990683626304467e-08, "logits/chosen": -1.53446364402771, "logits/rejected": -1.5293452739715576, "logps/chosen": -53.89451217651367, "logps/rejected": -56.04365921020508, "loss": 0.6929, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0006791441701352596, "rewards/margins": 0.0004991428577341139, "rewards/rejected": 0.00018000123964156955, "step": 520 }, { "epoch": 0.3818443804034582, "grad_norm": 3.2031264305114746, "learning_rate": 4.9887882248931646e-08, "logits/chosen": -1.4586400985717773, "logits/rejected": -1.4377648830413818, "logps/chosen": -46.408958435058594, "logps/rejected": -47.54657745361328, "loss": 0.6929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0010369222145527601, "rewards/margins": 0.0005174219841137528, "rewards/rejected": 0.0005195002304390073, "step": 530 }, { "epoch": 0.38904899135446686, "grad_norm": 3.1673731803894043, "learning_rate": 4.986717871946393e-08, "logits/chosen": -1.485285997390747, "logits/rejected": -1.464300274848938, "logps/chosen": -45.8732795715332, "logps/rejected": -47.80138397216797, "loss": 0.6928, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0009578888420946896, "rewards/margins": 0.0006758830859325826, "rewards/rejected": 0.00028200578526593745, "step": 540 }, { "epoch": 0.3962536023054755, "grad_norm": 3.0882983207702637, "learning_rate": 4.984472713001416e-08, "logits/chosen": -1.4302603006362915, "logits/rejected": -1.421942949295044, "logps/chosen": -48.35163879394531, "logps/rejected": -48.36573028564453, "loss": 0.6928, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0009299576049670577, "rewards/margins": 0.0007230864139273763, "rewards/rejected": 0.00020687119103968143, "step": 550 }, { "epoch": 0.4034582132564842, "grad_norm": 3.2402331829071045, "learning_rate": 4.982052905883637e-08, "logits/chosen": -1.573286533355713, "logits/rejected": -1.5629457235336304, "logps/chosen": -48.487220764160156, "logps/rejected": -49.93341827392578, "loss": 0.6929, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0006783484714105725, "rewards/margins": 0.000492787454277277, "rewards/rejected": 0.00018556095892563462, "step": 560 }, { "epoch": 0.4106628242074928, "grad_norm": 2.9258902072906494, "learning_rate": 4.979458620695505e-08, "logits/chosen": -1.5526586771011353, "logits/rejected": -1.5234354734420776, "logps/chosen": -52.4578857421875, "logps/rejected": -54.48802947998047, "loss": 0.6927, "rewards/accuracies": 0.625, "rewards/chosen": 0.0012387813767418265, "rewards/margins": 0.0009542113984934986, "rewards/rejected": 0.0002845699491444975, "step": 570 }, { "epoch": 0.41786743515850144, "grad_norm": 3.176051139831543, "learning_rate": 4.976690039804555e-08, "logits/chosen": -1.5767595767974854, "logits/rejected": -1.563197374343872, "logps/chosen": -42.638832092285156, "logps/rejected": -44.07222366333008, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0005800885264761746, "rewards/margins": 0.0003046609926968813, "rewards/rejected": 0.00027542750467546284, "step": 580 }, { "epoch": 0.4250720461095101, "grad_norm": 2.7319722175598145, "learning_rate": 4.973747357830592e-08, "logits/chosen": -1.5270984172821045, "logits/rejected": -1.525622010231018, "logps/chosen": -47.51670837402344, "logps/rejected": -53.14258575439453, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0010576354106888175, "rewards/margins": 0.0008552650106139481, "rewards/rejected": 0.0002023702982114628, "step": 590 }, { "epoch": 0.4322766570605187, "grad_norm": 2.7828357219696045, "learning_rate": 4.970630781632009e-08, "logits/chosen": -1.6297931671142578, "logits/rejected": -1.6194136142730713, "logps/chosen": -45.408538818359375, "logps/rejected": -49.090171813964844, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0008878445369191468, "rewards/margins": 0.0009432813385501504, "rewards/rejected": -5.54367907170672e-05, "step": 600 }, { "epoch": 0.43948126801152737, "grad_norm": 3.927708148956299, "learning_rate": 4.967340530291242e-08, "logits/chosen": -1.534347414970398, "logits/rejected": -1.5172450542449951, "logps/chosen": -50.444236755371094, "logps/rejected": -51.08340835571289, "loss": 0.6929, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0008061706321313977, "rewards/margins": 0.00048699689796194434, "rewards/rejected": 0.00031917367596179247, "step": 610 }, { "epoch": 0.44668587896253603, "grad_norm": 2.691185235977173, "learning_rate": 4.9638768350993755e-08, "logits/chosen": -1.5679445266723633, "logits/rejected": -1.553625464439392, "logps/chosen": -42.3968505859375, "logps/rejected": -44.43559265136719, "loss": 0.693, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0008928319439291954, "rewards/margins": 0.0002728732652030885, "rewards/rejected": 0.000619958620518446, "step": 620 }, { "epoch": 0.4538904899135447, "grad_norm": 2.364649534225464, "learning_rate": 4.9602399395398786e-08, "logits/chosen": -1.570885419845581, "logits/rejected": -1.5635267496109009, "logps/chosen": -43.05287551879883, "logps/rejected": -46.550086975097656, "loss": 0.6928, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0010358591098338366, "rewards/margins": 0.000793623854406178, "rewards/rejected": 0.00024223529908340424, "step": 630 }, { "epoch": 0.4610951008645533, "grad_norm": 2.915830612182617, "learning_rate": 4.9564300992714914e-08, "logits/chosen": -1.428993821144104, "logits/rejected": -1.4243650436401367, "logps/chosen": -45.36574935913086, "logps/rejected": -48.004005432128906, "loss": 0.6927, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.00126446015201509, "rewards/margins": 0.0009202055516652763, "rewards/rejected": 0.00034425462945364416, "step": 640 }, { "epoch": 0.46829971181556196, "grad_norm": 3.4686129093170166, "learning_rate": 4.952447582110253e-08, "logits/chosen": -1.6131556034088135, "logits/rejected": -1.5842258930206299, "logps/chosen": -45.44933319091797, "logps/rejected": -45.383323669433594, "loss": 0.6929, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.001398588763549924, "rewards/margins": 0.0005807363195344806, "rewards/rejected": 0.0008178524440154433, "step": 650 }, { "epoch": 0.4755043227665706, "grad_norm": 3.4266490936279297, "learning_rate": 4.948292668010676e-08, "logits/chosen": -1.5424816608428955, "logits/rejected": -1.54043447971344, "logps/chosen": -47.142127990722656, "logps/rejected": -50.01351547241211, "loss": 0.6926, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0012382261920720339, "rewards/margins": 0.0010371087118983269, "rewards/rejected": 0.0002011175238294527, "step": 660 }, { "epoch": 0.4827089337175792, "grad_norm": 3.418168306350708, "learning_rate": 4.943965649046064e-08, "logits/chosen": -1.5016462802886963, "logits/rejected": -1.4743115901947021, "logps/chosen": -49.834571838378906, "logps/rejected": -51.15303039550781, "loss": 0.6929, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.000952904112637043, "rewards/margins": 0.0004903805674985051, "rewards/rejected": 0.00046252348693087697, "step": 670 }, { "epoch": 0.4899135446685879, "grad_norm": 4.634679794311523, "learning_rate": 4.9394668293879835e-08, "logits/chosen": -1.4445552825927734, "logits/rejected": -1.4312325716018677, "logps/chosen": -49.70585250854492, "logps/rejected": -49.58285903930664, "loss": 0.6926, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0017881987150758505, "rewards/margins": 0.0011380156502127647, "rewards/rejected": 0.0006501831230707467, "step": 680 }, { "epoch": 0.49711815561959655, "grad_norm": 3.3351666927337646, "learning_rate": 4.93479652528488e-08, "logits/chosen": -1.5312552452087402, "logits/rejected": -1.5206135511398315, "logps/chosen": -47.807212829589844, "logps/rejected": -50.62922286987305, "loss": 0.6926, "rewards/accuracies": 0.625, "rewards/chosen": 0.0014403645182028413, "rewards/margins": 0.0011131230276077986, "rewards/rejected": 0.00032724151969887316, "step": 690 }, { "epoch": 0.5043227665706052, "grad_norm": 2.796025276184082, "learning_rate": 4.929955065039848e-08, "logits/chosen": -1.5448652505874634, "logits/rejected": -1.5314280986785889, "logps/chosen": -46.463924407958984, "logps/rejected": -49.261375427246094, "loss": 0.6927, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0013490957207977772, "rewards/margins": 0.000978103606030345, "rewards/rejected": 0.0003709921729750931, "step": 700 }, { "epoch": 0.5115273775216138, "grad_norm": 2.809128761291504, "learning_rate": 4.92494278898755e-08, "logits/chosen": -1.525039792060852, "logits/rejected": -1.5088342428207397, "logps/chosen": -41.34796905517578, "logps/rejected": -43.392948150634766, "loss": 0.6927, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0013073389418423176, "rewards/margins": 0.0009176974999718368, "rewards/rejected": 0.0003896414418704808, "step": 710 }, { "epoch": 0.5187319884726225, "grad_norm": 3.328831911087036, "learning_rate": 4.9197600494702955e-08, "logits/chosen": -1.4957438707351685, "logits/rejected": -1.4807456731796265, "logps/chosen": -49.30406951904297, "logps/rejected": -52.457130432128906, "loss": 0.6927, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.001288148807361722, "rewards/margins": 0.0009355359943583608, "rewards/rejected": 0.0003526128421071917, "step": 720 }, { "epoch": 0.5259365994236311, "grad_norm": 2.870708465576172, "learning_rate": 4.9144072108132725e-08, "logits/chosen": -1.5102272033691406, "logits/rejected": -1.490850806236267, "logps/chosen": -48.92375946044922, "logps/rejected": -51.040565490722656, "loss": 0.6928, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0015529112424701452, "rewards/margins": 0.0007295840186998248, "rewards/rejected": 0.0008233273401856422, "step": 730 }, { "epoch": 0.5331412103746398, "grad_norm": 2.905596971511841, "learning_rate": 4.908884649298937e-08, "logits/chosen": -1.5039650201797485, "logits/rejected": -1.500274419784546, "logps/chosen": -46.73409652709961, "logps/rejected": -46.288230895996094, "loss": 0.6929, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0010712891817092896, "rewards/margins": 0.0005624311743304133, "rewards/rejected": 0.000508858182001859, "step": 740 }, { "epoch": 0.5403458213256485, "grad_norm": 2.8765416145324707, "learning_rate": 4.903192753140557e-08, "logits/chosen": -1.5269978046417236, "logits/rejected": -1.5104032754898071, "logps/chosen": -48.90652084350586, "logps/rejected": -50.098411560058594, "loss": 0.6924, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.001532660680823028, "rewards/margins": 0.0014079209649935365, "rewards/rejected": 0.0001247395994141698, "step": 750 }, { "epoch": 0.547550432276657, "grad_norm": 3.3323886394500732, "learning_rate": 4.897331922454931e-08, "logits/chosen": -1.453380823135376, "logits/rejected": -1.4518440961837769, "logps/chosen": -45.53496551513672, "logps/rejected": -48.62436294555664, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0013867730740457773, "rewards/margins": 0.000980939483270049, "rewards/rejected": 0.0004058335907757282, "step": 760 }, { "epoch": 0.5547550432276657, "grad_norm": 3.25243878364563, "learning_rate": 4.891302569234256e-08, "logits/chosen": -1.4739805459976196, "logits/rejected": -1.467893362045288, "logps/chosen": -43.22185134887695, "logps/rejected": -45.91840744018555, "loss": 0.6923, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0018970195669680834, "rewards/margins": 0.001757220714353025, "rewards/rejected": 0.00013979877985548228, "step": 770 }, { "epoch": 0.5619596541786743, "grad_norm": 2.809793710708618, "learning_rate": 4.8851051173171656e-08, "logits/chosen": -1.4990657567977905, "logits/rejected": -1.4895389080047607, "logps/chosen": -48.396583557128906, "logps/rejected": -50.17659378051758, "loss": 0.6926, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0017727743834257126, "rewards/margins": 0.0011530198389664292, "rewards/rejected": 0.0006197548937052488, "step": 780 }, { "epoch": 0.569164265129683, "grad_norm": 2.892955780029297, "learning_rate": 4.87874000235894e-08, "logits/chosen": -1.5470690727233887, "logits/rejected": -1.5371184349060059, "logps/chosen": -49.8763542175293, "logps/rejected": -53.440277099609375, "loss": 0.6924, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0020356441382318735, "rewards/margins": 0.001580337411724031, "rewards/rejected": 0.0004553066974040121, "step": 790 }, { "epoch": 0.5763688760806917, "grad_norm": 3.3299455642700195, "learning_rate": 4.872207671800876e-08, "logits/chosen": -1.5248336791992188, "logits/rejected": -1.5134174823760986, "logps/chosen": -46.861637115478516, "logps/rejected": -47.89669418334961, "loss": 0.6924, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0019999851938337088, "rewards/margins": 0.0015228716656565666, "rewards/rejected": 0.0004771137028001249, "step": 800 }, { "epoch": 0.5835734870317003, "grad_norm": 2.6876394748687744, "learning_rate": 4.865508584838841e-08, "logits/chosen": -1.5174375772476196, "logits/rejected": -1.5208656787872314, "logps/chosen": -44.744140625, "logps/rejected": -47.87763214111328, "loss": 0.6926, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00140316691249609, "rewards/margins": 0.0010685885790735483, "rewards/rejected": 0.000334578420734033, "step": 810 }, { "epoch": 0.590778097982709, "grad_norm": 2.7165634632110596, "learning_rate": 4.858643212390985e-08, "logits/chosen": -1.5524417161941528, "logits/rejected": -1.5306005477905273, "logps/chosen": -46.926429748535156, "logps/rejected": -47.55287170410156, "loss": 0.6924, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0020128923933953047, "rewards/margins": 0.001531012705527246, "rewards/rejected": 0.0004818797460757196, "step": 820 }, { "epoch": 0.5979827089337176, "grad_norm": 2.6394128799438477, "learning_rate": 4.851612037064643e-08, "logits/chosen": -1.510181188583374, "logits/rejected": -1.5031936168670654, "logps/chosen": -41.7721061706543, "logps/rejected": -44.63286590576172, "loss": 0.6923, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0025131129659712315, "rewards/margins": 0.0017323486972600222, "rewards/rejected": 0.000780764443334192, "step": 830 }, { "epoch": 0.6051873198847262, "grad_norm": 2.2611138820648193, "learning_rate": 4.8444155531224065e-08, "logits/chosen": -1.519513487815857, "logits/rejected": -1.5121686458587646, "logps/chosen": -47.21880340576172, "logps/rejected": -47.49650955200195, "loss": 0.6923, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0021171828266233206, "rewards/margins": 0.0016254640650004148, "rewards/rejected": 0.0004917188780382276, "step": 840 }, { "epoch": 0.6123919308357348, "grad_norm": 3.8291754722595215, "learning_rate": 4.8370542664473805e-08, "logits/chosen": -1.52823805809021, "logits/rejected": -1.5173234939575195, "logps/chosen": -47.172088623046875, "logps/rejected": -50.45137023925781, "loss": 0.6922, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0022985998075455427, "rewards/margins": 0.0019912240095436573, "rewards/rejected": 0.00030737603083252907, "step": 850 }, { "epoch": 0.6195965417867435, "grad_norm": 2.788311719894409, "learning_rate": 4.829528694507624e-08, "logits/chosen": -1.5345584154129028, "logits/rejected": -1.5193954706192017, "logps/chosen": -56.846717834472656, "logps/rejected": -56.777488708496094, "loss": 0.6922, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.002310269046574831, "rewards/margins": 0.0019274738151580095, "rewards/rejected": 0.0003827956388704479, "step": 860 }, { "epoch": 0.6268011527377522, "grad_norm": 3.0898354053497314, "learning_rate": 4.821839366319768e-08, "logits/chosen": -1.5738890171051025, "logits/rejected": -1.5631177425384521, "logps/chosen": -47.592525482177734, "logps/rejected": -50.608699798583984, "loss": 0.6922, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.001986520830541849, "rewards/margins": 0.0018442177679389715, "rewards/rejected": 0.00014230303349904716, "step": 870 }, { "epoch": 0.6340057636887608, "grad_norm": 3.0527920722961426, "learning_rate": 4.813986822411833e-08, "logits/chosen": -1.594681978225708, "logits/rejected": -1.586861491203308, "logps/chosen": -46.469261169433594, "logps/rejected": -47.6103630065918, "loss": 0.6924, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0018088910728693008, "rewards/margins": 0.0015493319369852543, "rewards/rejected": 0.0002595590485725552, "step": 880 }, { "epoch": 0.6412103746397695, "grad_norm": 2.957890748977661, "learning_rate": 4.805971614785231e-08, "logits/chosen": -1.5932258367538452, "logits/rejected": -1.5828759670257568, "logps/chosen": -44.192230224609375, "logps/rejected": -45.84956741333008, "loss": 0.6922, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.002317091915756464, "rewards/margins": 0.0019270635675638914, "rewards/rejected": 0.0003900282899849117, "step": 890 }, { "epoch": 0.6484149855907781, "grad_norm": 3.1973183155059814, "learning_rate": 4.797794306875963e-08, "logits/chosen": -1.4426988363265991, "logits/rejected": -1.4457299709320068, "logps/chosen": -52.90196990966797, "logps/rejected": -56.030479431152344, "loss": 0.6924, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0017495963256806135, "rewards/margins": 0.0014681669417768717, "rewards/rejected": 0.0002814295585267246, "step": 900 }, { "epoch": 0.6556195965417867, "grad_norm": 3.1332459449768066, "learning_rate": 4.7894554735150076e-08, "logits/chosen": -1.4939850568771362, "logits/rejected": -1.486452579498291, "logps/chosen": -50.417274475097656, "logps/rejected": -51.93208694458008, "loss": 0.6925, "rewards/accuracies": 0.5625, "rewards/chosen": 0.001817676005885005, "rewards/margins": 0.001303942990489304, "rewards/rejected": 0.0005137331318110228, "step": 910 }, { "epoch": 0.6628242074927954, "grad_norm": 2.500265598297119, "learning_rate": 4.7809557008879185e-08, "logits/chosen": -1.5261718034744263, "logits/rejected": -1.5143718719482422, "logps/chosen": -42.04478073120117, "logps/rejected": -43.99352264404297, "loss": 0.6919, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0026101425755769014, "rewards/margins": 0.0025680093094706535, "rewards/rejected": 4.213328429614194e-05, "step": 920 }, { "epoch": 0.670028818443804, "grad_norm": 3.3175556659698486, "learning_rate": 4.772295586493613e-08, "logits/chosen": -1.5920884609222412, "logits/rejected": -1.5787022113800049, "logps/chosen": -46.341896057128906, "logps/rejected": -48.80807113647461, "loss": 0.6921, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.002759728580713272, "rewards/margins": 0.0021550802048295736, "rewards/rejected": 0.0006046487251296639, "step": 930 }, { "epoch": 0.6772334293948127, "grad_norm": 2.305763006210327, "learning_rate": 4.763475739102374e-08, "logits/chosen": -1.473327398300171, "logits/rejected": -1.468834400177002, "logps/chosen": -54.86418914794922, "logps/rejected": -55.64992141723633, "loss": 0.692, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0026435963809490204, "rewards/margins": 0.002390109235420823, "rewards/rejected": 0.0002534873492550105, "step": 940 }, { "epoch": 0.6844380403458213, "grad_norm": 2.9104561805725098, "learning_rate": 4.754496778713054e-08, "logits/chosen": -1.4290201663970947, "logits/rejected": -1.442920446395874, "logps/chosen": -46.24345397949219, "logps/rejected": -50.873817443847656, "loss": 0.6924, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.002895154058933258, "rewards/margins": 0.0014690979151055217, "rewards/rejected": 0.0014260562602430582, "step": 950 }, { "epoch": 0.69164265129683, "grad_norm": 2.9715263843536377, "learning_rate": 4.7453593365094926e-08, "logits/chosen": -1.564841866493225, "logits/rejected": -1.5565474033355713, "logps/chosen": -48.927528381347656, "logps/rejected": -51.33686065673828, "loss": 0.6923, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0019558072090148926, "rewards/margins": 0.001724687055684626, "rewards/rejected": 0.00023112029884941876, "step": 960 }, { "epoch": 0.6988472622478387, "grad_norm": 3.6742053031921387, "learning_rate": 4.736064054816145e-08, "logits/chosen": -1.5793306827545166, "logits/rejected": -1.5710303783416748, "logps/chosen": -44.4240837097168, "logps/rejected": -47.71710968017578, "loss": 0.6918, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.003227539826184511, "rewards/margins": 0.0027411046903580427, "rewards/rejected": 0.00048643528134562075, "step": 970 }, { "epoch": 0.7060518731988472, "grad_norm": 2.770151376724243, "learning_rate": 4.726611587052933e-08, "logits/chosen": -1.430496335029602, "logits/rejected": -1.4332740306854248, "logps/chosen": -50.69718551635742, "logps/rejected": -55.78377151489258, "loss": 0.6925, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.002387463580816984, "rewards/margins": 0.0013439650647342205, "rewards/rejected": 0.00104349828325212, "step": 980 }, { "epoch": 0.7132564841498559, "grad_norm": 3.8736650943756104, "learning_rate": 4.71700259768931e-08, "logits/chosen": -1.5389162302017212, "logits/rejected": -1.5327080488204956, "logps/chosen": -50.4788932800293, "logps/rejected": -51.98632049560547, "loss": 0.6923, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0025045094080269337, "rewards/margins": 0.0018028088379651308, "rewards/rejected": 0.0007017005700618029, "step": 990 }, { "epoch": 0.7204610951008645, "grad_norm": 2.781522035598755, "learning_rate": 4.707237762197549e-08, "logits/chosen": -1.5228068828582764, "logits/rejected": -1.5111085176467896, "logps/chosen": -47.027870178222656, "logps/rejected": -49.01156997680664, "loss": 0.6923, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0033112752716988325, "rewards/margins": 0.0017588225891813636, "rewards/rejected": 0.0015524530317634344, "step": 1000 }, { "epoch": 0.7276657060518732, "grad_norm": 3.929330825805664, "learning_rate": 4.697317767005265e-08, "logits/chosen": -1.5303703546524048, "logits/rejected": -1.5189971923828125, "logps/chosen": -43.00761795043945, "logps/rejected": -44.80951690673828, "loss": 0.692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0029724142514169216, "rewards/margins": 0.0022910248953849077, "rewards/rejected": 0.0006813893560320139, "step": 1010 }, { "epoch": 0.7348703170028819, "grad_norm": 2.842343807220459, "learning_rate": 4.6872433094471577e-08, "logits/chosen": -1.548322319984436, "logits/rejected": -1.5355441570281982, "logps/chosen": -46.55397415161133, "logps/rejected": -48.415069580078125, "loss": 0.6923, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0027256556786596775, "rewards/margins": 0.001606556586921215, "rewards/rejected": 0.0011190990917384624, "step": 1020 }, { "epoch": 0.7420749279538905, "grad_norm": 2.672954559326172, "learning_rate": 4.677015097715994e-08, "logits/chosen": -1.4801313877105713, "logits/rejected": -1.4725544452667236, "logps/chosen": -43.47493362426758, "logps/rejected": -46.784507751464844, "loss": 0.6917, "rewards/accuracies": 0.53125, "rewards/chosen": 0.003656183136627078, "rewards/margins": 0.0029149625916033983, "rewards/rejected": 0.0007412207196466625, "step": 1030 }, { "epoch": 0.7492795389048992, "grad_norm": 2.4194390773773193, "learning_rate": 4.666633850812825e-08, "logits/chosen": -1.5239073038101196, "logits/rejected": -1.507598638534546, "logps/chosen": -46.25876998901367, "logps/rejected": -48.20856475830078, "loss": 0.6921, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0025006632786244154, "rewards/margins": 0.0021574136335402727, "rewards/rejected": 0.0003432496450841427, "step": 1040 }, { "epoch": 0.7564841498559077, "grad_norm": 2.3646445274353027, "learning_rate": 4.656100298496439e-08, "logits/chosen": -1.4333226680755615, "logits/rejected": -1.4197046756744385, "logps/chosen": -41.23944091796875, "logps/rejected": -44.33473205566406, "loss": 0.6916, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0036236427258700132, "rewards/margins": 0.0031666383147239685, "rewards/rejected": 0.00045700446935370564, "step": 1050 }, { "epoch": 0.7636887608069164, "grad_norm": 3.0884833335876465, "learning_rate": 4.6454151812320715e-08, "logits/chosen": -1.5102076530456543, "logits/rejected": -1.484427809715271, "logps/chosen": -47.206207275390625, "logps/rejected": -48.68370819091797, "loss": 0.6917, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.003298333380371332, "rewards/margins": 0.0030018885154277086, "rewards/rejected": 0.00029644512687809765, "step": 1060 }, { "epoch": 0.770893371757925, "grad_norm": 3.838554859161377, "learning_rate": 4.6345792501393434e-08, "logits/chosen": -1.4992831945419312, "logits/rejected": -1.4937461614608765, "logps/chosen": -53.69269943237305, "logps/rejected": -57.73991012573242, "loss": 0.6916, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.00392124941572547, "rewards/margins": 0.0031723701395094395, "rewards/rejected": 0.0007488794508390129, "step": 1070 }, { "epoch": 0.7780979827089337, "grad_norm": 3.0563292503356934, "learning_rate": 4.6235932669394676e-08, "logits/chosen": -1.5073121786117554, "logits/rejected": -1.4990915060043335, "logps/chosen": -48.08951950073242, "logps/rejected": -51.09489822387695, "loss": 0.6915, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004118567798286676, "rewards/margins": 0.0033183638006448746, "rewards/rejected": 0.0008002036483958364, "step": 1080 }, { "epoch": 0.7853025936599424, "grad_norm": 3.5673720836639404, "learning_rate": 4.612458003901698e-08, "logits/chosen": -1.5294950008392334, "logits/rejected": -1.523989200592041, "logps/chosen": -52.46628952026367, "logps/rejected": -56.08174514770508, "loss": 0.6913, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0030290051363408566, "rewards/margins": 0.0037631914019584656, "rewards/rejected": -0.0007341863238252699, "step": 1090 }, { "epoch": 0.792507204610951, "grad_norm": 3.1977949142456055, "learning_rate": 4.6011742437890476e-08, "logits/chosen": -1.538527011871338, "logits/rejected": -1.5158568620681763, "logps/chosen": -47.33205795288086, "logps/rejected": -48.808815002441406, "loss": 0.6919, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0029173900838941336, "rewards/margins": 0.002560637192800641, "rewards/rejected": 0.0003567528910934925, "step": 1100 }, { "epoch": 0.7997118155619597, "grad_norm": 2.2893710136413574, "learning_rate": 4.589742779803259e-08, "logits/chosen": -1.5475555658340454, "logits/rejected": -1.5350109338760376, "logps/chosen": -46.484588623046875, "logps/rejected": -48.64214324951172, "loss": 0.6919, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0027208970859646797, "rewards/margins": 0.002428521169349551, "rewards/rejected": 0.0002923758584074676, "step": 1110 }, { "epoch": 0.8069164265129684, "grad_norm": 2.6367037296295166, "learning_rate": 4.5781644155290486e-08, "logits/chosen": -1.4872174263000488, "logits/rejected": -1.4771387577056885, "logps/chosen": -45.5018424987793, "logps/rejected": -46.52324676513672, "loss": 0.6913, "rewards/accuracies": 0.625, "rewards/chosen": 0.004312233533710241, "rewards/margins": 0.0036662842612713575, "rewards/rejected": 0.0006459490396082401, "step": 1120 }, { "epoch": 0.8141210374639769, "grad_norm": 2.842519521713257, "learning_rate": 4.566439964877613e-08, "logits/chosen": -1.5221550464630127, "logits/rejected": -1.5159690380096436, "logps/chosen": -43.44307327270508, "logps/rejected": -45.21900177001953, "loss": 0.6922, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.002209798665717244, "rewards/margins": 0.0018761273240670562, "rewards/rejected": 0.00033367107971571386, "step": 1130 }, { "epoch": 0.8213256484149856, "grad_norm": 2.909646511077881, "learning_rate": 4.554570252029421e-08, "logits/chosen": -1.5703493356704712, "logits/rejected": -1.5615266561508179, "logps/chosen": -46.792572021484375, "logps/rejected": -49.12213134765625, "loss": 0.691, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.004547302611172199, "rewards/margins": 0.004351162351667881, "rewards/rejected": 0.00019614025950431824, "step": 1140 }, { "epoch": 0.8285302593659942, "grad_norm": 2.6748578548431396, "learning_rate": 4.542556111376274e-08, "logits/chosen": -1.5654969215393066, "logits/rejected": -1.554164171218872, "logps/chosen": -48.726966857910156, "logps/rejected": -50.896202087402344, "loss": 0.6917, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.003084657248109579, "rewards/margins": 0.0029002639930695295, "rewards/rejected": 0.00018439313862472773, "step": 1150 }, { "epoch": 0.8357348703170029, "grad_norm": 3.0902979373931885, "learning_rate": 4.5303983874626506e-08, "logits/chosen": -1.5411643981933594, "logits/rejected": -1.5292751789093018, "logps/chosen": -50.65459060668945, "logps/rejected": -51.3670654296875, "loss": 0.6919, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0027450700290501118, "rewards/margins": 0.0025953901931643486, "rewards/rejected": 0.0001496800541644916, "step": 1160 }, { "epoch": 0.8429394812680115, "grad_norm": 3.544069290161133, "learning_rate": 4.518097934926339e-08, "logits/chosen": -1.4598362445831299, "logits/rejected": -1.4344513416290283, "logps/chosen": -46.78550338745117, "logps/rejected": -46.877071380615234, "loss": 0.6916, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0031762472353875637, "rewards/margins": 0.0031154484022408724, "rewards/rejected": 6.079913509893231e-05, "step": 1170 }, { "epoch": 0.8501440922190202, "grad_norm": 3.858111619949341, "learning_rate": 4.505655618438363e-08, "logits/chosen": -1.424443244934082, "logits/rejected": -1.4106619358062744, "logps/chosen": -48.665103912353516, "logps/rejected": -49.68077850341797, "loss": 0.6917, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0031317579559981823, "rewards/margins": 0.002937522018328309, "rewards/rejected": 0.00019423609774094075, "step": 1180 }, { "epoch": 0.8573487031700289, "grad_norm": 2.923030376434326, "learning_rate": 4.4930723126421945e-08, "logits/chosen": -1.589641809463501, "logits/rejected": -1.566506266593933, "logps/chosen": -49.06594467163086, "logps/rejected": -50.39019012451172, "loss": 0.6915, "rewards/accuracies": 0.59375, "rewards/chosen": 0.003074865322560072, "rewards/margins": 0.0032937501091510057, "rewards/rejected": -0.00021888469927944243, "step": 1190 }, { "epoch": 0.8645533141210374, "grad_norm": 3.2700655460357666, "learning_rate": 4.48034890209227e-08, "logits/chosen": -1.4649052619934082, "logits/rejected": -1.4451799392700195, "logps/chosen": -51.660499572753906, "logps/rejected": -53.60346221923828, "loss": 0.6915, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0028472330886870623, "rewards/margins": 0.003242159727960825, "rewards/rejected": -0.0003949264937546104, "step": 1200 }, { "epoch": 0.8717579250720461, "grad_norm": 2.6366593837738037, "learning_rate": 4.4674862811918155e-08, "logits/chosen": -1.4467828273773193, "logits/rejected": -1.4440934658050537, "logps/chosen": -43.3638801574707, "logps/rejected": -46.49591064453125, "loss": 0.6916, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0036537572741508484, "rewards/margins": 0.0032189779449254274, "rewards/rejected": 0.00043477994040586054, "step": 1210 }, { "epoch": 0.8789625360230547, "grad_norm": 3.452131748199463, "learning_rate": 4.454485354129966e-08, "logits/chosen": -1.4948376417160034, "logits/rejected": -1.4891588687896729, "logps/chosen": -46.58236312866211, "logps/rejected": -50.1088981628418, "loss": 0.6912, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003989598713815212, "rewards/margins": 0.0038787820376455784, "rewards/rejected": 0.00011081698175985366, "step": 1220 }, { "epoch": 0.8861671469740634, "grad_norm": 2.979247570037842, "learning_rate": 4.4413470348182124e-08, "logits/chosen": -1.446300745010376, "logits/rejected": -1.4229693412780762, "logps/chosen": -48.74427032470703, "logps/rejected": -50.7912483215332, "loss": 0.6913, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0037915543653070927, "rewards/margins": 0.003707052441313863, "rewards/rejected": 8.450145833194256e-05, "step": 1230 }, { "epoch": 0.8933717579250721, "grad_norm": 3.6042563915252686, "learning_rate": 4.42807224682615e-08, "logits/chosen": -1.5023893117904663, "logits/rejected": -1.4898313283920288, "logps/chosen": -42.887596130371094, "logps/rejected": -45.985328674316406, "loss": 0.6909, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0037057988811284304, "rewards/margins": 0.0045966231264173985, "rewards/rejected": -0.0008908241870813072, "step": 1240 }, { "epoch": 0.9005763688760807, "grad_norm": 2.529266595840454, "learning_rate": 4.4146619233165604e-08, "logits/chosen": -1.5505568981170654, "logits/rejected": -1.5460566282272339, "logps/chosen": -50.64966583251953, "logps/rejected": -54.0062370300293, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.003028963692486286, "rewards/margins": 0.002853470155969262, "rewards/rejected": 0.00017549384210724384, "step": 1250 }, { "epoch": 0.9077809798270894, "grad_norm": 3.1239147186279297, "learning_rate": 4.4011170069798126e-08, "logits/chosen": -1.505274772644043, "logits/rejected": -1.5213569402694702, "logps/chosen": -46.48722457885742, "logps/rejected": -53.446556091308594, "loss": 0.6916, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.002888137474656105, "rewards/margins": 0.0031193068716675043, "rewards/rejected": -0.00023116909142117947, "step": 1260 }, { "epoch": 0.9149855907780979, "grad_norm": 3.1936755180358887, "learning_rate": 4.387438449967594e-08, "logits/chosen": -1.4545634984970093, "logits/rejected": -1.4413819313049316, "logps/chosen": -45.36618423461914, "logps/rejected": -47.985939025878906, "loss": 0.6905, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.004917544312775135, "rewards/margins": 0.005304847843945026, "rewards/rejected": -0.0003873028326779604, "step": 1270 }, { "epoch": 0.9221902017291066, "grad_norm": 3.4427430629730225, "learning_rate": 4.373627213825983e-08, "logits/chosen": -1.6089175939559937, "logits/rejected": -1.598972201347351, "logps/chosen": -46.191490173339844, "logps/rejected": -49.68521499633789, "loss": 0.6906, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.004848025739192963, "rewards/margins": 0.0052161673083901405, "rewards/rejected": -0.00036814124905504286, "step": 1280 }, { "epoch": 0.9293948126801153, "grad_norm": 2.483668088912964, "learning_rate": 4.359684269427848e-08, "logits/chosen": -1.5662097930908203, "logits/rejected": -1.5633054971694946, "logps/chosen": -45.598777770996094, "logps/rejected": -49.13982009887695, "loss": 0.691, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.00441353302448988, "rewards/margins": 0.00428149476647377, "rewards/rejected": 0.00013203815615270287, "step": 1290 }, { "epoch": 0.9365994236311239, "grad_norm": 3.012500286102295, "learning_rate": 4.34561059690461e-08, "logits/chosen": -1.6093149185180664, "logits/rejected": -1.6080818176269531, "logps/chosen": -47.28533172607422, "logps/rejected": -48.898536682128906, "loss": 0.6922, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.002217743080109358, "rewards/margins": 0.001927342265844345, "rewards/rejected": 0.0002904009015765041, "step": 1300 }, { "epoch": 0.9438040345821326, "grad_norm": 2.672034740447998, "learning_rate": 4.3314071855773314e-08, "logits/chosen": -1.5703166723251343, "logits/rejected": -1.5722987651824951, "logps/chosen": -41.89187240600586, "logps/rejected": -45.08455276489258, "loss": 0.6914, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0034990236163139343, "rewards/margins": 0.0036229409743100405, "rewards/rejected": -0.00012391725613269955, "step": 1310 }, { "epoch": 0.9510086455331412, "grad_norm": 3.103219985961914, "learning_rate": 4.3170750338871806e-08, "logits/chosen": -1.5061099529266357, "logits/rejected": -1.491026759147644, "logps/chosen": -46.509437561035156, "logps/rejected": -49.67821502685547, "loss": 0.6909, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.004872228484600782, "rewards/margins": 0.004575548693537712, "rewards/rejected": 0.0002966797037515789, "step": 1320 }, { "epoch": 0.9582132564841499, "grad_norm": 3.005204677581787, "learning_rate": 4.3026151493252414e-08, "logits/chosen": -1.5492498874664307, "logits/rejected": -1.5279552936553955, "logps/chosen": -51.47428512573242, "logps/rejected": -52.905067443847656, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": 0.004221352748572826, "rewards/margins": 0.0048477440141141415, "rewards/rejected": -0.0006263910909183323, "step": 1330 }, { "epoch": 0.9654178674351584, "grad_norm": 3.5376358032226562, "learning_rate": 4.2880285483616895e-08, "logits/chosen": -1.533168077468872, "logits/rejected": -1.5301328897476196, "logps/chosen": -45.71091842651367, "logps/rejected": -49.009521484375, "loss": 0.6912, "rewards/accuracies": 0.59375, "rewards/chosen": 0.003885247278958559, "rewards/margins": 0.003925333730876446, "rewards/rejected": -4.0086473745759577e-05, "step": 1340 }, { "epoch": 0.9726224783861671, "grad_norm": 2.741413116455078, "learning_rate": 4.273316256374342e-08, "logits/chosen": -1.403411865234375, "logits/rejected": -1.3984428644180298, "logps/chosen": -52.229103088378906, "logps/rejected": -53.140968322753906, "loss": 0.6911, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0044440277852118015, "rewards/margins": 0.004066800232976675, "rewards/rejected": 0.00037722705747000873, "step": 1350 }, { "epoch": 0.9798270893371758, "grad_norm": 3.3762712478637695, "learning_rate": 4.258479307576576e-08, "logits/chosen": -1.5002410411834717, "logits/rejected": -1.495216727256775, "logps/chosen": -43.78204345703125, "logps/rejected": -45.711483001708984, "loss": 0.6903, "rewards/accuracies": 0.59375, "rewards/chosen": 0.005402544513344765, "rewards/margins": 0.005806138273328543, "rewards/rejected": -0.00040359393460676074, "step": 1360 }, { "epoch": 0.9870317002881844, "grad_norm": 2.7428503036499023, "learning_rate": 4.243518744944626e-08, "logits/chosen": -1.5039384365081787, "logits/rejected": -1.5001270771026611, "logps/chosen": -43.2661247253418, "logps/rejected": -47.12836456298828, "loss": 0.6908, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.004102085717022419, "rewards/margins": 0.004668028559535742, "rewards/rejected": -0.0005659434827975929, "step": 1370 }, { "epoch": 0.9942363112391931, "grad_norm": 3.406386137008667, "learning_rate": 4.22843562014427e-08, "logits/chosen": -1.4495034217834473, "logits/rejected": -1.4397344589233398, "logps/chosen": -46.86280059814453, "logps/rejected": -49.02347946166992, "loss": 0.6917, "rewards/accuracies": 0.625, "rewards/chosen": 0.0041961586102843285, "rewards/margins": 0.0028399438597261906, "rewards/rejected": 0.001356214052066207, "step": 1380 }, { "epoch": 1.0014409221902016, "grad_norm": 3.115910053253174, "learning_rate": 4.2132309934569e-08, "logits/chosen": -1.5670406818389893, "logits/rejected": -1.5619332790374756, "logps/chosen": -43.755001068115234, "logps/rejected": -46.139137268066406, "loss": 0.6912, "rewards/accuracies": 0.59375, "rewards/chosen": 0.005227426066994667, "rewards/margins": 0.003935725893825293, "rewards/rejected": 0.0012916993582621217, "step": 1390 }, { "epoch": 1.0086455331412103, "grad_norm": 2.4783718585968018, "learning_rate": 4.197905933704989e-08, "logits/chosen": -1.4311118125915527, "logits/rejected": -1.421370029449463, "logps/chosen": -47.266395568847656, "logps/rejected": -49.955780029296875, "loss": 0.6897, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005098997615277767, "rewards/margins": 0.007058604154735804, "rewards/rejected": -0.001959607470780611, "step": 1400 }, { "epoch": 1.015850144092219, "grad_norm": 2.7212026119232178, "learning_rate": 4.1824615181769577e-08, "logits/chosen": -1.486352562904358, "logits/rejected": -1.4931161403656006, "logps/chosen": -43.8072509765625, "logps/rejected": -47.75897216796875, "loss": 0.6901, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005026941187679768, "rewards/margins": 0.006201364565640688, "rewards/rejected": -0.0011744231451302767, "step": 1410 }, { "epoch": 1.0230547550432276, "grad_norm": 3.118551254272461, "learning_rate": 4.1668988325514434e-08, "logits/chosen": -1.524102807044983, "logits/rejected": -1.5137133598327637, "logps/chosen": -49.41641616821289, "logps/rejected": -52.12779998779297, "loss": 0.6899, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.004953524563461542, "rewards/margins": 0.00664835050702095, "rewards/rejected": -0.0016948258271440864, "step": 1420 }, { "epoch": 1.0302593659942363, "grad_norm": 3.0052857398986816, "learning_rate": 4.1512189708209844e-08, "logits/chosen": -1.5740153789520264, "logits/rejected": -1.5645209550857544, "logps/chosen": -38.22710418701172, "logps/rejected": -39.43300247192383, "loss": 0.6906, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0053091454319655895, "rewards/margins": 0.005104938056319952, "rewards/rejected": 0.00020420753571670502, "step": 1430 }, { "epoch": 1.037463976945245, "grad_norm": 3.6204934120178223, "learning_rate": 4.1354230352151143e-08, "logits/chosen": -1.5043399333953857, "logits/rejected": -1.4913597106933594, "logps/chosen": -56.45264434814453, "logps/rejected": -56.661643981933594, "loss": 0.6907, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00407161470502615, "rewards/margins": 0.004998006857931614, "rewards/rejected": -0.0009263925021514297, "step": 1440 }, { "epoch": 1.0446685878962536, "grad_norm": 2.5718061923980713, "learning_rate": 4.119512136122882e-08, "logits/chosen": -1.6087749004364014, "logits/rejected": -1.621289610862732, "logps/chosen": -42.30548095703125, "logps/rejected": -48.45243453979492, "loss": 0.6898, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.004055247642099857, "rewards/margins": 0.0067529031075537205, "rewards/rejected": -0.002697654766961932, "step": 1450 }, { "epoch": 1.0518731988472623, "grad_norm": 3.4756808280944824, "learning_rate": 4.103487392014795e-08, "logits/chosen": -1.4752939939498901, "logits/rejected": -1.4567186832427979, "logps/chosen": -46.37599563598633, "logps/rejected": -51.025108337402344, "loss": 0.6888, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.005677501205354929, "rewards/margins": 0.008715528063476086, "rewards/rejected": -0.0030380270909518003, "step": 1460 }, { "epoch": 1.059077809798271, "grad_norm": 2.909818649291992, "learning_rate": 4.087349929364192e-08, "logits/chosen": -1.565161108970642, "logits/rejected": -1.5442824363708496, "logps/chosen": -42.54533386230469, "logps/rejected": -45.89342498779297, "loss": 0.6895, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.00476106209680438, "rewards/margins": 0.007298412267118692, "rewards/rejected": -0.002537350170314312, "step": 1470 }, { "epoch": 1.0662824207492796, "grad_norm": 2.4459564685821533, "learning_rate": 4.0711008825680645e-08, "logits/chosen": -1.504279375076294, "logits/rejected": -1.4850876331329346, "logps/chosen": -47.31806945800781, "logps/rejected": -50.19545364379883, "loss": 0.6903, "rewards/accuracies": 0.625, "rewards/chosen": 0.004515786189585924, "rewards/margins": 0.005739896558225155, "rewards/rejected": -0.0012241104850545526, "step": 1480 }, { "epoch": 1.0734870317002883, "grad_norm": 3.5417470932006836, "learning_rate": 4.054741393867306e-08, "logits/chosen": -1.4754365682601929, "logits/rejected": -1.464170217514038, "logps/chosen": -54.07590866088867, "logps/rejected": -55.700416564941406, "loss": 0.6903, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0043455869890749454, "rewards/margins": 0.005824446678161621, "rewards/rejected": -0.0014788598055019975, "step": 1490 }, { "epoch": 1.080691642651297, "grad_norm": 2.9604134559631348, "learning_rate": 4.038272613266419e-08, "logits/chosen": -1.5457851886749268, "logits/rejected": -1.521743655204773, "logps/chosen": -44.90116882324219, "logps/rejected": -47.38478088378906, "loss": 0.6903, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.004831579513847828, "rewards/margins": 0.005809755530208349, "rewards/rejected": -0.0009781757835298777, "step": 1500 }, { "epoch": 1.0878962536023056, "grad_norm": 3.386324882507324, "learning_rate": 4.0216956984526784e-08, "logits/chosen": -1.5541696548461914, "logits/rejected": -1.5498359203338623, "logps/chosen": -42.88204574584961, "logps/rejected": -45.56418228149414, "loss": 0.6896, "rewards/accuracies": 0.625, "rewards/chosen": 0.0052653830498456955, "rewards/margins": 0.00714817363768816, "rewards/rejected": -0.0018827903550118208, "step": 1510 }, { "epoch": 1.0951008645533142, "grad_norm": 3.1095175743103027, "learning_rate": 4.0050118147147446e-08, "logits/chosen": -1.5181224346160889, "logits/rejected": -1.5101962089538574, "logps/chosen": -53.42927169799805, "logps/rejected": -52.1214714050293, "loss": 0.6922, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.002941600512713194, "rewards/margins": 0.00190871418453753, "rewards/rejected": 0.0010328865610063076, "step": 1520 }, { "epoch": 1.1023054755043227, "grad_norm": 3.0901896953582764, "learning_rate": 3.988222134860755e-08, "logits/chosen": -1.5638874769210815, "logits/rejected": -1.5509564876556396, "logps/chosen": -47.26706314086914, "logps/rejected": -51.6654167175293, "loss": 0.6898, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.005253427661955357, "rewards/margins": 0.00679327268153429, "rewards/rejected": -0.0015398439718410373, "step": 1530 }, { "epoch": 1.1095100864553313, "grad_norm": 3.111027717590332, "learning_rate": 3.9713278391358724e-08, "logits/chosen": -1.5749475955963135, "logits/rejected": -1.5632776021957397, "logps/chosen": -45.95053482055664, "logps/rejected": -49.204917907714844, "loss": 0.6903, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.005047272425144911, "rewards/margins": 0.005783633328974247, "rewards/rejected": -0.0007363610784523189, "step": 1540 }, { "epoch": 1.11671469740634, "grad_norm": 2.466688632965088, "learning_rate": 3.954330115139328e-08, "logits/chosen": -1.5432218313217163, "logits/rejected": -1.5328999757766724, "logps/chosen": -46.747894287109375, "logps/rejected": -48.846595764160156, "loss": 0.6898, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0048002442345023155, "rewards/margins": 0.006832278333604336, "rewards/rejected": -0.0020320338662713766, "step": 1550 }, { "epoch": 1.1239193083573487, "grad_norm": 4.091006278991699, "learning_rate": 3.937230157740931e-08, "logits/chosen": -1.591812252998352, "logits/rejected": -1.5731406211853027, "logps/chosen": -47.905906677246094, "logps/rejected": -51.46516799926758, "loss": 0.6893, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0062894513830542564, "rewards/margins": 0.007767542265355587, "rewards/rejected": -0.001478090649470687, "step": 1560 }, { "epoch": 1.1311239193083573, "grad_norm": 2.356121778488159, "learning_rate": 3.920029168997077e-08, "logits/chosen": -1.5560190677642822, "logits/rejected": -1.5427652597427368, "logps/chosen": -48.724090576171875, "logps/rejected": -51.52899932861328, "loss": 0.6903, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.004863058216869831, "rewards/margins": 0.005728754214942455, "rewards/rejected": -0.000865696172695607, "step": 1570 }, { "epoch": 1.138328530259366, "grad_norm": 3.9722375869750977, "learning_rate": 3.9027283580662476e-08, "logits/chosen": -1.5196526050567627, "logits/rejected": -1.5073282718658447, "logps/chosen": -49.61786651611328, "logps/rejected": -52.77516555786133, "loss": 0.6888, "rewards/accuracies": 0.65625, "rewards/chosen": 0.005080161150544882, "rewards/margins": 0.008876914158463478, "rewards/rejected": -0.003796751843765378, "step": 1580 }, { "epoch": 1.1455331412103746, "grad_norm": 3.991994619369507, "learning_rate": 3.885328941124014e-08, "logits/chosen": -1.5015205144882202, "logits/rejected": -1.488493800163269, "logps/chosen": -45.93663787841797, "logps/rejected": -50.631431579589844, "loss": 0.6896, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.005211810581386089, "rewards/margins": 0.0071194409392774105, "rewards/rejected": -0.0019076305907219648, "step": 1590 }, { "epoch": 1.1527377521613833, "grad_norm": 3.034956216812134, "learning_rate": 3.867832141277539e-08, "logits/chosen": -1.5487406253814697, "logits/rejected": -1.5294761657714844, "logps/chosen": -49.104705810546875, "logps/rejected": -51.280723571777344, "loss": 0.69, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004329115618020296, "rewards/margins": 0.006378514226526022, "rewards/rejected": -0.002049399074167013, "step": 1600 }, { "epoch": 1.159942363112392, "grad_norm": 3.430689573287964, "learning_rate": 3.850239188479606e-08, "logits/chosen": -1.462114930152893, "logits/rejected": -1.4583441019058228, "logps/chosen": -46.75814437866211, "logps/rejected": -49.108917236328125, "loss": 0.6901, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.004666672088205814, "rewards/margins": 0.006269653793424368, "rewards/rejected": -0.0016029814723879099, "step": 1610 }, { "epoch": 1.1671469740634006, "grad_norm": 3.5557363033294678, "learning_rate": 3.832551319442151e-08, "logits/chosen": -1.5857681035995483, "logits/rejected": -1.5824248790740967, "logps/chosen": -49.61214065551758, "logps/rejected": -53.804893493652344, "loss": 0.69, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.005068537779152393, "rewards/margins": 0.006406673230230808, "rewards/rejected": -0.0013381352182477713, "step": 1620 }, { "epoch": 1.1743515850144093, "grad_norm": 4.248888969421387, "learning_rate": 3.81476977754933e-08, "logits/chosen": -1.399951696395874, "logits/rejected": -1.3853706121444702, "logps/chosen": -51.313499450683594, "logps/rejected": -50.658668518066406, "loss": 0.6902, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004184350371360779, "rewards/margins": 0.0060124825686216354, "rewards/rejected": -0.0018281325465068221, "step": 1630 }, { "epoch": 1.181556195965418, "grad_norm": 2.400538206100464, "learning_rate": 3.796895812770114e-08, "logits/chosen": -1.5025560855865479, "logits/rejected": -1.4927390813827515, "logps/chosen": -45.765350341796875, "logps/rejected": -47.244354248046875, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": 0.006023012101650238, "rewards/margins": 0.0076335109770298, "rewards/rejected": -0.0016104992246255279, "step": 1640 }, { "epoch": 1.1887608069164266, "grad_norm": 3.0615625381469727, "learning_rate": 3.7789306815704216e-08, "logits/chosen": -1.527266025543213, "logits/rejected": -1.516747236251831, "logps/chosen": -40.9234733581543, "logps/rejected": -42.032447814941406, "loss": 0.6907, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0036138053983449936, "rewards/margins": 0.004900342784821987, "rewards/rejected": -0.00128653715364635, "step": 1650 }, { "epoch": 1.195965417867435, "grad_norm": 2.6502482891082764, "learning_rate": 3.760875646824795e-08, "logits/chosen": -1.3961491584777832, "logits/rejected": -1.397456407546997, "logps/chosen": -46.144996643066406, "logps/rejected": -48.4418830871582, "loss": 0.6897, "rewards/accuracies": 0.625, "rewards/chosen": 0.003492361633107066, "rewards/margins": 0.007100371178239584, "rewards/rejected": -0.003608010010793805, "step": 1660 }, { "epoch": 1.2031700288184437, "grad_norm": 3.6749885082244873, "learning_rate": 3.742731977727623e-08, "logits/chosen": -1.5396411418914795, "logits/rejected": -1.5324022769927979, "logps/chosen": -45.20623016357422, "logps/rejected": -49.124549865722656, "loss": 0.6895, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.006122785620391369, "rewards/margins": 0.007418873719871044, "rewards/rejected": -0.0012960887979716063, "step": 1670 }, { "epoch": 1.2103746397694524, "grad_norm": 3.708371877670288, "learning_rate": 3.7245009497039244e-08, "logits/chosen": -1.4357057809829712, "logits/rejected": -1.420304298400879, "logps/chosen": -45.42781066894531, "logps/rejected": -49.5192756652832, "loss": 0.6891, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0038997051306068897, "rewards/margins": 0.008227763697504997, "rewards/rejected": -0.0043280585668981075, "step": 1680 }, { "epoch": 1.217579250720461, "grad_norm": 2.6539306640625, "learning_rate": 3.7061838443196886e-08, "logits/chosen": -1.511749029159546, "logits/rejected": -1.502352237701416, "logps/chosen": -50.04130172729492, "logps/rejected": -52.17155075073242, "loss": 0.6885, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.006058938335627317, "rewards/margins": 0.00949786975979805, "rewards/rejected": -0.003438931657001376, "step": 1690 }, { "epoch": 1.2247838616714697, "grad_norm": 3.0108015537261963, "learning_rate": 3.68778194919179e-08, "logits/chosen": -1.472414255142212, "logits/rejected": -1.4666353464126587, "logps/chosen": -50.08484649658203, "logps/rejected": -53.3315315246582, "loss": 0.6878, "rewards/accuracies": 0.6875, "rewards/chosen": 0.008036890998482704, "rewards/margins": 0.010940475389361382, "rewards/rejected": -0.0029035855550318956, "step": 1700 }, { "epoch": 1.2319884726224783, "grad_norm": 3.6009788513183594, "learning_rate": 3.66929655789747e-08, "logits/chosen": -1.5710514783859253, "logits/rejected": -1.5530909299850464, "logps/chosen": -41.97422790527344, "logps/rejected": -46.53120040893555, "loss": 0.689, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.005440887995064259, "rewards/margins": 0.008487561717629433, "rewards/rejected": -0.0030466741882264614, "step": 1710 }, { "epoch": 1.239193083573487, "grad_norm": 2.323777437210083, "learning_rate": 3.6507289698834064e-08, "logits/chosen": -1.4719207286834717, "logits/rejected": -1.455365538597107, "logps/chosen": -43.55077362060547, "logps/rejected": -46.05921173095703, "loss": 0.6891, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.005178916268050671, "rewards/margins": 0.008271681144833565, "rewards/rejected": -0.003092765109613538, "step": 1720 }, { "epoch": 1.2463976945244957, "grad_norm": 4.027398109436035, "learning_rate": 3.6320804903743684e-08, "logits/chosen": -1.5161298513412476, "logits/rejected": -1.5111939907073975, "logps/chosen": -45.40093231201172, "logps/rejected": -49.17858123779297, "loss": 0.6887, "rewards/accuracies": 0.65625, "rewards/chosen": 0.003557295771315694, "rewards/margins": 0.009094839915633202, "rewards/rejected": -0.005537544842809439, "step": 1730 }, { "epoch": 1.2536023054755043, "grad_norm": 2.655517816543579, "learning_rate": 3.61335243028146e-08, "logits/chosen": -1.4958670139312744, "logits/rejected": -1.489793062210083, "logps/chosen": -48.872459411621094, "logps/rejected": -51.61071014404297, "loss": 0.6891, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004584385547786951, "rewards/margins": 0.008167969062924385, "rewards/rejected": -0.003583582118153572, "step": 1740 }, { "epoch": 1.260806916426513, "grad_norm": 3.2628414630889893, "learning_rate": 3.5945461061099736e-08, "logits/chosen": -1.4390538930892944, "logits/rejected": -1.4079252481460571, "logps/chosen": -50.76530075073242, "logps/rejected": -49.67983627319336, "loss": 0.6872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006600284017622471, "rewards/margins": 0.012185483239591122, "rewards/rejected": -0.005585200153291225, "step": 1750 }, { "epoch": 1.2680115273775217, "grad_norm": 2.9704713821411133, "learning_rate": 3.5756628398668446e-08, "logits/chosen": -1.5589011907577515, "logits/rejected": -1.5578845739364624, "logps/chosen": -51.247100830078125, "logps/rejected": -53.67809295654297, "loss": 0.6887, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0037506825756281614, "rewards/margins": 0.009181154891848564, "rewards/rejected": -0.005430473946034908, "step": 1760 }, { "epoch": 1.2752161383285303, "grad_norm": 2.6625916957855225, "learning_rate": 3.556703958967716e-08, "logits/chosen": -1.5575711727142334, "logits/rejected": -1.5438311100006104, "logps/chosen": -44.35518264770508, "logps/rejected": -47.987945556640625, "loss": 0.6896, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.003589153289794922, "rewards/margins": 0.007138341665267944, "rewards/rejected": -0.0035491890739649534, "step": 1770 }, { "epoch": 1.282420749279539, "grad_norm": 4.012426853179932, "learning_rate": 3.5376707961436297e-08, "logits/chosen": -1.5312955379486084, "logits/rejected": -1.5150017738342285, "logps/chosen": -53.351661682128906, "logps/rejected": -53.443603515625, "loss": 0.6907, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.004278816748410463, "rewards/margins": 0.004905478097498417, "rewards/rejected": -0.0006266612326726317, "step": 1780 }, { "epoch": 1.2896253602305476, "grad_norm": 2.424056053161621, "learning_rate": 3.51856468934734e-08, "logits/chosen": -1.4922749996185303, "logits/rejected": -1.4954168796539307, "logps/chosen": -46.36159896850586, "logps/rejected": -48.64609146118164, "loss": 0.6913, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.004488944076001644, "rewards/margins": 0.0038774623535573483, "rewards/rejected": 0.0006114812567830086, "step": 1790 }, { "epoch": 1.2968299711815563, "grad_norm": 3.357475519180298, "learning_rate": 3.499386981659262e-08, "logits/chosen": -1.5789316892623901, "logits/rejected": -1.5703160762786865, "logps/chosen": -45.465980529785156, "logps/rejected": -51.676979064941406, "loss": 0.689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006388810463249683, "rewards/margins": 0.00840458832681179, "rewards/rejected": -0.0020157776307314634, "step": 1800 }, { "epoch": 1.304034582132565, "grad_norm": 2.5491249561309814, "learning_rate": 3.480139021193057e-08, "logits/chosen": -1.4629701375961304, "logits/rejected": -1.4627655744552612, "logps/chosen": -46.493797302246094, "logps/rejected": -49.93988800048828, "loss": 0.6896, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.003896316047757864, "rewards/margins": 0.007192063145339489, "rewards/rejected": -0.003295747097581625, "step": 1810 }, { "epoch": 1.3112391930835736, "grad_norm": 4.076901912689209, "learning_rate": 3.4608221610008666e-08, "logits/chosen": -1.5544006824493408, "logits/rejected": -1.5443475246429443, "logps/chosen": -40.67546081542969, "logps/rejected": -45.340274810791016, "loss": 0.6877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0057144188322126865, "rewards/margins": 0.011151134967803955, "rewards/rejected": -0.005436715669929981, "step": 1820 }, { "epoch": 1.318443804034582, "grad_norm": 2.311035394668579, "learning_rate": 3.4414377589782e-08, "logits/chosen": -1.4895527362823486, "logits/rejected": -1.4889047145843506, "logps/chosen": -44.287559509277344, "logps/rejected": -46.684104919433594, "loss": 0.6891, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.003038702066987753, "rewards/margins": 0.00834103673696518, "rewards/rejected": -0.005302335135638714, "step": 1830 }, { "epoch": 1.3256484149855907, "grad_norm": 2.245774745941162, "learning_rate": 3.4219871777684745e-08, "logits/chosen": -1.5047295093536377, "logits/rejected": -1.4803383350372314, "logps/chosen": -48.24694061279297, "logps/rejected": -49.6685791015625, "loss": 0.6888, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.004406126216053963, "rewards/margins": 0.008816715329885483, "rewards/rejected": -0.00441058911383152, "step": 1840 }, { "epoch": 1.3328530259365994, "grad_norm": 3.1291747093200684, "learning_rate": 3.4024717846672364e-08, "logits/chosen": -1.5541332960128784, "logits/rejected": -1.540783166885376, "logps/chosen": -43.84444808959961, "logps/rejected": -47.0974235534668, "loss": 0.6885, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.003276639385148883, "rewards/margins": 0.009581932798027992, "rewards/rejected": -0.006305294577032328, "step": 1850 }, { "epoch": 1.340057636887608, "grad_norm": 3.2411158084869385, "learning_rate": 3.382892951526036e-08, "logits/chosen": -1.5086630582809448, "logits/rejected": -1.498652696609497, "logps/chosen": -48.55314636230469, "logps/rejected": -53.514312744140625, "loss": 0.688, "rewards/accuracies": 0.625, "rewards/chosen": 0.004771741572767496, "rewards/margins": 0.010389812290668488, "rewards/rejected": -0.005618072114884853, "step": 1860 }, { "epoch": 1.3472622478386167, "grad_norm": 3.0738956928253174, "learning_rate": 3.3632520546559974e-08, "logits/chosen": -1.477526307106018, "logits/rejected": -1.4502986669540405, "logps/chosen": -42.176910400390625, "logps/rejected": -46.28841781616211, "loss": 0.6882, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.005257411859929562, "rewards/margins": 0.009985310956835747, "rewards/rejected": -0.004727899096906185, "step": 1870 }, { "epoch": 1.3544668587896254, "grad_norm": 3.4176902770996094, "learning_rate": 3.34355047473107e-08, "logits/chosen": -1.509404182434082, "logits/rejected": -1.4928141832351685, "logps/chosen": -49.149658203125, "logps/rejected": -50.33115768432617, "loss": 0.6892, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0034352585207670927, "rewards/margins": 0.008046688511967659, "rewards/rejected": -0.004611429758369923, "step": 1880 }, { "epoch": 1.361671469740634, "grad_norm": 3.2581231594085693, "learning_rate": 3.323789596690971e-08, "logits/chosen": -1.4438790082931519, "logits/rejected": -1.4406137466430664, "logps/chosen": -46.02549362182617, "logps/rejected": -50.36063766479492, "loss": 0.6886, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004354453645646572, "rewards/margins": 0.009347590617835522, "rewards/rejected": -0.004993138834834099, "step": 1890 }, { "epoch": 1.3688760806916427, "grad_norm": 2.03442645072937, "learning_rate": 3.303970809643828e-08, "logits/chosen": -1.5256521701812744, "logits/rejected": -1.5279293060302734, "logps/chosen": -45.331443786621094, "logps/rejected": -49.03801727294922, "loss": 0.6891, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00544341467320919, "rewards/margins": 0.00833254773169756, "rewards/rejected": -0.0028891332913190126, "step": 1900 }, { "epoch": 1.3760806916426513, "grad_norm": 3.0210492610931396, "learning_rate": 3.2840955067685356e-08, "logits/chosen": -1.5634491443634033, "logits/rejected": -1.5632786750793457, "logps/chosen": -46.0203742980957, "logps/rejected": -50.457698822021484, "loss": 0.6876, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005104956682771444, "rewards/margins": 0.011318376287817955, "rewards/rejected": -0.006213418673723936, "step": 1910 }, { "epoch": 1.38328530259366, "grad_norm": 2.749630928039551, "learning_rate": 3.264165085216817e-08, "logits/chosen": -1.5801355838775635, "logits/rejected": -1.5726850032806396, "logps/chosen": -38.53851318359375, "logps/rejected": -43.82078170776367, "loss": 0.6888, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.004419215954840183, "rewards/margins": 0.008898451924324036, "rewards/rejected": -0.004479236900806427, "step": 1920 }, { "epoch": 1.3904899135446687, "grad_norm": 4.1981916427612305, "learning_rate": 3.244180946015008e-08, "logits/chosen": -1.444226861000061, "logits/rejected": -1.4355494976043701, "logps/chosen": -52.143287658691406, "logps/rejected": -53.8681755065918, "loss": 0.6898, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.004708580207079649, "rewards/margins": 0.006951368413865566, "rewards/rejected": -0.0022427875082939863, "step": 1930 }, { "epoch": 1.397694524495677, "grad_norm": 2.518497943878174, "learning_rate": 3.224144493965578e-08, "logits/chosen": -1.5799140930175781, "logits/rejected": -1.5780613422393799, "logps/chosen": -43.635032653808594, "logps/rejected": -45.74082565307617, "loss": 0.6892, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.00375853362493217, "rewards/margins": 0.008155545219779015, "rewards/rejected": -0.004397011827677488, "step": 1940 }, { "epoch": 1.4048991354466858, "grad_norm": 2.8173162937164307, "learning_rate": 3.204057137548371e-08, "logits/chosen": -1.5313746929168701, "logits/rejected": -1.5233030319213867, "logps/chosen": -43.7186164855957, "logps/rejected": -47.32202911376953, "loss": 0.6878, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0038849213160574436, "rewards/margins": 0.011041805148124695, "rewards/rejected": -0.007156885229051113, "step": 1950 }, { "epoch": 1.4121037463976944, "grad_norm": 3.7198755741119385, "learning_rate": 3.183920288821597e-08, "logits/chosen": -1.4899612665176392, "logits/rejected": -1.4815582036972046, "logps/chosen": -45.28757095336914, "logps/rejected": -49.970062255859375, "loss": 0.6872, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004886112175881863, "rewards/margins": 0.012144430540502071, "rewards/rejected": -0.007258318364620209, "step": 1960 }, { "epoch": 1.419308357348703, "grad_norm": 3.8075971603393555, "learning_rate": 3.1637353633225735e-08, "logits/chosen": -1.5398638248443604, "logits/rejected": -1.529147744178772, "logps/chosen": -41.222007751464844, "logps/rejected": -45.725135803222656, "loss": 0.6868, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.003631623461842537, "rewards/margins": 0.013042435050010681, "rewards/rejected": -0.009410811588168144, "step": 1970 }, { "epoch": 1.4265129682997117, "grad_norm": 3.284583330154419, "learning_rate": 3.143503779968213e-08, "logits/chosen": -1.5069071054458618, "logits/rejected": -1.5070630311965942, "logps/chosen": -45.44756317138672, "logps/rejected": -49.775611877441406, "loss": 0.6891, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0011150476057082415, "rewards/margins": 0.008462509140372276, "rewards/rejected": -0.00734746316447854, "step": 1980 }, { "epoch": 1.4337175792507204, "grad_norm": 3.3477249145507812, "learning_rate": 3.1232269609552875e-08, "logits/chosen": -1.518206000328064, "logits/rejected": -1.507889986038208, "logps/chosen": -43.67546463012695, "logps/rejected": -46.17768096923828, "loss": 0.6887, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0041655306704342365, "rewards/margins": 0.009150232188403606, "rewards/rejected": -0.004984701983630657, "step": 1990 }, { "epoch": 1.440922190201729, "grad_norm": 2.176100969314575, "learning_rate": 3.102906331660444e-08, "logits/chosen": -1.5566879510879517, "logits/rejected": -1.5430500507354736, "logps/chosen": -41.93559646606445, "logps/rejected": -48.24177932739258, "loss": 0.6867, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.005309706088155508, "rewards/margins": 0.013114815577864647, "rewards/rejected": -0.007805109955370426, "step": 2000 }, { "epoch": 1.4481268011527377, "grad_norm": 3.1868534088134766, "learning_rate": 3.082543320540015e-08, "logits/chosen": -1.469954252243042, "logits/rejected": -1.4548522233963013, "logps/chosen": -43.87824249267578, "logps/rejected": -47.534263610839844, "loss": 0.6883, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00238056224770844, "rewards/margins": 0.009992104955017567, "rewards/rejected": -0.007611544337123632, "step": 2010 }, { "epoch": 1.4553314121037464, "grad_norm": 4.190840721130371, "learning_rate": 3.062139359029599e-08, "logits/chosen": -1.5576965808868408, "logits/rejected": -1.5537203550338745, "logps/chosen": -46.4661979675293, "logps/rejected": -48.89842224121094, "loss": 0.6883, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0034370776265859604, "rewards/margins": 0.010012554004788399, "rewards/rejected": -0.006575475446879864, "step": 2020 }, { "epoch": 1.462536023054755, "grad_norm": 3.4175453186035156, "learning_rate": 3.041695881443437e-08, "logits/chosen": -1.5763541460037231, "logits/rejected": -1.5677953958511353, "logps/chosen": -46.359901428222656, "logps/rejected": -50.328311920166016, "loss": 0.6901, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.003387346165254712, "rewards/margins": 0.006299160420894623, "rewards/rejected": -0.0029118142556399107, "step": 2030 }, { "epoch": 1.4697406340057637, "grad_norm": 4.04388427734375, "learning_rate": 3.0212143248735886e-08, "logits/chosen": -1.5315358638763428, "logits/rejected": -1.5291774272918701, "logps/chosen": -49.797996520996094, "logps/rejected": -54.427490234375, "loss": 0.6878, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.003973301034420729, "rewards/margins": 0.011022644117474556, "rewards/rejected": -0.007049343083053827, "step": 2040 }, { "epoch": 1.4769452449567724, "grad_norm": 3.1518523693084717, "learning_rate": 3.0006961290889077e-08, "logits/chosen": -1.5215927362442017, "logits/rejected": -1.496594786643982, "logps/chosen": -50.674774169921875, "logps/rejected": -53.18156051635742, "loss": 0.6876, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004915239289402962, "rewards/margins": 0.011425209231674671, "rewards/rejected": -0.0065099699422717094, "step": 2050 }, { "epoch": 1.484149855907781, "grad_norm": 2.6612987518310547, "learning_rate": 2.980142736433833e-08, "logits/chosen": -1.547136902809143, "logits/rejected": -1.5229747295379639, "logps/chosen": -44.282020568847656, "logps/rejected": -44.51871871948242, "loss": 0.6885, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0016722548753023148, "rewards/margins": 0.009514597244560719, "rewards/rejected": -0.00784234143793583, "step": 2060 }, { "epoch": 1.4913544668587897, "grad_norm": 4.097959518432617, "learning_rate": 2.9595555917269997e-08, "logits/chosen": -1.5567786693572998, "logits/rejected": -1.529726266860962, "logps/chosen": -51.577598571777344, "logps/rejected": -53.3016471862793, "loss": 0.6884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0018792494665831327, "rewards/margins": 0.009681441821157932, "rewards/rejected": -0.0078021930530667305, "step": 2070 }, { "epoch": 1.4985590778097984, "grad_norm": 3.1386377811431885, "learning_rate": 2.9389361421596725e-08, "logits/chosen": -1.435152292251587, "logits/rejected": -1.4320685863494873, "logps/chosen": -49.169029235839844, "logps/rejected": -53.54350662231445, "loss": 0.687, "rewards/accuracies": 0.625, "rewards/chosen": 0.004540695808827877, "rewards/margins": 0.012549139559268951, "rewards/rejected": -0.008008443750441074, "step": 2080 }, { "epoch": 1.505763688760807, "grad_norm": 2.573927640914917, "learning_rate": 2.9182858371940126e-08, "logits/chosen": -1.5321205854415894, "logits/rejected": -1.5178864002227783, "logps/chosen": -42.592002868652344, "logps/rejected": -46.14876174926758, "loss": 0.6866, "rewards/accuracies": 0.625, "rewards/chosen": 0.003068184945732355, "rewards/margins": 0.013397050090134144, "rewards/rejected": -0.01032886654138565, "step": 2090 }, { "epoch": 1.5129682997118157, "grad_norm": 3.6754095554351807, "learning_rate": 2.8976061284611908e-08, "logits/chosen": -1.4699745178222656, "logits/rejected": -1.4811229705810547, "logps/chosen": -41.61306381225586, "logps/rejected": -45.29481506347656, "loss": 0.6877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004881677217781544, "rewards/margins": 0.0111152408644557, "rewards/rejected": -0.006233563646674156, "step": 2100 }, { "epoch": 1.5201729106628243, "grad_norm": 3.2631003856658936, "learning_rate": 2.8768984696593384e-08, "logits/chosen": -1.4801521301269531, "logits/rejected": -1.4636070728302002, "logps/chosen": -44.76102828979492, "logps/rejected": -47.91448211669922, "loss": 0.6868, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005100742448121309, "rewards/margins": 0.012960699386894703, "rewards/rejected": -0.007859956473112106, "step": 2110 }, { "epoch": 1.527377521613833, "grad_norm": 3.2338225841522217, "learning_rate": 2.8561643164513637e-08, "logits/chosen": -1.3346589803695679, "logits/rejected": -1.318894386291504, "logps/chosen": -51.97467803955078, "logps/rejected": -54.27916717529297, "loss": 0.6889, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.003795693162828684, "rewards/margins": 0.008763305842876434, "rewards/rejected": -0.0049676112830638885, "step": 2120 }, { "epoch": 1.5345821325648417, "grad_norm": 3.310194253921509, "learning_rate": 2.8354051263626227e-08, "logits/chosen": -1.4670060873031616, "logits/rejected": -1.4673092365264893, "logps/chosen": -50.2790641784668, "logps/rejected": -52.83305740356445, "loss": 0.6885, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0032279961742460728, "rewards/margins": 0.00951780378818512, "rewards/rejected": -0.0062898085452616215, "step": 2130 }, { "epoch": 1.54178674351585, "grad_norm": 4.669538497924805, "learning_rate": 2.8146223586784573e-08, "logits/chosen": -1.4577587842941284, "logits/rejected": -1.4446860551834106, "logps/chosen": -52.09587860107422, "logps/rejected": -54.9838981628418, "loss": 0.6872, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.003598356619477272, "rewards/margins": 0.012193666771054268, "rewards/rejected": -0.008595308288931847, "step": 2140 }, { "epoch": 1.5489913544668588, "grad_norm": 3.397944450378418, "learning_rate": 2.7938174743416205e-08, "logits/chosen": -1.362642765045166, "logits/rejected": -1.3556791543960571, "logps/chosen": -51.507118225097656, "logps/rejected": -55.4406852722168, "loss": 0.6877, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0027351544704288244, "rewards/margins": 0.011117152869701385, "rewards/rejected": -0.008381998166441917, "step": 2150 }, { "epoch": 1.5561959654178674, "grad_norm": 3.0584213733673096, "learning_rate": 2.7729919358495728e-08, "logits/chosen": -1.5042486190795898, "logits/rejected": -1.4946848154067993, "logps/chosen": -52.332984924316406, "logps/rejected": -53.52936935424805, "loss": 0.687, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0036446847952902317, "rewards/margins": 0.012581204064190388, "rewards/rejected": -0.008936519734561443, "step": 2160 }, { "epoch": 1.563400576368876, "grad_norm": 3.8265929222106934, "learning_rate": 2.7521472071516772e-08, "logits/chosen": -1.4729335308074951, "logits/rejected": -1.4664169549942017, "logps/chosen": -43.67477035522461, "logps/rejected": -47.45640182495117, "loss": 0.6882, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005746934562921524, "rewards/margins": 0.010086534544825554, "rewards/rejected": -0.0043396009132266045, "step": 2170 }, { "epoch": 1.5706051873198847, "grad_norm": 3.8936140537261963, "learning_rate": 2.731284753546289e-08, "logits/chosen": -1.4814698696136475, "logits/rejected": -1.4747785329818726, "logps/chosen": -53.028472900390625, "logps/rejected": -56.810150146484375, "loss": 0.6893, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00028875062707811594, "rewards/margins": 0.007881390862166882, "rewards/rejected": -0.008170142769813538, "step": 2180 }, { "epoch": 1.5778097982708934, "grad_norm": 4.052019119262695, "learning_rate": 2.710406041577751e-08, "logits/chosen": -1.5514274835586548, "logits/rejected": -1.5482257604599, "logps/chosen": -47.992862701416016, "logps/rejected": -53.79439163208008, "loss": 0.6881, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.004318216349929571, "rewards/margins": 0.010280657559633255, "rewards/rejected": -0.005962441675364971, "step": 2190 }, { "epoch": 1.585014409221902, "grad_norm": 3.321918487548828, "learning_rate": 2.6895125389333017e-08, "logits/chosen": -1.537071943283081, "logits/rejected": -1.5222880840301514, "logps/chosen": -48.4653434753418, "logps/rejected": -52.60175704956055, "loss": 0.6853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006970447953790426, "rewards/margins": 0.015999775379896164, "rewards/rejected": -0.009029326029121876, "step": 2200 }, { "epoch": 1.5922190201729105, "grad_norm": 3.160768508911133, "learning_rate": 2.6686057143399028e-08, "logits/chosen": -1.5062367916107178, "logits/rejected": -1.4982922077178955, "logps/chosen": -48.509521484375, "logps/rejected": -50.001853942871094, "loss": 0.6882, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.004941598977893591, "rewards/margins": 0.010229108855128288, "rewards/rejected": -0.00528750941157341, "step": 2210 }, { "epoch": 1.5994236311239192, "grad_norm": 3.687347888946533, "learning_rate": 2.647687037460996e-08, "logits/chosen": -1.4847967624664307, "logits/rejected": -1.4772911071777344, "logps/chosen": -52.836029052734375, "logps/rejected": -58.44087600708008, "loss": 0.6861, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.006661810912191868, "rewards/margins": 0.014402936212718487, "rewards/rejected": -0.007741124834865332, "step": 2220 }, { "epoch": 1.6066282420749278, "grad_norm": 3.245283365249634, "learning_rate": 2.626757978793187e-08, "logits/chosen": -1.5062129497528076, "logits/rejected": -1.4990017414093018, "logps/chosen": -48.865169525146484, "logps/rejected": -52.4692497253418, "loss": 0.6888, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00014934074715711176, "rewards/margins": 0.008889252319931984, "rewards/rejected": -0.009038591757416725, "step": 2230 }, { "epoch": 1.6138328530259365, "grad_norm": 2.9634509086608887, "learning_rate": 2.6058200095628797e-08, "logits/chosen": -1.5059670209884644, "logits/rejected": -1.5062869787216187, "logps/chosen": -40.85955047607422, "logps/rejected": -46.7325325012207, "loss": 0.685, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004564112052321434, "rewards/margins": 0.016638968139886856, "rewards/rejected": -0.012074857018887997, "step": 2240 }, { "epoch": 1.6210374639769451, "grad_norm": 3.270120143890381, "learning_rate": 2.584874601622854e-08, "logits/chosen": -1.566474199295044, "logits/rejected": -1.5492124557495117, "logps/chosen": -49.3926887512207, "logps/rejected": -53.256019592285156, "loss": 0.6888, "rewards/accuracies": 0.59375, "rewards/chosen": 0.002326581161469221, "rewards/margins": 0.008950329385697842, "rewards/rejected": -0.006623747758567333, "step": 2250 }, { "epoch": 1.6282420749279538, "grad_norm": 3.055609941482544, "learning_rate": 2.5639232273487993e-08, "logits/chosen": -1.4605876207351685, "logits/rejected": -1.4407970905303955, "logps/chosen": -44.329551696777344, "logps/rejected": -47.6479377746582, "loss": 0.6878, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.004232374019920826, "rewards/margins": 0.011020188219845295, "rewards/rejected": -0.006787814199924469, "step": 2260 }, { "epoch": 1.6354466858789625, "grad_norm": 3.6103997230529785, "learning_rate": 2.5429673595358142e-08, "logits/chosen": -1.5238250494003296, "logits/rejected": -1.5089406967163086, "logps/chosen": -45.79985809326172, "logps/rejected": -48.57173156738281, "loss": 0.6878, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0016669733449816704, "rewards/margins": 0.011048417538404465, "rewards/rejected": -0.009381445124745369, "step": 2270 }, { "epoch": 1.6426512968299711, "grad_norm": 3.2970402240753174, "learning_rate": 2.5220084712948764e-08, "logits/chosen": -1.4582303762435913, "logits/rejected": -1.4462206363677979, "logps/chosen": -52.10322952270508, "logps/rejected": -55.221832275390625, "loss": 0.6898, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0028122267685830593, "rewards/margins": 0.006987334694713354, "rewards/rejected": -0.004175108857452869, "step": 2280 }, { "epoch": 1.6498559077809798, "grad_norm": 3.7503063678741455, "learning_rate": 2.5010480359492838e-08, "logits/chosen": -1.4644200801849365, "logits/rejected": -1.4521420001983643, "logps/chosen": -49.4357795715332, "logps/rejected": -49.438568115234375, "loss": 0.686, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0039522526785731316, "rewards/margins": 0.014734050258994102, "rewards/rejected": -0.010781797580420971, "step": 2290 }, { "epoch": 1.6570605187319885, "grad_norm": 2.9756903648376465, "learning_rate": 2.480087526931091e-08, "logits/chosen": -1.5041756629943848, "logits/rejected": -1.4846851825714111, "logps/chosen": -43.37569046020508, "logps/rejected": -45.38811111450195, "loss": 0.6863, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.003738057566806674, "rewards/margins": 0.01406602282077074, "rewards/rejected": -0.010327964089810848, "step": 2300 }, { "epoch": 1.6642651296829971, "grad_norm": 3.4368813037872314, "learning_rate": 2.4591284176775326e-08, "logits/chosen": -1.44768226146698, "logits/rejected": -1.4357998371124268, "logps/chosen": -55.06958770751953, "logps/rejected": -56.48256301879883, "loss": 0.689, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0038170614279806614, "rewards/margins": 0.008528480306267738, "rewards/rejected": -0.00471141841262579, "step": 2310 }, { "epoch": 1.6714697406340058, "grad_norm": 2.8778862953186035, "learning_rate": 2.4381721815274443e-08, "logits/chosen": -1.5203001499176025, "logits/rejected": -1.5137100219726562, "logps/chosen": -43.21926498413086, "logps/rejected": -46.39662551879883, "loss": 0.687, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0016496287425979972, "rewards/margins": 0.012687856331467628, "rewards/rejected": -0.011038227006793022, "step": 2320 }, { "epoch": 1.6786743515850144, "grad_norm": 3.1286635398864746, "learning_rate": 2.4172202916176936e-08, "logits/chosen": -1.5638360977172852, "logits/rejected": -1.5550332069396973, "logps/chosen": -43.0087776184082, "logps/rejected": -47.81661605834961, "loss": 0.6861, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0009016372496262193, "rewards/margins": 0.014661784283816814, "rewards/rejected": -0.01376014668494463, "step": 2330 }, { "epoch": 1.685878962536023, "grad_norm": 3.64982271194458, "learning_rate": 2.3962742207796268e-08, "logits/chosen": -1.4485204219818115, "logits/rejected": -1.4384351968765259, "logps/chosen": -41.6459846496582, "logps/rejected": -45.627601623535156, "loss": 0.6849, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004887954797595739, "rewards/margins": 0.016865257173776627, "rewards/rejected": -0.01197730004787445, "step": 2340 }, { "epoch": 1.6930835734870318, "grad_norm": 3.615983247756958, "learning_rate": 2.3753354414355334e-08, "logits/chosen": -1.4232791662216187, "logits/rejected": -1.400564432144165, "logps/chosen": -53.550025939941406, "logps/rejected": -55.16486358642578, "loss": 0.6875, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0003485208726488054, "rewards/margins": 0.011735951527953148, "rewards/rejected": -0.01138742920011282, "step": 2350 }, { "epoch": 1.7002881844380404, "grad_norm": 3.3526298999786377, "learning_rate": 2.3544054254951408e-08, "logits/chosen": -1.4648194313049316, "logits/rejected": -1.4448609352111816, "logps/chosen": -42.864173889160156, "logps/rejected": -48.36962127685547, "loss": 0.6845, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004195074550807476, "rewards/margins": 0.01782260462641716, "rewards/rejected": -0.01362752728164196, "step": 2360 }, { "epoch": 1.707492795389049, "grad_norm": 3.3519835472106934, "learning_rate": 2.3334856442521435e-08, "logits/chosen": -1.5586761236190796, "logits/rejected": -1.5402878522872925, "logps/chosen": -51.24225616455078, "logps/rejected": -51.37165451049805, "loss": 0.688, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0026316409930586815, "rewards/margins": 0.01046125777065754, "rewards/rejected": -0.007829615846276283, "step": 2370 }, { "epoch": 1.7146974063400577, "grad_norm": 3.329519033432007, "learning_rate": 2.3125775682807826e-08, "logits/chosen": -1.5537148714065552, "logits/rejected": -1.5523698329925537, "logps/chosen": -49.91655349731445, "logps/rejected": -53.717132568359375, "loss": 0.6862, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0029881394002586603, "rewards/margins": 0.014401605352759361, "rewards/rejected": -0.01141346711665392, "step": 2380 }, { "epoch": 1.7219020172910664, "grad_norm": 2.7364284992218018, "learning_rate": 2.291682667332464e-08, "logits/chosen": -1.6134541034698486, "logits/rejected": -1.5998890399932861, "logps/chosen": -46.44415283203125, "logps/rejected": -49.58192825317383, "loss": 0.6887, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0009153697756119072, "rewards/margins": 0.009135574102401733, "rewards/rejected": -0.008220205083489418, "step": 2390 }, { "epoch": 1.729106628242075, "grad_norm": 2.911412477493286, "learning_rate": 2.2708024102324454e-08, "logits/chosen": -1.5305463075637817, "logits/rejected": -1.5252015590667725, "logps/chosen": -46.698795318603516, "logps/rejected": -51.687110900878906, "loss": 0.6853, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0034352089278399944, "rewards/margins": 0.016118764877319336, "rewards/rejected": -0.012683555483818054, "step": 2400 }, { "epoch": 1.7363112391930837, "grad_norm": 3.7019219398498535, "learning_rate": 2.2499382647765797e-08, "logits/chosen": -1.4968922138214111, "logits/rejected": -1.4982818365097046, "logps/chosen": -48.41173553466797, "logps/rejected": -51.977378845214844, "loss": 0.6876, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 8.274018910015002e-05, "rewards/margins": 0.011399459093809128, "rewards/rejected": -0.01131671853363514, "step": 2410 }, { "epoch": 1.7435158501440924, "grad_norm": 2.8830807209014893, "learning_rate": 2.2290916976281427e-08, "logits/chosen": -1.4777326583862305, "logits/rejected": -1.4632737636566162, "logps/chosen": -43.695831298828125, "logps/rejected": -46.05553436279297, "loss": 0.6863, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.00010001249756896868, "rewards/margins": 0.014169926755130291, "rewards/rejected": -0.014269940555095673, "step": 2420 }, { "epoch": 1.7507204610951008, "grad_norm": 3.572896957397461, "learning_rate": 2.2082641742147238e-08, "logits/chosen": -1.4719898700714111, "logits/rejected": -1.4631872177124023, "logps/chosen": -45.68647766113281, "logps/rejected": -51.5733757019043, "loss": 0.6867, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0008903827401809394, "rewards/margins": 0.013410898856818676, "rewards/rejected": -0.01252051629126072, "step": 2430 }, { "epoch": 1.7579250720461095, "grad_norm": 3.098632574081421, "learning_rate": 2.1874571586252177e-08, "logits/chosen": -1.5461061000823975, "logits/rejected": -1.534361481666565, "logps/chosen": -45.58122634887695, "logps/rejected": -48.32319259643555, "loss": 0.6878, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0007744136964902282, "rewards/margins": 0.010953008197247982, "rewards/rejected": -0.010178593918681145, "step": 2440 }, { "epoch": 1.7651296829971181, "grad_norm": 2.4132089614868164, "learning_rate": 2.1666721135069037e-08, "logits/chosen": -1.515826940536499, "logits/rejected": -1.5021852254867554, "logps/chosen": -49.859291076660156, "logps/rejected": -51.296424865722656, "loss": 0.6876, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0015759628731757402, "rewards/margins": 0.011497067287564278, "rewards/rejected": -0.009921105578541756, "step": 2450 }, { "epoch": 1.7723342939481268, "grad_norm": 2.642758846282959, "learning_rate": 2.145910499962628e-08, "logits/chosen": -1.5767768621444702, "logits/rejected": -1.555633544921875, "logps/chosen": -43.99817657470703, "logps/rejected": -46.1553955078125, "loss": 0.6847, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0037842721212655306, "rewards/margins": 0.017482485622167587, "rewards/rejected": -0.013698210939764977, "step": 2460 }, { "epoch": 1.7795389048991355, "grad_norm": 3.895792245864868, "learning_rate": 2.1251737774480915e-08, "logits/chosen": -1.5495898723602295, "logits/rejected": -1.5402934551239014, "logps/chosen": -53.23638916015625, "logps/rejected": -55.35784912109375, "loss": 0.6872, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0016384575283154845, "rewards/margins": 0.012212954461574554, "rewards/rejected": -0.010574499145150185, "step": 2470 }, { "epoch": 1.7867435158501441, "grad_norm": 2.5514557361602783, "learning_rate": 2.104463403669264e-08, "logits/chosen": -1.4776699542999268, "logits/rejected": -1.4571102857589722, "logps/chosen": -49.018821716308594, "logps/rejected": -51.24462127685547, "loss": 0.6864, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0007808968657627702, "rewards/margins": 0.013993730768561363, "rewards/rejected": -0.013212834484875202, "step": 2480 }, { "epoch": 1.7939481268011528, "grad_norm": 2.6605591773986816, "learning_rate": 2.0837808344799028e-08, "logits/chosen": -1.452739953994751, "logits/rejected": -1.4367029666900635, "logps/chosen": -43.84040832519531, "logps/rejected": -47.514102935791016, "loss": 0.6839, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.007030940148979425, "rewards/margins": 0.01895783096551895, "rewards/rejected": -0.011926891282200813, "step": 2490 }, { "epoch": 1.8011527377521612, "grad_norm": 3.2000277042388916, "learning_rate": 2.063127523779219e-08, "logits/chosen": -1.4303696155548096, "logits/rejected": -1.429574728012085, "logps/chosen": -44.83720016479492, "logps/rejected": -51.28551483154297, "loss": 0.6836, "rewards/accuracies": 0.71875, "rewards/chosen": 0.003173623699694872, "rewards/margins": 0.01957201212644577, "rewards/rejected": -0.016398390755057335, "step": 2500 }, { "epoch": 1.8083573487031699, "grad_norm": 3.828165054321289, "learning_rate": 2.0425049234096737e-08, "logits/chosen": -1.486255407333374, "logits/rejected": -1.4714888334274292, "logps/chosen": -49.1246452331543, "logps/rejected": -51.77213668823242, "loss": 0.6863, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 7.560032099718228e-05, "rewards/margins": 0.014201045036315918, "rewards/rejected": -0.014125445857644081, "step": 2510 }, { "epoch": 1.8155619596541785, "grad_norm": 2.640105962753296, "learning_rate": 2.0219144830549163e-08, "logits/chosen": -1.4610233306884766, "logits/rejected": -1.4512735605239868, "logps/chosen": -48.97557067871094, "logps/rejected": -52.63092803955078, "loss": 0.6849, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.001075071981176734, "rewards/margins": 0.017014745622873306, "rewards/rejected": -0.015939675271511078, "step": 2520 }, { "epoch": 1.8227665706051872, "grad_norm": 2.893141269683838, "learning_rate": 2.0013576501378823e-08, "logits/chosen": -1.4366611242294312, "logits/rejected": -1.4269273281097412, "logps/chosen": -44.668251037597656, "logps/rejected": -48.838382720947266, "loss": 0.6818, "rewards/accuracies": 0.65625, "rewards/chosen": 0.007614838890731335, "rewards/margins": 0.023311858996748924, "rewards/rejected": -0.015697021037340164, "step": 2530 }, { "epoch": 1.8299711815561959, "grad_norm": 3.5534820556640625, "learning_rate": 1.9808358697190426e-08, "logits/chosen": -1.462631344795227, "logits/rejected": -1.4623886346817017, "logps/chosen": -39.994834899902344, "logps/rejected": -45.32501220703125, "loss": 0.6846, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0002226762444479391, "rewards/margins": 0.017688129097223282, "rewards/rejected": -0.017910804599523544, "step": 2540 }, { "epoch": 1.8371757925072045, "grad_norm": 3.05342698097229, "learning_rate": 1.9603505843948214e-08, "logits/chosen": -1.4901165962219238, "logits/rejected": -1.4695804119110107, "logps/chosen": -41.027496337890625, "logps/rejected": -46.306419372558594, "loss": 0.6864, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0012056769337505102, "rewards/margins": 0.013855445198714733, "rewards/rejected": -0.012649768963456154, "step": 2550 }, { "epoch": 1.8443804034582132, "grad_norm": 3.029517412185669, "learning_rate": 1.9399032341961886e-08, "logits/chosen": -1.4612455368041992, "logits/rejected": -1.4414364099502563, "logps/chosen": -44.05613708496094, "logps/rejected": -45.91584777832031, "loss": 0.6869, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0033496886026114225, "rewards/margins": 0.012948046438395977, "rewards/rejected": -0.009598356671631336, "step": 2560 }, { "epoch": 1.8515850144092219, "grad_norm": 3.660400867462158, "learning_rate": 1.9194952564874323e-08, "logits/chosen": -1.490746259689331, "logits/rejected": -1.478116750717163, "logps/chosen": -49.373924255371094, "logps/rejected": -52.70782470703125, "loss": 0.686, "rewards/accuracies": 0.625, "rewards/chosen": 0.0015121384058147669, "rewards/margins": 0.014797654934227467, "rewards/rejected": -0.013285515829920769, "step": 2570 }, { "epoch": 1.8587896253602305, "grad_norm": 2.9589059352874756, "learning_rate": 1.8991280858651157e-08, "logits/chosen": -1.4661362171173096, "logits/rejected": -1.4451956748962402, "logps/chosen": -48.044097900390625, "logps/rejected": -49.68541717529297, "loss": 0.6863, "rewards/accuracies": 0.5625, "rewards/chosen": 0.000995612470433116, "rewards/margins": 0.014148500747978687, "rewards/rejected": -0.013152887113392353, "step": 2580 }, { "epoch": 1.8659942363112392, "grad_norm": 3.7703192234039307, "learning_rate": 1.8788031540572327e-08, "logits/chosen": -1.4331156015396118, "logits/rejected": -1.4193694591522217, "logps/chosen": -43.3116569519043, "logps/rejected": -47.158424377441406, "loss": 0.6848, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0016503268852829933, "rewards/margins": 0.01719464734196663, "rewards/rejected": -0.015544322319328785, "step": 2590 }, { "epoch": 1.8731988472622478, "grad_norm": 3.501920700073242, "learning_rate": 1.858521889822565e-08, "logits/chosen": -1.481728434562683, "logits/rejected": -1.4720659255981445, "logps/chosen": -44.771629333496094, "logps/rejected": -47.331321716308594, "loss": 0.6875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00216684746555984, "rewards/margins": 0.01176757737994194, "rewards/rejected": -0.009600730612874031, "step": 2600 }, { "epoch": 1.8804034582132565, "grad_norm": 3.0656304359436035, "learning_rate": 1.8382857188502422e-08, "logits/chosen": -1.480365514755249, "logits/rejected": -1.4653451442718506, "logps/chosen": -43.37881851196289, "logps/rejected": -46.211647033691406, "loss": 0.6857, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.001201205188408494, "rewards/margins": 0.015598386526107788, "rewards/rejected": -0.014397179707884789, "step": 2610 }, { "epoch": 1.8876080691642652, "grad_norm": 3.0691475868225098, "learning_rate": 1.8180960636595234e-08, "logits/chosen": -1.4347946643829346, "logits/rejected": -1.4240456819534302, "logps/chosen": -45.46348190307617, "logps/rejected": -48.79424285888672, "loss": 0.6844, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.000794556166511029, "rewards/margins": 0.01804800145328045, "rewards/rejected": -0.017253447324037552, "step": 2620 }, { "epoch": 1.8948126801152738, "grad_norm": 2.6301233768463135, "learning_rate": 1.7979543434998015e-08, "logits/chosen": -1.5175861120224, "logits/rejected": -1.5130423307418823, "logps/chosen": -54.06511688232422, "logps/rejected": -55.68177032470703, "loss": 0.6888, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0019837587606161833, "rewards/margins": 0.009002977050840855, "rewards/rejected": -0.010986736044287682, "step": 2630 }, { "epoch": 1.9020172910662825, "grad_norm": 3.1905744075775146, "learning_rate": 1.7778619742508345e-08, "logits/chosen": -1.4991130828857422, "logits/rejected": -1.4788715839385986, "logps/chosen": -48.78097152709961, "logps/rejected": -50.549293518066406, "loss": 0.6862, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0013503096997737885, "rewards/margins": 0.01445526909083128, "rewards/rejected": -0.015805575996637344, "step": 2640 }, { "epoch": 1.9092219020172911, "grad_norm": 5.37537145614624, "learning_rate": 1.757820368323213e-08, "logits/chosen": -1.4480946063995361, "logits/rejected": -1.431839108467102, "logps/chosen": -55.62885284423828, "logps/rejected": -60.84233856201172, "loss": 0.6854, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00024756681523285806, "rewards/margins": 0.01604391634464264, "rewards/rejected": -0.016291480511426926, "step": 2650 }, { "epoch": 1.9164265129682998, "grad_norm": 2.7058212757110596, "learning_rate": 1.7378309345590803e-08, "logits/chosen": -1.5186960697174072, "logits/rejected": -1.521729826927185, "logps/chosen": -48.12525177001953, "logps/rejected": -51.73346710205078, "loss": 0.6859, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0006261245580390096, "rewards/margins": 0.014795279130339622, "rewards/rejected": -0.01416915375739336, "step": 2660 }, { "epoch": 1.9236311239193085, "grad_norm": 3.0411376953125, "learning_rate": 1.717895078133088e-08, "logits/chosen": -1.537657618522644, "logits/rejected": -1.5279037952423096, "logps/chosen": -45.69334411621094, "logps/rejected": -50.841529846191406, "loss": 0.6846, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.001616945257410407, "rewards/margins": 0.017612287774682045, "rewards/rejected": -0.015995342284440994, "step": 2670 }, { "epoch": 1.9308357348703171, "grad_norm": 2.9507884979248047, "learning_rate": 1.698014200453624e-08, "logits/chosen": -1.512743353843689, "logits/rejected": -1.516052007675171, "logps/chosen": -48.50223922729492, "logps/rejected": -53.1720085144043, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0005373925669118762, "rewards/margins": 0.007933690212666988, "rewards/rejected": -0.007396298460662365, "step": 2680 }, { "epoch": 1.9380403458213258, "grad_norm": 3.1136631965637207, "learning_rate": 1.6781896990642964e-08, "logits/chosen": -1.4173403978347778, "logits/rejected": -1.4077566862106323, "logps/chosen": -53.65788650512695, "logps/rejected": -55.55950927734375, "loss": 0.6876, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0008957240497693419, "rewards/margins": 0.011654635891318321, "rewards/rejected": -0.010758912190794945, "step": 2690 }, { "epoch": 1.9452449567723344, "grad_norm": 3.702500343322754, "learning_rate": 1.658422967545693e-08, "logits/chosen": -1.545461893081665, "logits/rejected": -1.5236259698867798, "logps/chosen": -46.60261917114258, "logps/rejected": -48.830589294433594, "loss": 0.6866, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0023856277111917734, "rewards/margins": 0.013566548936069012, "rewards/rejected": -0.015952177345752716, "step": 2700 }, { "epoch": 1.952449567723343, "grad_norm": 3.34963059425354, "learning_rate": 1.638715395417418e-08, "logits/chosen": -1.5168092250823975, "logits/rejected": -1.5006424188613892, "logps/chosen": -47.71814727783203, "logps/rejected": -50.16883087158203, "loss": 0.6873, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0012928284704685211, "rewards/margins": 0.011976560577750206, "rewards/rejected": -0.013269389048218727, "step": 2710 }, { "epoch": 1.9596541786743515, "grad_norm": 3.404601573944092, "learning_rate": 1.619068368040416e-08, "logits/chosen": -1.5042836666107178, "logits/rejected": -1.4942996501922607, "logps/chosen": -42.337650299072266, "logps/rejected": -47.94048309326172, "loss": 0.6849, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0009236250771209598, "rewards/margins": 0.016973894089460373, "rewards/rejected": -0.016050271689891815, "step": 2720 }, { "epoch": 1.9668587896253602, "grad_norm": 3.3033130168914795, "learning_rate": 1.5994832665195853e-08, "logits/chosen": -1.4348411560058594, "logits/rejected": -1.4287351369857788, "logps/chosen": -46.385520935058594, "logps/rejected": -48.763214111328125, "loss": 0.6874, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0006011429941281676, "rewards/margins": 0.011924292892217636, "rewards/rejected": -0.01132314931601286, "step": 2730 }, { "epoch": 1.9740634005763689, "grad_norm": 3.290114641189575, "learning_rate": 1.5799614676066906e-08, "logits/chosen": -1.5618253946304321, "logits/rejected": -1.5571167469024658, "logps/chosen": -42.574127197265625, "logps/rejected": -47.01802444458008, "loss": 0.6854, "rewards/accuracies": 0.625, "rewards/chosen": -0.0009528351947665215, "rewards/margins": 0.016010040417313576, "rewards/rejected": -0.016962874680757523, "step": 2740 }, { "epoch": 1.9812680115273775, "grad_norm": 2.788604974746704, "learning_rate": 1.560504343603587e-08, "logits/chosen": -1.4581258296966553, "logits/rejected": -1.460876226425171, "logps/chosen": -47.58959197998047, "logps/rejected": -53.10105514526367, "loss": 0.6864, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0016212237533181906, "rewards/margins": 0.01379337441176176, "rewards/rejected": -0.012172150425612926, "step": 2750 }, { "epoch": 1.9884726224783862, "grad_norm": 2.688870668411255, "learning_rate": 1.541113262265748e-08, "logits/chosen": -1.5588115453720093, "logits/rejected": -1.5540285110473633, "logps/chosen": -47.810081481933594, "logps/rejected": -51.99542236328125, "loss": 0.6856, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0008094706572592258, "rewards/margins": 0.01555786095559597, "rewards/rejected": -0.01474839262664318, "step": 2760 }, { "epoch": 1.9956772334293948, "grad_norm": 2.7986905574798584, "learning_rate": 1.5217895867061227e-08, "logits/chosen": -1.4807400703430176, "logits/rejected": -1.4691799879074097, "logps/chosen": -49.02349090576172, "logps/rejected": -51.68681716918945, "loss": 0.6855, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1304626241326332e-05, "rewards/margins": 0.015814241021871567, "rewards/rejected": -0.01582554541528225, "step": 2770 }, { "epoch": 2.0028818443804033, "grad_norm": 3.2165708541870117, "learning_rate": 1.5025346752993098e-08, "logits/chosen": -1.4738900661468506, "logits/rejected": -1.4782806634902954, "logps/chosen": -47.22654724121094, "logps/rejected": -51.38325119018555, "loss": 0.6888, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.002990271197631955, "rewards/margins": 0.009079065173864365, "rewards/rejected": -0.012069336138665676, "step": 2780 }, { "epoch": 2.010086455331412, "grad_norm": 3.1721432209014893, "learning_rate": 1.4833498815860756e-08, "logits/chosen": -1.6040818691253662, "logits/rejected": -1.5952320098876953, "logps/chosen": -44.67755889892578, "logps/rejected": -49.35480499267578, "loss": 0.6835, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0012609490659087896, "rewards/margins": 0.019852185621857643, "rewards/rejected": -0.01859123632311821, "step": 2790 }, { "epoch": 2.0172910662824206, "grad_norm": 3.4663138389587402, "learning_rate": 1.4642365541781993e-08, "logits/chosen": -1.4190483093261719, "logits/rejected": -1.402295470237732, "logps/chosen": -46.372066497802734, "logps/rejected": -51.25910568237305, "loss": 0.6851, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0015671824803575873, "rewards/margins": 0.016764571890234947, "rewards/rejected": -0.018331754952669144, "step": 2800 }, { "epoch": 2.0244956772334293, "grad_norm": 3.538506507873535, "learning_rate": 1.4451960366636745e-08, "logits/chosen": -1.5058993101119995, "logits/rejected": -1.510118007659912, "logps/chosen": -50.21342086791992, "logps/rejected": -54.794654846191406, "loss": 0.6862, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.00038042213418520987, "rewards/margins": 0.014330941252410412, "rewards/rejected": -0.013950521126389503, "step": 2810 }, { "epoch": 2.031700288184438, "grad_norm": 2.9821181297302246, "learning_rate": 1.4262296675122592e-08, "logits/chosen": -1.506753921508789, "logits/rejected": -1.491409420967102, "logps/chosen": -43.888484954833984, "logps/rejected": -48.556732177734375, "loss": 0.685, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.00017104865401051939, "rewards/margins": 0.01672585867345333, "rewards/rejected": -0.01689690724015236, "step": 2820 }, { "epoch": 2.0389048991354466, "grad_norm": 3.436539888381958, "learning_rate": 1.407338779981389e-08, "logits/chosen": -1.4747148752212524, "logits/rejected": -1.4630435705184937, "logps/chosen": -41.41279602050781, "logps/rejected": -46.371131896972656, "loss": 0.6833, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0008243538322858512, "rewards/margins": 0.02022518590092659, "rewards/rejected": -0.021049540489912033, "step": 2830 }, { "epoch": 2.0461095100864553, "grad_norm": 3.1391549110412598, "learning_rate": 1.3885247020224534e-08, "logits/chosen": -1.4744846820831299, "logits/rejected": -1.4637953042984009, "logps/chosen": -40.93677520751953, "logps/rejected": -44.24010467529297, "loss": 0.6836, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0011112012434750795, "rewards/margins": 0.01986369863152504, "rewards/rejected": -0.01875249855220318, "step": 2840 }, { "epoch": 2.053314121037464, "grad_norm": 2.7591843605041504, "learning_rate": 1.369788756187445e-08, "logits/chosen": -1.5235010385513306, "logits/rejected": -1.5118696689605713, "logps/chosen": -46.81077194213867, "logps/rejected": -48.06627655029297, "loss": 0.6879, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.002389531582593918, "rewards/margins": 0.011110137216746807, "rewards/rejected": -0.0134996697306633, "step": 2850 }, { "epoch": 2.0605187319884726, "grad_norm": 3.157299518585205, "learning_rate": 1.3511322595359925e-08, "logits/chosen": -1.532439112663269, "logits/rejected": -1.5212130546569824, "logps/chosen": -43.265541076660156, "logps/rejected": -48.95451354980469, "loss": 0.684, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0006747387815266848, "rewards/margins": 0.01883287914097309, "rewards/rejected": -0.019507618620991707, "step": 2860 }, { "epoch": 2.0677233429394812, "grad_norm": 3.223790407180786, "learning_rate": 1.3325565235427716e-08, "logits/chosen": -1.5531320571899414, "logits/rejected": -1.5446122884750366, "logps/chosen": -45.25885772705078, "logps/rejected": -49.23384475708008, "loss": 0.6846, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0007360210875049233, "rewards/margins": 0.017545271664857864, "rewards/rejected": -0.01828129217028618, "step": 2870 }, { "epoch": 2.07492795389049, "grad_norm": 3.642242670059204, "learning_rate": 1.3140628540053218e-08, "logits/chosen": -1.4586588144302368, "logits/rejected": -1.45664381980896, "logps/chosen": -45.87567138671875, "logps/rejected": -49.37241744995117, "loss": 0.6856, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.002558287465944886, "rewards/margins": 0.015417991206049919, "rewards/rejected": -0.012859704904258251, "step": 2880 }, { "epoch": 2.0821325648414986, "grad_norm": 3.986879587173462, "learning_rate": 1.2956525509522451e-08, "logits/chosen": -1.4353663921356201, "logits/rejected": -1.4404122829437256, "logps/chosen": -47.817115783691406, "logps/rejected": -51.359764099121094, "loss": 0.6874, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.001873530214652419, "rewards/margins": 0.011945443227887154, "rewards/rejected": -0.010071912780404091, "step": 2890 }, { "epoch": 2.089337175792507, "grad_norm": 3.8390724658966064, "learning_rate": 1.2773269085518267e-08, "logits/chosen": -1.516808032989502, "logits/rejected": -1.5112401247024536, "logps/chosen": -52.496673583984375, "logps/rejected": -56.082305908203125, "loss": 0.6867, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0004946159897372127, "rewards/margins": 0.013363862410187721, "rewards/rejected": -0.012869248166680336, "step": 2900 }, { "epoch": 2.096541786743516, "grad_norm": 2.6431021690368652, "learning_rate": 1.2590872150210574e-08, "logits/chosen": -1.5927358865737915, "logits/rejected": -1.5763109922409058, "logps/chosen": -45.6403694152832, "logps/rejected": -47.73447799682617, "loss": 0.684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004319643136113882, "rewards/margins": 0.019127164036035538, "rewards/rejected": -0.023446807637810707, "step": 2910 }, { "epoch": 2.1037463976945245, "grad_norm": 2.831928014755249, "learning_rate": 1.2409347525350775e-08, "logits/chosen": -1.5002295970916748, "logits/rejected": -1.481815218925476, "logps/chosen": -47.42546844482422, "logps/rejected": -51.29052734375, "loss": 0.6834, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0010362366447225213, "rewards/margins": 0.02007184363901615, "rewards/rejected": -0.01903560571372509, "step": 2920 }, { "epoch": 2.110951008645533, "grad_norm": 3.4328200817108154, "learning_rate": 1.2228707971370421e-08, "logits/chosen": -1.497158169746399, "logits/rejected": -1.4785023927688599, "logps/chosen": -42.02567672729492, "logps/rejected": -44.48183059692383, "loss": 0.6841, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.002565313596278429, "rewards/margins": 0.01863184943795204, "rewards/rejected": -0.01606653444468975, "step": 2930 }, { "epoch": 2.118155619596542, "grad_norm": 4.378489017486572, "learning_rate": 1.2048966186484282e-08, "logits/chosen": -1.5265371799468994, "logits/rejected": -1.496654987335205, "logps/chosen": -52.09857940673828, "logps/rejected": -54.9652099609375, "loss": 0.6863, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0012157706078141928, "rewards/margins": 0.014177958481013775, "rewards/rejected": -0.01539373118430376, "step": 2940 }, { "epoch": 2.1253602305475505, "grad_norm": 3.2968902587890625, "learning_rate": 1.187013480579762e-08, "logits/chosen": -1.4905649423599243, "logits/rejected": -1.4843100309371948, "logps/chosen": -45.35689163208008, "logps/rejected": -49.30203628540039, "loss": 0.6843, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.004151071421802044, "rewards/margins": 0.01855386421084404, "rewards/rejected": -0.02270493470132351, "step": 2950 }, { "epoch": 2.132564841498559, "grad_norm": 4.340986251831055, "learning_rate": 1.1692226400418073e-08, "logits/chosen": -1.4142036437988281, "logits/rejected": -1.406091332435608, "logps/chosen": -48.97296142578125, "logps/rejected": -51.94805145263672, "loss": 0.6856, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0036544338800013065, "rewards/margins": 0.015584975481033325, "rewards/rejected": -0.019239408895373344, "step": 2960 }, { "epoch": 2.139769452449568, "grad_norm": 2.482980966567993, "learning_rate": 1.1515253476571923e-08, "logits/chosen": -1.4493087530136108, "logits/rejected": -1.4434731006622314, "logps/chosen": -44.389686584472656, "logps/rejected": -50.919029235839844, "loss": 0.6844, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.003044532146304846, "rewards/margins": 0.018028805032372475, "rewards/rejected": -0.021073337644338608, "step": 2970 }, { "epoch": 2.1469740634005765, "grad_norm": 3.3252360820770264, "learning_rate": 1.133922847472496e-08, "logits/chosen": -1.4918544292449951, "logits/rejected": -1.4879610538482666, "logps/chosen": -52.49553298950195, "logps/rejected": -54.95599365234375, "loss": 0.6852, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00021512620151042938, "rewards/margins": 0.01650981977581978, "rewards/rejected": -0.016724945977330208, "step": 2980 }, { "epoch": 2.154178674351585, "grad_norm": 3.3002827167510986, "learning_rate": 1.1164163768707952e-08, "logits/chosen": -1.4666965007781982, "logits/rejected": -1.4556134939193726, "logps/chosen": -47.383094787597656, "logps/rejected": -51.5208740234375, "loss": 0.6828, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.00024200770712923259, "rewards/margins": 0.021655386313796043, "rewards/rejected": -0.021897396072745323, "step": 2990 }, { "epoch": 2.161383285302594, "grad_norm": 3.350294589996338, "learning_rate": 1.0990071664846861e-08, "logits/chosen": -1.4408986568450928, "logits/rejected": -1.4307146072387695, "logps/chosen": -48.706851959228516, "logps/rejected": -53.8525276184082, "loss": 0.6822, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.0007291844231076539, "rewards/margins": 0.02262338064610958, "rewards/rejected": -0.021894199773669243, "step": 3000 }, { "epoch": 2.1685878962536025, "grad_norm": 3.0558321475982666, "learning_rate": 1.0816964401097739e-08, "logits/chosen": -1.4839801788330078, "logits/rejected": -1.473850965499878, "logps/chosen": -42.997833251953125, "logps/rejected": -45.74707794189453, "loss": 0.6859, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0003010531945619732, "rewards/margins": 0.015322072431445122, "rewards/rejected": -0.015623128041625023, "step": 3010 }, { "epoch": 2.175792507204611, "grad_norm": 3.894000768661499, "learning_rate": 1.0644854146186406e-08, "logits/chosen": -1.5161851644515991, "logits/rejected": -1.4984285831451416, "logps/chosen": -48.15696716308594, "logps/rejected": -52.829368591308594, "loss": 0.6831, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.001691700192168355, "rewards/margins": 0.021001458168029785, "rewards/rejected": -0.02269316092133522, "step": 3020 }, { "epoch": 2.18299711815562, "grad_norm": 3.200721025466919, "learning_rate": 1.0473752998753114e-08, "logits/chosen": -1.4961122274398804, "logits/rejected": -1.4744278192520142, "logps/chosen": -48.510780334472656, "logps/rejected": -51.6027717590332, "loss": 0.6826, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0015695259207859635, "rewards/margins": 0.021745964884757996, "rewards/rejected": -0.020176438614726067, "step": 3030 }, { "epoch": 2.1902017291066285, "grad_norm": 2.9486958980560303, "learning_rate": 1.030367298650201e-08, "logits/chosen": -1.494507074356079, "logits/rejected": -1.4939751625061035, "logps/chosen": -48.71635818481445, "logps/rejected": -53.54632568359375, "loss": 0.6874, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.003486091736704111, "rewards/margins": 0.01207701861858368, "rewards/rejected": -0.015563109889626503, "step": 3040 }, { "epoch": 2.1974063400576367, "grad_norm": 3.8673534393310547, "learning_rate": 1.0134626065355675e-08, "logits/chosen": -1.5955169200897217, "logits/rejected": -1.5843254327774048, "logps/chosen": -49.311912536621094, "logps/rejected": -52.8543586730957, "loss": 0.6827, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0020087913144379854, "rewards/margins": 0.02148135006427765, "rewards/rejected": -0.01947256177663803, "step": 3050 }, { "epoch": 2.2046109510086453, "grad_norm": 3.485724449157715, "learning_rate": 9.966624118614611e-09, "logits/chosen": -1.4923583269119263, "logits/rejected": -1.4730615615844727, "logps/chosen": -52.26487350463867, "logps/rejected": -55.42426681518555, "loss": 0.6839, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0026403036899864674, "rewards/margins": 0.0191368255764246, "rewards/rejected": -0.01649652048945427, "step": 3060 }, { "epoch": 2.211815561959654, "grad_norm": 2.4057836532592773, "learning_rate": 9.799678956121976e-09, "logits/chosen": -1.4374769926071167, "logits/rejected": -1.4209473133087158, "logps/chosen": -45.876285552978516, "logps/rejected": -48.4627799987793, "loss": 0.6872, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0018981884932145476, "rewards/margins": 0.012246804311871529, "rewards/rejected": -0.01414499245584011, "step": 3070 }, { "epoch": 2.2190201729106627, "grad_norm": 3.5483055114746094, "learning_rate": 9.633802313433314e-09, "logits/chosen": -1.4146339893341064, "logits/rejected": -1.4106262922286987, "logps/chosen": -48.35334014892578, "logps/rejected": -50.88026809692383, "loss": 0.6853, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0011129272170364857, "rewards/margins": 0.016139192506670952, "rewards/rejected": -0.017252117395401, "step": 3080 }, { "epoch": 2.2262247838616713, "grad_norm": 2.770353317260742, "learning_rate": 9.469005850991705e-09, "logits/chosen": -1.4854236841201782, "logits/rejected": -1.4725892543792725, "logps/chosen": -47.15768051147461, "logps/rejected": -48.600120544433594, "loss": 0.6845, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0014169791247695684, "rewards/margins": 0.01794120855629444, "rewards/rejected": -0.019358184188604355, "step": 3090 }, { "epoch": 2.23342939481268, "grad_norm": 3.1680026054382324, "learning_rate": 9.305301153307949e-09, "logits/chosen": -1.4969298839569092, "logits/rejected": -1.5010452270507812, "logps/chosen": -39.92610549926758, "logps/rejected": -43.93165969848633, "loss": 0.6835, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.004226221702992916, "rewards/margins": 0.019832659512758255, "rewards/rejected": -0.024058884009718895, "step": 3100 }, { "epoch": 2.2406340057636887, "grad_norm": 2.740999937057495, "learning_rate": 9.142699728146336e-09, "logits/chosen": -1.4348478317260742, "logits/rejected": -1.426283597946167, "logps/chosen": -46.04469299316406, "logps/rejected": -50.985870361328125, "loss": 0.6845, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.00199733953922987, "rewards/margins": 0.01793844625353813, "rewards/rejected": -0.019935782998800278, "step": 3110 }, { "epoch": 2.2478386167146973, "grad_norm": 2.9771533012390137, "learning_rate": 8.981213005715627e-09, "logits/chosen": -1.5030255317687988, "logits/rejected": -1.5023804903030396, "logps/chosen": -44.155548095703125, "logps/rejected": -48.9468994140625, "loss": 0.6844, "rewards/accuracies": 0.625, "rewards/chosen": -0.0017275598365813494, "rewards/margins": 0.017975138500332832, "rewards/rejected": -0.019702699035406113, "step": 3120 }, { "epoch": 2.255043227665706, "grad_norm": 3.6485941410064697, "learning_rate": 8.820852337865611e-09, "logits/chosen": -1.5533548593521118, "logits/rejected": -1.537881851196289, "logps/chosen": -45.01945114135742, "logps/rejected": -48.55274200439453, "loss": 0.6847, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.000804073759354651, "rewards/margins": 0.01763494312763214, "rewards/rejected": -0.018439019098877907, "step": 3130 }, { "epoch": 2.2622478386167146, "grad_norm": 2.8052473068237305, "learning_rate": 8.661628997289044e-09, "logits/chosen": -1.4350953102111816, "logits/rejected": -1.4218308925628662, "logps/chosen": -45.344215393066406, "logps/rejected": -49.77771759033203, "loss": 0.6842, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.000743370212148875, "rewards/margins": 0.01851017400622368, "rewards/rejected": -0.019253544509410858, "step": 3140 }, { "epoch": 2.2694524495677233, "grad_norm": 2.799929141998291, "learning_rate": 8.503554176729341e-09, "logits/chosen": -1.4219027757644653, "logits/rejected": -1.4167585372924805, "logps/chosen": -45.40351104736328, "logps/rejected": -48.951255798339844, "loss": 0.6836, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0013620567042380571, "rewards/margins": 0.019856946542859077, "rewards/rejected": -0.018494892865419388, "step": 3150 }, { "epoch": 2.276657060518732, "grad_norm": 3.7959582805633545, "learning_rate": 8.346638988193636e-09, "logits/chosen": -1.4739030599594116, "logits/rejected": -1.4716893434524536, "logps/chosen": -40.54225158691406, "logps/rejected": -46.273109436035156, "loss": 0.6838, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0006361313280649483, "rewards/margins": 0.019271841272711754, "rewards/rejected": -0.01990797556936741, "step": 3160 }, { "epoch": 2.2838616714697406, "grad_norm": 4.178712368011475, "learning_rate": 8.19089446217176e-09, "logits/chosen": -1.4299513101577759, "logits/rejected": -1.4083366394042969, "logps/chosen": -45.69487380981445, "logps/rejected": -51.08367919921875, "loss": 0.68, "rewards/accuracies": 0.625, "rewards/chosen": 0.002962021389976144, "rewards/margins": 0.027211204171180725, "rewards/rejected": -0.024249184876680374, "step": 3170 }, { "epoch": 2.2910662824207493, "grad_norm": 3.0471279621124268, "learning_rate": 8.036331546860777e-09, "logits/chosen": -1.457237958908081, "logits/rejected": -1.45479154586792, "logps/chosen": -45.34724044799805, "logps/rejected": -48.17335891723633, "loss": 0.6876, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0030756318010389805, "rewards/margins": 0.011720901355147362, "rewards/rejected": -0.014796535484492779, "step": 3180 }, { "epoch": 2.298270893371758, "grad_norm": 3.6594536304473877, "learning_rate": 7.882961107395416e-09, "logits/chosen": -1.4986907243728638, "logits/rejected": -1.4889423847198486, "logps/chosen": -52.23211669921875, "logps/rejected": -52.596961975097656, "loss": 0.687, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0050607905723154545, "rewards/margins": 0.01296902447938919, "rewards/rejected": -0.01802981272339821, "step": 3190 }, { "epoch": 2.3054755043227666, "grad_norm": 4.586586952209473, "learning_rate": 7.73079392508428e-09, "logits/chosen": -1.4226272106170654, "logits/rejected": -1.4266163110733032, "logps/chosen": -49.690284729003906, "logps/rejected": -56.577552795410156, "loss": 0.6827, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0031807306222617626, "rewards/margins": 0.021851547062397003, "rewards/rejected": -0.025032276287674904, "step": 3200 }, { "epoch": 2.3126801152737753, "grad_norm": 3.5938634872436523, "learning_rate": 7.579840696651938e-09, "logits/chosen": -1.5148307085037231, "logits/rejected": -1.5085079669952393, "logps/chosen": -42.24188232421875, "logps/rejected": -45.60350799560547, "loss": 0.6843, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.003505673259496689, "rewards/margins": 0.0183546245098114, "rewards/rejected": -0.02186029590666294, "step": 3210 }, { "epoch": 2.319884726224784, "grad_norm": 4.208233833312988, "learning_rate": 7.43011203348704e-09, "logits/chosen": -1.358338475227356, "logits/rejected": -1.353421926498413, "logps/chosen": -53.043052673339844, "logps/rejected": -53.77092361450195, "loss": 0.6859, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0048605212941765785, "rewards/margins": 0.015131947584450245, "rewards/rejected": -0.019992467015981674, "step": 3220 }, { "epoch": 2.3270893371757926, "grad_norm": 3.2215311527252197, "learning_rate": 7.281618460896344e-09, "logits/chosen": -1.4850248098373413, "logits/rejected": -1.4745427370071411, "logps/chosen": -46.2344970703125, "logps/rejected": -50.65446472167969, "loss": 0.6847, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0013599084923043847, "rewards/margins": 0.017538536339998245, "rewards/rejected": -0.01889844611287117, "step": 3230 }, { "epoch": 2.3342939481268012, "grad_norm": 2.962292194366455, "learning_rate": 7.134370417364849e-09, "logits/chosen": -1.432776689529419, "logits/rejected": -1.4246224164962769, "logps/chosen": -45.164649963378906, "logps/rejected": -47.972198486328125, "loss": 0.6868, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.006024752743542194, "rewards/margins": 0.013254925608634949, "rewards/rejected": -0.019279679283499718, "step": 3240 }, { "epoch": 2.34149855907781, "grad_norm": 4.003485202789307, "learning_rate": 6.988378253821981e-09, "logits/chosen": -1.4599487781524658, "logits/rejected": -1.4520995616912842, "logps/chosen": -51.3668098449707, "logps/rejected": -54.87144088745117, "loss": 0.6869, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00047857934259809554, "rewards/margins": 0.013025891967117786, "rewards/rejected": -0.012547312304377556, "step": 3250 }, { "epoch": 2.3487031700288186, "grad_norm": 3.05368971824646, "learning_rate": 6.8436522329140186e-09, "logits/chosen": -1.4416824579238892, "logits/rejected": -1.4479385614395142, "logps/chosen": -46.932762145996094, "logps/rejected": -50.70888900756836, "loss": 0.6856, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0023715035058557987, "rewards/margins": 0.01593146100640297, "rewards/rejected": -0.01830296590924263, "step": 3260 }, { "epoch": 2.3559077809798272, "grad_norm": 3.5134246349334717, "learning_rate": 6.700202528282603e-09, "logits/chosen": -1.4186961650848389, "logits/rejected": -1.399042010307312, "logps/chosen": -48.512840270996094, "logps/rejected": -51.45924758911133, "loss": 0.6837, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.005258539691567421, "rewards/margins": 0.01997128315269947, "rewards/rejected": -0.02522982284426689, "step": 3270 }, { "epoch": 2.363112391930836, "grad_norm": 3.723384141921997, "learning_rate": 6.558039223849668e-09, "logits/chosen": -1.5115994215011597, "logits/rejected": -1.492408037185669, "logps/chosen": -46.227333068847656, "logps/rejected": -52.55454635620117, "loss": 0.6819, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0013681629206985235, "rewards/margins": 0.023501722142100334, "rewards/rejected": -0.0248698852956295, "step": 3280 }, { "epoch": 2.3703170028818445, "grad_norm": 2.8809094429016113, "learning_rate": 6.417172313108471e-09, "logits/chosen": -1.4260962009429932, "logits/rejected": -1.4144647121429443, "logps/chosen": -44.0165901184082, "logps/rejected": -47.23214340209961, "loss": 0.6853, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00685364892706275, "rewards/margins": 0.016492731869220734, "rewards/rejected": -0.02334637939929962, "step": 3290 }, { "epoch": 2.377521613832853, "grad_norm": 2.958158254623413, "learning_rate": 6.277611698421179e-09, "logits/chosen": -1.5557560920715332, "logits/rejected": -1.5354154109954834, "logps/chosen": -39.024757385253906, "logps/rejected": -44.89704513549805, "loss": 0.6816, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.003160464344546199, "rewards/margins": 0.02409496158361435, "rewards/rejected": -0.027255425229668617, "step": 3300 }, { "epoch": 2.3847262247838614, "grad_norm": 4.754333972930908, "learning_rate": 6.139367190322714e-09, "logits/chosen": -1.4941930770874023, "logits/rejected": -1.494152307510376, "logps/chosen": -52.68627166748047, "logps/rejected": -58.06211471557617, "loss": 0.6857, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.003172674449160695, "rewards/margins": 0.015608375892043114, "rewards/rejected": -0.01878105290234089, "step": 3310 }, { "epoch": 2.39193083573487, "grad_norm": 2.502631902694702, "learning_rate": 6.002448506831171e-09, "logits/chosen": -1.4809995889663696, "logits/rejected": -1.4765173196792603, "logps/chosen": -44.01865005493164, "logps/rejected": -49.128318786621094, "loss": 0.6847, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0028991461731493473, "rewards/margins": 0.017662644386291504, "rewards/rejected": -0.020561790093779564, "step": 3320 }, { "epoch": 2.3991354466858787, "grad_norm": 3.0286099910736084, "learning_rate": 5.866865272764607e-09, "logits/chosen": -1.4967315196990967, "logits/rejected": -1.490002989768982, "logps/chosen": -46.34364318847656, "logps/rejected": -50.384918212890625, "loss": 0.6856, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.005602546967566013, "rewards/margins": 0.015632400289177895, "rewards/rejected": -0.021234950050711632, "step": 3330 }, { "epoch": 2.4063400576368874, "grad_norm": 4.623770713806152, "learning_rate": 5.7326270190645595e-09, "logits/chosen": -1.3300215005874634, "logits/rejected": -1.3248698711395264, "logps/chosen": -49.87703323364258, "logps/rejected": -51.88032913208008, "loss": 0.6849, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00470289122313261, "rewards/margins": 0.017181994393467903, "rewards/rejected": -0.02188488282263279, "step": 3340 }, { "epoch": 2.413544668587896, "grad_norm": 3.6492199897766113, "learning_rate": 5.599743182125938e-09, "logits/chosen": -1.539113163948059, "logits/rejected": -1.5383161306381226, "logps/chosen": -48.7152214050293, "logps/rejected": -54.0184326171875, "loss": 0.685, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0006119796307757497, "rewards/margins": 0.01680837571620941, "rewards/rejected": -0.01742035523056984, "step": 3350 }, { "epoch": 2.4207492795389047, "grad_norm": 3.46722412109375, "learning_rate": 5.46822310313379e-09, "logits/chosen": -1.5624011754989624, "logits/rejected": -1.5669426918029785, "logps/chosen": -49.3877067565918, "logps/rejected": -52.6993408203125, "loss": 0.6876, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00427304208278656, "rewards/margins": 0.011617867276072502, "rewards/rejected": -0.015890907496213913, "step": 3360 }, { "epoch": 2.4279538904899134, "grad_norm": 3.6487491130828857, "learning_rate": 5.33807602740658e-09, "logits/chosen": -1.557901382446289, "logits/rejected": -1.5423341989517212, "logps/chosen": -41.81071853637695, "logps/rejected": -47.327213287353516, "loss": 0.6803, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0005742600187659264, "rewards/margins": 0.026613151654601097, "rewards/rejected": -0.027187416329979897, "step": 3370 }, { "epoch": 2.435158501440922, "grad_norm": 3.7982168197631836, "learning_rate": 5.209311103746334e-09, "logits/chosen": -1.476545810699463, "logits/rejected": -1.4724234342575073, "logps/chosen": -47.061851501464844, "logps/rejected": -52.321556091308594, "loss": 0.684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0026500283274799585, "rewards/margins": 0.01901710033416748, "rewards/rejected": -0.021667128428816795, "step": 3380 }, { "epoch": 2.4423631123919307, "grad_norm": 4.152634620666504, "learning_rate": 5.081937383795484e-09, "logits/chosen": -1.4653098583221436, "logits/rejected": -1.4548850059509277, "logps/chosen": -44.15032958984375, "logps/rejected": -48.76853942871094, "loss": 0.6824, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0003869136853609234, "rewards/margins": 0.022202041000127792, "rewards/rejected": -0.022588953375816345, "step": 3390 }, { "epoch": 2.4495677233429394, "grad_norm": 3.6726996898651123, "learning_rate": 4.955963821400599e-09, "logits/chosen": -1.5260181427001953, "logits/rejected": -1.5078346729278564, "logps/chosen": -46.817447662353516, "logps/rejected": -49.603084564208984, "loss": 0.683, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0016360729932785034, "rewards/margins": 0.021213185042142868, "rewards/rejected": -0.022849254310131073, "step": 3400 }, { "epoch": 2.456772334293948, "grad_norm": 2.7636361122131348, "learning_rate": 4.831399271982928e-09, "logits/chosen": -1.3976712226867676, "logits/rejected": -1.3814750909805298, "logps/chosen": -49.63505172729492, "logps/rejected": -52.69793701171875, "loss": 0.6833, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0008561603026464581, "rewards/margins": 0.02064884826540947, "rewards/rejected": -0.02150500938296318, "step": 3410 }, { "epoch": 2.4639769452449567, "grad_norm": 3.9705750942230225, "learning_rate": 4.708252491915951e-09, "logits/chosen": -1.5011204481124878, "logits/rejected": -1.4910115003585815, "logps/chosen": -47.02643585205078, "logps/rejected": -51.54557418823242, "loss": 0.6833, "rewards/accuracies": 0.625, "rewards/chosen": -0.0034367390908300877, "rewards/margins": 0.0207851342856884, "rewards/rejected": -0.02422187477350235, "step": 3420 }, { "epoch": 2.4711815561959654, "grad_norm": 2.888221263885498, "learning_rate": 4.58653213790981e-09, "logits/chosen": -1.4987213611602783, "logits/rejected": -1.4801981449127197, "logps/chosen": -47.3719596862793, "logps/rejected": -51.944847106933594, "loss": 0.684, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0016913721337914467, "rewards/margins": 0.019097764045000076, "rewards/rejected": -0.020789138972759247, "step": 3430 }, { "epoch": 2.478386167146974, "grad_norm": 3.370331287384033, "learning_rate": 4.466246766402773e-09, "logits/chosen": -1.4718296527862549, "logits/rejected": -1.4525142908096313, "logps/chosen": -48.6312141418457, "logps/rejected": -52.31147384643555, "loss": 0.6821, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0008567210170440376, "rewards/margins": 0.022917402908205986, "rewards/rejected": -0.023774122819304466, "step": 3440 }, { "epoch": 2.4855907780979827, "grad_norm": 3.688567638397217, "learning_rate": 4.347404832959775e-09, "logits/chosen": -1.5265666246414185, "logits/rejected": -1.515133261680603, "logps/chosen": -44.58491516113281, "logps/rejected": -48.72705841064453, "loss": 0.6843, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.00400127936154604, "rewards/margins": 0.018310952931642532, "rewards/rejected": -0.022312233224511147, "step": 3450 }, { "epoch": 2.4927953890489913, "grad_norm": 3.47495436668396, "learning_rate": 4.230014691678016e-09, "logits/chosen": -1.4787131547927856, "logits/rejected": -1.4799821376800537, "logps/chosen": -49.35828399658203, "logps/rejected": -51.017032623291016, "loss": 0.6868, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.005290646571666002, "rewards/margins": 0.013366172090172768, "rewards/rejected": -0.018656820058822632, "step": 3460 }, { "epoch": 2.5, "grad_norm": 3.119842529296875, "learning_rate": 4.114084594599707e-09, "logits/chosen": -1.4643129110336304, "logits/rejected": -1.4411898851394653, "logps/chosen": -45.54496383666992, "logps/rejected": -51.423011779785156, "loss": 0.6825, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0023778839968144894, "rewards/margins": 0.022280005738139153, "rewards/rejected": -0.02465788833796978, "step": 3470 }, { "epoch": 2.5072046109510087, "grad_norm": 3.103360176086426, "learning_rate": 3.9996226911319546e-09, "logits/chosen": -1.481475830078125, "logits/rejected": -1.4590504169464111, "logps/chosen": -45.547027587890625, "logps/rejected": -48.61585235595703, "loss": 0.684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.002360361162573099, "rewards/margins": 0.01899893209338188, "rewards/rejected": -0.021359292790293694, "step": 3480 }, { "epoch": 2.5144092219020173, "grad_norm": 3.3628768920898438, "learning_rate": 3.886637027473949e-09, "logits/chosen": -1.5132756233215332, "logits/rejected": -1.5091888904571533, "logps/chosen": -47.463836669921875, "logps/rejected": -51.489356994628906, "loss": 0.6839, "rewards/accuracies": 0.65625, "rewards/chosen": -0.004199695773422718, "rewards/margins": 0.018925204873085022, "rewards/rejected": -0.023124899715185165, "step": 3490 }, { "epoch": 2.521613832853026, "grad_norm": 3.1813926696777344, "learning_rate": 3.775135546051295e-09, "logits/chosen": -1.4061282873153687, "logits/rejected": -1.4064362049102783, "logps/chosen": -45.96582794189453, "logps/rejected": -50.36768341064453, "loss": 0.6823, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0037177815102040768, "rewards/margins": 0.022428225725889206, "rewards/rejected": -0.02614600956439972, "step": 3500 }, { "epoch": 2.5288184438040346, "grad_norm": 3.360513687133789, "learning_rate": 3.665126084957723e-09, "logits/chosen": -1.4706053733825684, "logits/rejected": -1.4622557163238525, "logps/chosen": -50.74425506591797, "logps/rejected": -51.06829071044922, "loss": 0.6845, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004353958647698164, "rewards/margins": 0.018070969730615616, "rewards/rejected": -0.022424932569265366, "step": 3510 }, { "epoch": 2.5360230547550433, "grad_norm": 3.123054265975952, "learning_rate": 3.556616377404101e-09, "logits/chosen": -1.5037996768951416, "logits/rejected": -1.4918115139007568, "logps/chosen": -51.80335235595703, "logps/rejected": -55.71653366088867, "loss": 0.6822, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.006135745905339718, "rewards/margins": 0.022644102573394775, "rewards/rejected": -0.028779853135347366, "step": 3520 }, { "epoch": 2.543227665706052, "grad_norm": 3.3846046924591064, "learning_rate": 3.4496140511748125e-09, "logits/chosen": -1.4872541427612305, "logits/rejected": -1.468563437461853, "logps/chosen": -48.045196533203125, "logps/rejected": -50.846946716308594, "loss": 0.6838, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006296695210039616, "rewards/margins": 0.019439449533820152, "rewards/rejected": -0.025736143812537193, "step": 3530 }, { "epoch": 2.5504322766570606, "grad_norm": 3.930396318435669, "learning_rate": 3.3441266280915427e-09, "logits/chosen": -1.4504344463348389, "logits/rejected": -1.447840929031372, "logps/chosen": -53.62995147705078, "logps/rejected": -57.090415954589844, "loss": 0.6855, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.001316823298111558, "rewards/margins": 0.01597544364631176, "rewards/rejected": -0.017292268574237823, "step": 3540 }, { "epoch": 2.5576368876080693, "grad_norm": 3.4803848266601562, "learning_rate": 3.2401615234845693e-09, "logits/chosen": -1.4945684671401978, "logits/rejected": -1.4767378568649292, "logps/chosen": -53.949989318847656, "logps/rejected": -57.206214904785156, "loss": 0.6823, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.006395784206688404, "rewards/margins": 0.022547531872987747, "rewards/rejected": -0.028943315148353577, "step": 3550 }, { "epoch": 2.564841498559078, "grad_norm": 3.0999386310577393, "learning_rate": 3.1377260456714375e-09, "logits/chosen": -1.3246692419052124, "logits/rejected": -1.3125396966934204, "logps/chosen": -48.97364807128906, "logps/rejected": -53.9735221862793, "loss": 0.6838, "rewards/accuracies": 0.625, "rewards/chosen": -0.006135467439889908, "rewards/margins": 0.019646648317575455, "rewards/rejected": -0.025782117620110512, "step": 3560 }, { "epoch": 2.5720461095100866, "grad_norm": 3.6545066833496094, "learning_rate": 3.0368273954432698e-09, "logits/chosen": -1.5312678813934326, "logits/rejected": -1.5034937858581543, "logps/chosen": -51.01383590698242, "logps/rejected": -53.1749153137207, "loss": 0.6848, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.005504480563104153, "rewards/margins": 0.017458677291870117, "rewards/rejected": -0.022963156923651695, "step": 3570 }, { "epoch": 2.5792507204610953, "grad_norm": 2.9861600399017334, "learning_rate": 2.937472665558541e-09, "logits/chosen": -1.5554245710372925, "logits/rejected": -1.548001766204834, "logps/chosen": -45.340965270996094, "logps/rejected": -47.545833587646484, "loss": 0.6821, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.007226808462291956, "rewards/margins": 0.02288452349603176, "rewards/rejected": -0.030111337080597878, "step": 3580 }, { "epoch": 2.586455331412104, "grad_norm": 4.0210347175598145, "learning_rate": 2.8396688402445053e-09, "logits/chosen": -1.5757755041122437, "logits/rejected": -1.5586358308792114, "logps/chosen": -45.34209060668945, "logps/rejected": -51.52289581298828, "loss": 0.6819, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.009358217939734459, "rewards/margins": 0.023509806022047997, "rewards/rejected": -0.032868027687072754, "step": 3590 }, { "epoch": 2.5936599423631126, "grad_norm": 4.072700500488281, "learning_rate": 2.7434227947062324e-09, "logits/chosen": -1.5280077457427979, "logits/rejected": -1.5168155431747437, "logps/chosen": -53.77875518798828, "logps/rejected": -57.271522521972656, "loss": 0.6859, "rewards/accuracies": 0.59375, "rewards/chosen": -0.004612335003912449, "rewards/margins": 0.015178831294178963, "rewards/rejected": -0.019791167229413986, "step": 3600 }, { "epoch": 2.6008645533141213, "grad_norm": 3.0110554695129395, "learning_rate": 2.6487412946432976e-09, "logits/chosen": -1.4473832845687866, "logits/rejected": -1.4343912601470947, "logps/chosen": -49.441104888916016, "logps/rejected": -52.14855194091797, "loss": 0.6827, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.011908985674381256, "rewards/margins": 0.02195003628730774, "rewards/rejected": -0.033859021961688995, "step": 3610 }, { "epoch": 2.60806916426513, "grad_norm": 3.392671823501587, "learning_rate": 2.5556309957742024e-09, "logits/chosen": -1.445562481880188, "logits/rejected": -1.4361542463302612, "logps/chosen": -44.897274017333984, "logps/rejected": -51.98448944091797, "loss": 0.6809, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.002028597518801689, "rewards/margins": 0.025439077988266945, "rewards/rejected": -0.023410480469465256, "step": 3620 }, { "epoch": 2.6152737752161386, "grad_norm": 3.324449300765991, "learning_rate": 2.4640984433684758e-09, "logits/chosen": -1.559381365776062, "logits/rejected": -1.5451711416244507, "logps/chosen": -50.890586853027344, "logps/rejected": -53.00829315185547, "loss": 0.6841, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0038901835214346647, "rewards/margins": 0.01893722079694271, "rewards/rejected": -0.022827405482530594, "step": 3630 }, { "epoch": 2.6224783861671472, "grad_norm": 3.6242074966430664, "learning_rate": 2.3741500717865987e-09, "logits/chosen": -1.4463894367218018, "logits/rejected": -1.45791757106781, "logps/chosen": -47.297630310058594, "logps/rejected": -52.06903839111328, "loss": 0.6844, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0010312128579244018, "rewards/margins": 0.018220648169517517, "rewards/rejected": -0.019251862540841103, "step": 3640 }, { "epoch": 2.629682997118156, "grad_norm": 3.100571632385254, "learning_rate": 2.285792204027678e-09, "logits/chosen": -1.422215223312378, "logits/rejected": -1.4117518663406372, "logps/chosen": -47.42947006225586, "logps/rejected": -54.567771911621094, "loss": 0.6826, "rewards/accuracies": 0.625, "rewards/chosen": -0.004024113062769175, "rewards/margins": 0.021751539781689644, "rewards/rejected": -0.02577565237879753, "step": 3650 }, { "epoch": 2.636887608069164, "grad_norm": 3.6712942123413086, "learning_rate": 2.199031051284972e-09, "logits/chosen": -1.5009148120880127, "logits/rejected": -1.500221610069275, "logps/chosen": -48.28940963745117, "logps/rejected": -52.26566696166992, "loss": 0.684, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.00409685168415308, "rewards/margins": 0.01923191547393799, "rewards/rejected": -0.023328769952058792, "step": 3660 }, { "epoch": 2.6440922190201728, "grad_norm": 3.7881343364715576, "learning_rate": 2.113872712509254e-09, "logits/chosen": -1.408132791519165, "logits/rejected": -1.3988707065582275, "logps/chosen": -56.1018180847168, "logps/rejected": -59.201927185058594, "loss": 0.6834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00847399141639471, "rewards/margins": 0.02007477544248104, "rewards/rejected": -0.028548765927553177, "step": 3670 }, { "epoch": 2.6512968299711814, "grad_norm": 3.475985050201416, "learning_rate": 2.0303231739801143e-09, "logits/chosen": -1.411527156829834, "logits/rejected": -1.398315668106079, "logps/chosen": -50.648380279541016, "logps/rejected": -54.86267852783203, "loss": 0.6846, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.008079654537141323, "rewards/margins": 0.01782270334661007, "rewards/rejected": -0.025902356952428818, "step": 3680 }, { "epoch": 2.65850144092219, "grad_norm": 3.8976125717163086, "learning_rate": 1.948388308885102e-09, "logits/chosen": -1.5758289098739624, "logits/rejected": -1.5607655048370361, "logps/chosen": -50.037288665771484, "logps/rejected": -52.93767547607422, "loss": 0.6852, "rewards/accuracies": 0.625, "rewards/chosen": -0.0028561349026858807, "rewards/margins": 0.01672014407813549, "rewards/rejected": -0.01957627758383751, "step": 3690 }, { "epoch": 2.6657060518731988, "grad_norm": 3.1043570041656494, "learning_rate": 1.86807387690692e-09, "logits/chosen": -1.5538166761398315, "logits/rejected": -1.546657919883728, "logps/chosen": -50.15034866333008, "logps/rejected": -57.51653289794922, "loss": 0.6792, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0009024621685966849, "rewards/margins": 0.029041822999715805, "rewards/rejected": -0.029944289475679398, "step": 3700 }, { "epoch": 2.6729106628242074, "grad_norm": 3.4918878078460693, "learning_rate": 1.789385523818493e-09, "logits/chosen": -1.477716326713562, "logits/rejected": -1.4800388813018799, "logps/chosen": -45.15003204345703, "logps/rejected": -51.051692962646484, "loss": 0.6825, "rewards/accuracies": 0.65625, "rewards/chosen": -0.002582186833024025, "rewards/margins": 0.021981868892908096, "rewards/rejected": -0.02456405758857727, "step": 3710 }, { "epoch": 2.680115273775216, "grad_norm": 3.5743157863616943, "learning_rate": 1.712328781086131e-09, "logits/chosen": -1.5494495630264282, "logits/rejected": -1.5334922075271606, "logps/chosen": -50.91818618774414, "logps/rejected": -53.10481643676758, "loss": 0.6865, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.006211831234395504, "rewards/margins": 0.013929562643170357, "rewards/rejected": -0.020141394808888435, "step": 3720 }, { "epoch": 2.6873198847262247, "grad_norm": 3.3937318325042725, "learning_rate": 1.6369090654806543e-09, "logits/chosen": -1.5740840435028076, "logits/rejected": -1.5617650747299194, "logps/chosen": -46.77784729003906, "logps/rejected": -51.572731018066406, "loss": 0.6847, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.007398572750389576, "rewards/margins": 0.017527595162391663, "rewards/rejected": -0.024926166981458664, "step": 3730 }, { "epoch": 2.6945244956772334, "grad_norm": 3.194082498550415, "learning_rate": 1.5631316786966498e-09, "logits/chosen": -1.4844461679458618, "logits/rejected": -1.4684514999389648, "logps/chosen": -45.005489349365234, "logps/rejected": -48.377532958984375, "loss": 0.6851, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.005601891782134771, "rewards/margins": 0.016944076865911484, "rewards/rejected": -0.022545967251062393, "step": 3740 }, { "epoch": 2.701729106628242, "grad_norm": 4.1264753341674805, "learning_rate": 1.491001806979772e-09, "logits/chosen": -1.5148539543151855, "logits/rejected": -1.5002485513687134, "logps/chosen": -50.1202507019043, "logps/rejected": -54.22260665893555, "loss": 0.6842, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0015927895437926054, "rewards/margins": 0.018657799810171127, "rewards/rejected": -0.0202505923807621, "step": 3750 }, { "epoch": 2.7089337175792507, "grad_norm": 3.740504264831543, "learning_rate": 1.4205245207621508e-09, "logits/chosen": -1.4369869232177734, "logits/rejected": -1.421118974685669, "logps/chosen": -52.8070068359375, "logps/rejected": -55.55951690673828, "loss": 0.6823, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0012057258281856775, "rewards/margins": 0.022528300061821938, "rewards/rejected": -0.023734021931886673, "step": 3760 }, { "epoch": 2.7161383285302594, "grad_norm": 3.869483232498169, "learning_rate": 1.3517047743059978e-09, "logits/chosen": -1.520595908164978, "logits/rejected": -1.5225101709365845, "logps/chosen": -49.41727066040039, "logps/rejected": -55.34285354614258, "loss": 0.6838, "rewards/accuracies": 0.6875, "rewards/chosen": -0.004268472082912922, "rewards/margins": 0.019223999232053757, "rewards/rejected": -0.023492468520998955, "step": 3770 }, { "epoch": 2.723342939481268, "grad_norm": 3.2750725746154785, "learning_rate": 1.2845474053553156e-09, "logits/chosen": -1.5188109874725342, "logits/rejected": -1.5104305744171143, "logps/chosen": -43.41191101074219, "logps/rejected": -46.98235321044922, "loss": 0.6851, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.006257681641727686, "rewards/margins": 0.01682373322546482, "rewards/rejected": -0.02308141626417637, "step": 3780 }, { "epoch": 2.7305475504322767, "grad_norm": 2.8072025775909424, "learning_rate": 1.2190571347958422e-09, "logits/chosen": -1.5447680950164795, "logits/rejected": -1.5476669073104858, "logps/chosen": -43.22075653076172, "logps/rejected": -49.99506759643555, "loss": 0.6846, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0003701849782373756, "rewards/margins": 0.01762227527797222, "rewards/rejected": -0.017252091318368912, "step": 3790 }, { "epoch": 2.7377521613832854, "grad_norm": 2.919499635696411, "learning_rate": 1.1552385663231634e-09, "logits/chosen": -1.4809738397598267, "logits/rejected": -1.4594279527664185, "logps/chosen": -48.0673942565918, "logps/rejected": -49.96904373168945, "loss": 0.6853, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004101811908185482, "rewards/margins": 0.016451913863420486, "rewards/rejected": -0.020553726702928543, "step": 3800 }, { "epoch": 2.744956772334294, "grad_norm": 3.0895450115203857, "learning_rate": 1.0930961861191302e-09, "logits/chosen": -1.4430171251296997, "logits/rejected": -1.44236159324646, "logps/chosen": -46.34189224243164, "logps/rejected": -49.78446578979492, "loss": 0.6867, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.006359028164297342, "rewards/margins": 0.01367940753698349, "rewards/rejected": -0.020038435235619545, "step": 3810 }, { "epoch": 2.7521613832853027, "grad_norm": 3.008639335632324, "learning_rate": 1.0326343625364608e-09, "logits/chosen": -1.4366722106933594, "logits/rejected": -1.4210902452468872, "logps/chosen": -46.98648452758789, "logps/rejected": -52.36387252807617, "loss": 0.6811, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004181761294603348, "rewards/margins": 0.02516576275229454, "rewards/rejected": -0.029347527772188187, "step": 3820 }, { "epoch": 2.7593659942363113, "grad_norm": 2.6366994380950928, "learning_rate": 9.738573457917066e-10, "logits/chosen": -1.5501086711883545, "logits/rejected": -1.544032335281372, "logps/chosen": -41.07685089111328, "logps/rejected": -47.10578918457031, "loss": 0.6828, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.004288278520107269, "rewards/margins": 0.02141541615128517, "rewards/rejected": -0.02570369280874729, "step": 3830 }, { "epoch": 2.76657060518732, "grad_norm": 2.9577691555023193, "learning_rate": 9.16769267666434e-10, "logits/chosen": -1.4689620733261108, "logits/rejected": -1.4628558158874512, "logps/chosen": -46.15149688720703, "logps/rejected": -48.130184173583984, "loss": 0.6889, "rewards/accuracies": 0.5, "rewards/chosen": -0.00523525383323431, "rewards/margins": 0.00903787650167942, "rewards/rejected": -0.01427313219755888, "step": 3840 }, { "epoch": 2.7737752161383287, "grad_norm": 3.2562596797943115, "learning_rate": 8.613741412168113e-10, "logits/chosen": -1.4877763986587524, "logits/rejected": -1.4824540615081787, "logps/chosen": -54.22200393676758, "logps/rejected": -58.35784149169922, "loss": 0.6834, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0029452387243509293, "rewards/margins": 0.020176690071821213, "rewards/rejected": -0.02312193065881729, "step": 3850 }, { "epoch": 2.7809798270893373, "grad_norm": 3.3105201721191406, "learning_rate": 8.076758604914802e-10, "logits/chosen": -1.4479657411575317, "logits/rejected": -1.4352147579193115, "logps/chosen": -43.090030670166016, "logps/rejected": -46.575626373291016, "loss": 0.6847, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0015796951483935118, "rewards/margins": 0.01767609640955925, "rewards/rejected": -0.01925579458475113, "step": 3860 }, { "epoch": 2.7881844380403455, "grad_norm": 4.648325443267822, "learning_rate": 7.55678200257856e-10, "logits/chosen": -1.4449894428253174, "logits/rejected": -1.4322887659072876, "logps/chosen": -49.959930419921875, "logps/rejected": -55.334449768066406, "loss": 0.683, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.005742833949625492, "rewards/margins": 0.021091628819704056, "rewards/rejected": -0.026834463700652122, "step": 3870 }, { "epoch": 2.795389048991354, "grad_norm": 3.2336835861206055, "learning_rate": 7.053848157367315e-10, "logits/chosen": -1.467712640762329, "logits/rejected": -1.4539921283721924, "logps/chosen": -48.0994987487793, "logps/rejected": -53.06573486328125, "loss": 0.683, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0008336772443726659, "rewards/margins": 0.020950373262166977, "rewards/rejected": -0.02178405039012432, "step": 3880 }, { "epoch": 2.802593659942363, "grad_norm": 2.563777208328247, "learning_rate": 6.567992423453794e-10, "logits/chosen": -1.4956997632980347, "logits/rejected": -1.4893451929092407, "logps/chosen": -43.29277038574219, "logps/rejected": -46.525360107421875, "loss": 0.6841, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00350777106359601, "rewards/margins": 0.018815908581018448, "rewards/rejected": -0.02232367917895317, "step": 3890 }, { "epoch": 2.8097982708933715, "grad_norm": 3.1397712230682373, "learning_rate": 6.099248954489794e-10, "logits/chosen": -1.4105771780014038, "logits/rejected": -1.4090924263000488, "logps/chosen": -47.864784240722656, "logps/rejected": -53.00626754760742, "loss": 0.6837, "rewards/accuracies": 0.65625, "rewards/chosen": -0.006424476858228445, "rewards/margins": 0.019479013979434967, "rewards/rejected": -0.02590348944067955, "step": 3900 }, { "epoch": 2.81700288184438, "grad_norm": 3.630045175552368, "learning_rate": 5.647650701205653e-10, "logits/chosen": -1.5031368732452393, "logits/rejected": -1.4833770990371704, "logps/chosen": -54.33906173706055, "logps/rejected": -58.169456481933594, "loss": 0.6809, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0006442746380344033, "rewards/margins": 0.025460779666900635, "rewards/rejected": -0.02481650747358799, "step": 3910 }, { "epoch": 2.824207492795389, "grad_norm": 3.1222622394561768, "learning_rate": 5.213229409093856e-10, "logits/chosen": -1.5364577770233154, "logits/rejected": -1.5258610248565674, "logps/chosen": -52.6276741027832, "logps/rejected": -57.63709259033203, "loss": 0.6819, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0037466485518962145, "rewards/margins": 0.023729201406240463, "rewards/rejected": -0.027475852519273758, "step": 3920 }, { "epoch": 2.8314121037463975, "grad_norm": 4.3289971351623535, "learning_rate": 4.796015616177401e-10, "logits/chosen": -1.4594485759735107, "logits/rejected": -1.447249412536621, "logps/chosen": -51.74785614013672, "logps/rejected": -55.523780822753906, "loss": 0.6856, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.006563100032508373, "rewards/margins": 0.015793252736330032, "rewards/rejected": -0.02235635183751583, "step": 3930 }, { "epoch": 2.838616714697406, "grad_norm": 3.312995672225952, "learning_rate": 4.3960386508631595e-10, "logits/chosen": -1.3865981101989746, "logits/rejected": -1.386038064956665, "logps/chosen": -42.57002639770508, "logps/rejected": -46.51558303833008, "loss": 0.6854, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.005880543030798435, "rewards/margins": 0.016446929425001144, "rewards/rejected": -0.022327473387122154, "step": 3940 }, { "epoch": 2.845821325648415, "grad_norm": 4.776674747467041, "learning_rate": 4.013326629880243e-10, "logits/chosen": -1.4313756227493286, "logits/rejected": -1.4140355587005615, "logps/chosen": -50.08747863769531, "logps/rejected": -53.8878059387207, "loss": 0.6828, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0068587446585297585, "rewards/margins": 0.021546222269535065, "rewards/rejected": -0.02840496599674225, "step": 3950 }, { "epoch": 2.8530259365994235, "grad_norm": 3.445603847503662, "learning_rate": 3.64790645630339e-10, "logits/chosen": -1.3930222988128662, "logits/rejected": -1.3879890441894531, "logps/chosen": -53.27265548706055, "logps/rejected": -55.5593376159668, "loss": 0.6875, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.001219206373207271, "rewards/margins": 0.011788489297032356, "rewards/rejected": -0.01300769578665495, "step": 3960 }, { "epoch": 2.860230547550432, "grad_norm": 4.921634674072266, "learning_rate": 3.2998038176619e-10, "logits/chosen": -1.4541983604431152, "logits/rejected": -1.438096284866333, "logps/chosen": -51.39448928833008, "logps/rejected": -54.80561065673828, "loss": 0.6853, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006159276235848665, "rewards/margins": 0.016386728733778, "rewards/rejected": -0.022546004503965378, "step": 3970 }, { "epoch": 2.867435158501441, "grad_norm": 3.477383613586426, "learning_rate": 2.969043184133907e-10, "logits/chosen": -1.5591028928756714, "logits/rejected": -1.5578175783157349, "logps/chosen": -44.919044494628906, "logps/rejected": -53.242408752441406, "loss": 0.6819, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0005782361840829253, "rewards/margins": 0.023209361359477043, "rewards/rejected": -0.022631125524640083, "step": 3980 }, { "epoch": 2.8746397694524495, "grad_norm": 3.8745675086975098, "learning_rate": 2.6556478068261447e-10, "logits/chosen": -1.450866460800171, "logits/rejected": -1.4369374513626099, "logps/chosen": -44.412200927734375, "logps/rejected": -47.863582611083984, "loss": 0.6799, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0014529896434396505, "rewards/margins": 0.02753259241580963, "rewards/rejected": -0.026079604402184486, "step": 3990 }, { "epoch": 2.881844380403458, "grad_norm": 3.4836292266845703, "learning_rate": 2.3596397161395607e-10, "logits/chosen": -1.560675024986267, "logits/rejected": -1.5387744903564453, "logps/chosen": -49.524169921875, "logps/rejected": -54.56645584106445, "loss": 0.6813, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0010647645685821772, "rewards/margins": 0.024541422724723816, "rewards/rejected": -0.023476656526327133, "step": 4000 }, { "epoch": 2.889048991354467, "grad_norm": 4.788309097290039, "learning_rate": 2.0810397202206399e-10, "logits/chosen": -1.4160195589065552, "logits/rejected": -1.4113706350326538, "logps/chosen": -49.815757751464844, "logps/rejected": -53.184669494628906, "loss": 0.6849, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.001441007712855935, "rewards/margins": 0.017177799716591835, "rewards/rejected": -0.015736790373921394, "step": 4010 }, { "epoch": 2.8962536023054755, "grad_norm": 3.2461211681365967, "learning_rate": 1.819867403498737e-10, "logits/chosen": -1.5659189224243164, "logits/rejected": -1.5567631721496582, "logps/chosen": -47.738792419433594, "logps/rejected": -51.622718811035156, "loss": 0.6837, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0069535574875772, "rewards/margins": 0.019805509597063065, "rewards/rejected": -0.026759067550301552, "step": 4020 }, { "epoch": 2.903458213256484, "grad_norm": 3.430551052093506, "learning_rate": 1.5761411253092382e-10, "logits/chosen": -1.431986689567566, "logits/rejected": -1.410321593284607, "logps/chosen": -45.951541900634766, "logps/rejected": -48.03018569946289, "loss": 0.6846, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.005667565856128931, "rewards/margins": 0.017614690586924553, "rewards/rejected": -0.023282255977392197, "step": 4030 }, { "epoch": 2.910662824207493, "grad_norm": 3.561012029647827, "learning_rate": 1.3498780186031455e-10, "logits/chosen": -1.4953620433807373, "logits/rejected": -1.4863777160644531, "logps/chosen": -53.52448272705078, "logps/rejected": -57.13452911376953, "loss": 0.6849, "rewards/accuracies": 0.59375, "rewards/chosen": -0.005448926705867052, "rewards/margins": 0.017112337052822113, "rewards/rejected": -0.022561263293027878, "step": 4040 }, { "epoch": 2.9178674351585014, "grad_norm": 3.2866272926330566, "learning_rate": 1.1410939887425141e-10, "logits/chosen": -1.4994781017303467, "logits/rejected": -1.4914522171020508, "logps/chosen": -47.01377487182617, "logps/rejected": -49.50941848754883, "loss": 0.6862, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.009291494265198708, "rewards/margins": 0.01437977235764265, "rewards/rejected": -0.023671265691518784, "step": 4050 }, { "epoch": 2.92507204610951, "grad_norm": 2.916045665740967, "learning_rate": 9.498037123825686e-11, "logits/chosen": -1.5113145112991333, "logits/rejected": -1.5003492832183838, "logps/chosen": -45.075767517089844, "logps/rejected": -49.30120086669922, "loss": 0.6835, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.00341349421069026, "rewards/margins": 0.019889434799551964, "rewards/rejected": -0.02330292947590351, "step": 4060 }, { "epoch": 2.9322766570605188, "grad_norm": 3.252375841140747, "learning_rate": 7.760206364398614e-11, "logits/chosen": -1.5876257419586182, "logits/rejected": -1.566359281539917, "logps/chosen": -49.85393524169922, "logps/rejected": -53.048728942871094, "loss": 0.684, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.008436409756541252, "rewards/margins": 0.019341573119163513, "rewards/rejected": -0.027777981013059616, "step": 4070 }, { "epoch": 2.9394812680115274, "grad_norm": 3.834636688232422, "learning_rate": 6.19756977147029e-11, "logits/chosen": -1.443345308303833, "logits/rejected": -1.4344885349273682, "logps/chosen": -47.13701629638672, "logps/rejected": -54.29521942138672, "loss": 0.6833, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.00852961651980877, "rewards/margins": 0.02065226063132286, "rewards/rejected": -0.02918187901377678, "step": 4080 }, { "epoch": 2.946685878962536, "grad_norm": 2.8011093139648438, "learning_rate": 4.810237191940625e-11, "logits/chosen": -1.4434500932693481, "logits/rejected": -1.4342488050460815, "logps/chosen": -46.8648567199707, "logps/rejected": -49.80231857299805, "loss": 0.6859, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.007858057506382465, "rewards/margins": 0.01531834714114666, "rewards/rejected": -0.0231764055788517, "step": 4090 }, { "epoch": 2.9538904899135447, "grad_norm": 3.364262580871582, "learning_rate": 3.5983061495617476e-11, "logits/chosen": -1.5273181200027466, "logits/rejected": -1.5262796878814697, "logps/chosen": -51.708335876464844, "logps/rejected": -57.30168914794922, "loss": 0.6843, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0039407601580023766, "rewards/margins": 0.018400147557258606, "rewards/rejected": -0.022340910509228706, "step": 4100 }, { "epoch": 2.9610951008645534, "grad_norm": 3.0939691066741943, "learning_rate": 2.5618618380812694e-11, "logits/chosen": -1.52158784866333, "logits/rejected": -1.5068227052688599, "logps/chosen": -42.00890350341797, "logps/rejected": -47.3350830078125, "loss": 0.6815, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0033108622301369905, "rewards/margins": 0.024085834622383118, "rewards/rejected": -0.02739669755101204, "step": 4110 }, { "epoch": 2.968299711815562, "grad_norm": 3.384092092514038, "learning_rate": 1.700977115254576e-11, "logits/chosen": -1.4649953842163086, "logits/rejected": -1.4553617238998413, "logps/chosen": -46.161407470703125, "logps/rejected": -51.3560905456543, "loss": 0.6834, "rewards/accuracies": 0.625, "rewards/chosen": -0.00658357422798872, "rewards/margins": 0.020199140533804893, "rewards/rejected": -0.02678271196782589, "step": 4120 }, { "epoch": 2.9755043227665707, "grad_norm": 2.9335947036743164, "learning_rate": 1.0157124977230868e-11, "logits/chosen": -1.4357291460037231, "logits/rejected": -1.4263992309570312, "logps/chosen": -43.5537109375, "logps/rejected": -47.68435287475586, "loss": 0.6847, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0015822809655219316, "rewards/margins": 0.01767374388873577, "rewards/rejected": -0.019256027415394783, "step": 4130 }, { "epoch": 2.9827089337175794, "grad_norm": 3.5399720668792725, "learning_rate": 5.061161567596061e-12, "logits/chosen": -1.4696056842803955, "logits/rejected": -1.4569720029830933, "logps/chosen": -47.70643997192383, "logps/rejected": -50.280517578125, "loss": 0.6846, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.00033920054556801915, "rewards/margins": 0.017664710059762, "rewards/rejected": -0.018003910779953003, "step": 4140 }, { "epoch": 2.989913544668588, "grad_norm": 3.3628714084625244, "learning_rate": 1.7222391488297406e-12, "logits/chosen": -1.5176090002059937, "logits/rejected": -1.5059229135513306, "logps/chosen": -53.56816864013672, "logps/rejected": -58.52666091918945, "loss": 0.6797, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0030871466733515263, "rewards/margins": 0.028029698878526688, "rewards/rejected": -0.031116846948862076, "step": 4150 }, { "epoch": 2.9971181556195967, "grad_norm": 3.9223008155822754, "learning_rate": 1.4059243338693238e-13, "logits/chosen": -1.4434741735458374, "logits/rejected": -1.4326165914535522, "logps/chosen": -48.58505630493164, "logps/rejected": -53.30755615234375, "loss": 0.6829, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0009518606821075082, "rewards/margins": 0.021236242726445198, "rewards/rejected": -0.022188100963830948, "step": 4160 }, { "epoch": 3.0, "step": 4164, "total_flos": 0.0, "train_loss": 0.6881600442010548, "train_runtime": 6925.9603, "train_samples_per_second": 9.617, "train_steps_per_second": 0.601 } ], "logging_steps": 10, "max_steps": 4164, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }