{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2000, "global_step": 4168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002399232245681382, "grad_norm": 3.5056792836862627, "learning_rate": 1.199040767386091e-09, "logits/chosen": -1.4883875846862793, "logits/rejected": -1.416823148727417, "logps/chosen": -161.24717712402344, "logps/rejected": -175.51541137695312, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0023992322456813818, "grad_norm": 3.8174848446929266, "learning_rate": 1.199040767386091e-08, "logits/chosen": -1.6635231971740723, "logits/rejected": -1.6545089483261108, "logps/chosen": -398.12603759765625, "logps/rejected": -322.4006652832031, "loss": 0.6932, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.00020433742611203343, "rewards/margins": 0.000947743421420455, "rewards/rejected": -0.000743405893445015, "step": 10 }, { "epoch": 0.0047984644913627635, "grad_norm": 3.863373696596945, "learning_rate": 2.398081534772182e-08, "logits/chosen": -1.6468368768692017, "logits/rejected": -1.6784213781356812, "logps/chosen": -268.8175354003906, "logps/rejected": -237.06240844726562, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": 0.0003671760787256062, "rewards/margins": 0.00017412376473657787, "rewards/rejected": 0.00019305227033328265, "step": 20 }, { "epoch": 0.007197696737044146, "grad_norm": 3.7081110480933135, "learning_rate": 3.597122302158273e-08, "logits/chosen": -1.5468581914901733, "logits/rejected": -1.542797327041626, "logps/chosen": -266.8534240722656, "logps/rejected": -267.03790283203125, "loss": 0.6934, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0006165923550724983, "rewards/margins": 0.0003948546072933823, "rewards/rejected": 0.00022173782053869218, "step": 30 }, { "epoch": 0.009596928982725527, "grad_norm": 3.5686633612597114, "learning_rate": 4.796163069544364e-08, "logits/chosen": -1.6888965368270874, "logits/rejected": -1.651166319847107, "logps/chosen": -269.12921142578125, "logps/rejected": -259.73663330078125, "loss": 0.6925, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.001302229822613299, "rewards/margins": 0.001623004674911499, "rewards/rejected": -0.00032077505602501333, "step": 40 }, { "epoch": 0.01199616122840691, "grad_norm": 3.8364678475966993, "learning_rate": 5.995203836930455e-08, "logits/chosen": -1.7128832340240479, "logits/rejected": -1.6672801971435547, "logps/chosen": -294.93475341796875, "logps/rejected": -250.9867401123047, "loss": 0.6932, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0001523580722277984, "rewards/margins": -0.000665490108076483, "rewards/rejected": 0.0008178481948561966, "step": 50 }, { "epoch": 0.014395393474088292, "grad_norm": 3.908172449607729, "learning_rate": 7.194244604316546e-08, "logits/chosen": -1.5987389087677002, "logits/rejected": -1.609201192855835, "logps/chosen": -311.1792907714844, "logps/rejected": -277.0119323730469, "loss": 0.6935, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0004900293424725533, "rewards/margins": -1.3016722732572816e-05, "rewards/rejected": 0.0005030458560213447, "step": 60 }, { "epoch": 0.016794625719769675, "grad_norm": 3.4877417634951255, "learning_rate": 8.393285371702638e-08, "logits/chosen": -1.6401567459106445, "logits/rejected": -1.6593749523162842, "logps/chosen": -300.8813171386719, "logps/rejected": -285.99005126953125, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007752370438538492, "rewards/margins": 0.0012944363988935947, "rewards/rejected": -0.0005191992968320847, "step": 70 }, { "epoch": 0.019193857965451054, "grad_norm": 3.8477961171736634, "learning_rate": 9.592326139088728e-08, "logits/chosen": -1.5876344442367554, "logits/rejected": -1.619933843612671, "logps/chosen": -222.11495971679688, "logps/rejected": -259.1878967285156, "loss": 0.6932, "rewards/accuracies": 0.375, "rewards/chosen": 0.0002348280104342848, "rewards/margins": -0.0016285456949844956, "rewards/rejected": 0.00186337367631495, "step": 80 }, { "epoch": 0.021593090211132437, "grad_norm": 3.658568799391329, "learning_rate": 1.0791366906474819e-07, "logits/chosen": -1.6128196716308594, "logits/rejected": -1.612391471862793, "logps/chosen": -364.6415710449219, "logps/rejected": -313.7099914550781, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00037602329393848777, "rewards/margins": 0.0007518329657614231, "rewards/rejected": -0.00112785620149225, "step": 90 }, { "epoch": 0.02399232245681382, "grad_norm": 3.8164854638361025, "learning_rate": 1.199040767386091e-07, "logits/chosen": -1.6457388401031494, "logits/rejected": -1.6554689407348633, "logps/chosen": -280.8997802734375, "logps/rejected": -295.6603698730469, "loss": 0.6926, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.001076525659300387, "rewards/margins": -0.00022420981258619577, "rewards/rejected": -0.0008523158612661064, "step": 100 }, { "epoch": 0.026391554702495202, "grad_norm": 3.251687359051961, "learning_rate": 1.3189448441247004e-07, "logits/chosen": -1.6135714054107666, "logits/rejected": -1.6114752292633057, "logps/chosen": -247.7429962158203, "logps/rejected": -244.31112670898438, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.0007790708914399147, "rewards/margins": 0.0011699094902724028, "rewards/rejected": -0.0019489802652969956, "step": 110 }, { "epoch": 0.028790786948176585, "grad_norm": 3.6767857848694434, "learning_rate": 1.4388489208633092e-07, "logits/chosen": -1.643938660621643, "logits/rejected": -1.6287577152252197, "logps/chosen": -324.9435729980469, "logps/rejected": -294.64166259765625, "loss": 0.693, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.0025485847145318985, "rewards/margins": -0.0018857631366699934, "rewards/rejected": -0.0006628216942772269, "step": 120 }, { "epoch": 0.031190019193857964, "grad_norm": 3.378467310139603, "learning_rate": 1.5587529976019183e-07, "logits/chosen": -1.5718883275985718, "logits/rejected": -1.5992323160171509, "logps/chosen": -236.16085815429688, "logps/rejected": -323.8802795410156, "loss": 0.6924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0005537395481951535, "rewards/margins": 0.0013779096771031618, "rewards/rejected": -0.0019316490506753325, "step": 130 }, { "epoch": 0.03358925143953935, "grad_norm": 3.5565622632521476, "learning_rate": 1.6786570743405277e-07, "logits/chosen": -1.6049950122833252, "logits/rejected": -1.670636773109436, "logps/chosen": -302.82330322265625, "logps/rejected": -293.5691223144531, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": -0.0013237579260021448, "rewards/margins": 0.0033771514426916838, "rewards/rejected": -0.004700910300016403, "step": 140 }, { "epoch": 0.03598848368522073, "grad_norm": 3.7241887452710385, "learning_rate": 1.7985611510791365e-07, "logits/chosen": -1.6608047485351562, "logits/rejected": -1.6172984838485718, "logps/chosen": -247.96939086914062, "logps/rejected": -241.17111206054688, "loss": 0.6922, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.001827596453949809, "rewards/margins": 0.0031149538699537516, "rewards/rejected": -0.004942550323903561, "step": 150 }, { "epoch": 0.03838771593090211, "grad_norm": 3.7650474405725984, "learning_rate": 1.9184652278177456e-07, "logits/chosen": -1.623676061630249, "logits/rejected": -1.58232843875885, "logps/chosen": -320.1465759277344, "logps/rejected": -246.5814971923828, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": -0.0039003852289170027, "rewards/margins": 8.531531420885585e-06, "rewards/rejected": -0.003908916376531124, "step": 160 }, { "epoch": 0.040786948176583494, "grad_norm": 3.3742190114845125, "learning_rate": 2.038369304556355e-07, "logits/chosen": -1.6414167881011963, "logits/rejected": -1.6517263650894165, "logps/chosen": -365.78070068359375, "logps/rejected": -355.07684326171875, "loss": 0.6918, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.003375888569280505, "rewards/margins": 0.003419106360524893, "rewards/rejected": -0.006794995162636042, "step": 170 }, { "epoch": 0.04318618042226487, "grad_norm": 3.893923672357319, "learning_rate": 2.1582733812949638e-07, "logits/chosen": -1.6334331035614014, "logits/rejected": -1.6182676553726196, "logps/chosen": -249.95315551757812, "logps/rejected": -243.65365600585938, "loss": 0.6921, "rewards/accuracies": 0.625, "rewards/chosen": -0.003182569518685341, "rewards/margins": 0.005136819090694189, "rewards/rejected": -0.008319388143718243, "step": 180 }, { "epoch": 0.04558541266794626, "grad_norm": 4.328005991935876, "learning_rate": 2.278177458033573e-07, "logits/chosen": -1.59438157081604, "logits/rejected": -1.598433256149292, "logps/chosen": -328.0663757324219, "logps/rejected": -265.6521911621094, "loss": 0.691, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.005676161497831345, "rewards/margins": 0.00224656006321311, "rewards/rejected": -0.007922721095383167, "step": 190 }, { "epoch": 0.04798464491362764, "grad_norm": 3.4669166404466267, "learning_rate": 2.398081534772182e-07, "logits/chosen": -1.715904951095581, "logits/rejected": -1.6798690557479858, "logps/chosen": -330.5942687988281, "logps/rejected": -312.1226806640625, "loss": 0.691, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007675034459680319, "rewards/margins": 0.004218742251396179, "rewards/rejected": -0.011893777176737785, "step": 200 }, { "epoch": 0.05038387715930902, "grad_norm": 3.550601001859104, "learning_rate": 2.517985611510791e-07, "logits/chosen": -1.6995878219604492, "logits/rejected": -1.7092031240463257, "logps/chosen": -247.4270477294922, "logps/rejected": -271.40960693359375, "loss": 0.6902, "rewards/accuracies": 0.75, "rewards/chosen": -0.006873033009469509, "rewards/margins": 0.006390347145497799, "rewards/rejected": -0.013263382017612457, "step": 210 }, { "epoch": 0.052783109404990404, "grad_norm": 3.5506079722939763, "learning_rate": 2.637889688249401e-07, "logits/chosen": -1.6824872493743896, "logits/rejected": -1.692633032798767, "logps/chosen": -331.85870361328125, "logps/rejected": -327.26947021484375, "loss": 0.6902, "rewards/accuracies": 0.625, "rewards/chosen": -0.012020569294691086, "rewards/margins": 0.00552480760961771, "rewards/rejected": -0.01754537597298622, "step": 220 }, { "epoch": 0.05518234165067178, "grad_norm": 3.7749557114917605, "learning_rate": 2.7577937649880093e-07, "logits/chosen": -1.6799322366714478, "logits/rejected": -1.7364540100097656, "logps/chosen": -252.82473754882812, "logps/rejected": -288.5075378417969, "loss": 0.6887, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.011706655845046043, "rewards/margins": 0.01080317609012127, "rewards/rejected": -0.022509830072522163, "step": 230 }, { "epoch": 0.05758157389635317, "grad_norm": 4.1857193206793015, "learning_rate": 2.8776978417266184e-07, "logits/chosen": -1.6128301620483398, "logits/rejected": -1.5980497598648071, "logps/chosen": -320.16375732421875, "logps/rejected": -270.80133056640625, "loss": 0.6867, "rewards/accuracies": 0.625, "rewards/chosen": -0.010916762053966522, "rewards/margins": 0.015229749493300915, "rewards/rejected": -0.026146510615944862, "step": 240 }, { "epoch": 0.05998080614203455, "grad_norm": 3.8497859603369804, "learning_rate": 2.997601918465228e-07, "logits/chosen": -1.6171119213104248, "logits/rejected": -1.5841248035430908, "logps/chosen": -254.93936157226562, "logps/rejected": -243.93179321289062, "loss": 0.6871, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01718009077012539, "rewards/margins": 0.009010560810565948, "rewards/rejected": -0.026190653443336487, "step": 250 }, { "epoch": 0.06238003838771593, "grad_norm": 3.705368784982066, "learning_rate": 3.1175059952038366e-07, "logits/chosen": -1.6593215465545654, "logits/rejected": -1.6591377258300781, "logps/chosen": -282.84100341796875, "logps/rejected": -276.47125244140625, "loss": 0.6854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01674341782927513, "rewards/margins": 0.015696872025728226, "rewards/rejected": -0.03244028985500336, "step": 260 }, { "epoch": 0.0647792706333973, "grad_norm": 3.601297170964158, "learning_rate": 3.2374100719424457e-07, "logits/chosen": -1.536478042602539, "logits/rejected": -1.5124893188476562, "logps/chosen": -303.76104736328125, "logps/rejected": -241.261474609375, "loss": 0.684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.017451269552111626, "rewards/margins": 0.009415589272975922, "rewards/rejected": -0.026866856962442398, "step": 270 }, { "epoch": 0.0671785028790787, "grad_norm": 3.9036742193164975, "learning_rate": 3.3573141486810554e-07, "logits/chosen": -1.5547986030578613, "logits/rejected": -1.5795847177505493, "logps/chosen": -316.18121337890625, "logps/rejected": -299.7501220703125, "loss": 0.6794, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.02595636248588562, "rewards/margins": 0.026472270488739014, "rewards/rejected": -0.052428632974624634, "step": 280 }, { "epoch": 0.06957773512476008, "grad_norm": 3.3624197530231363, "learning_rate": 3.477218225419664e-07, "logits/chosen": -1.6303586959838867, "logits/rejected": -1.6242969036102295, "logps/chosen": -305.33953857421875, "logps/rejected": -274.9311218261719, "loss": 0.6799, "rewards/accuracies": 0.75, "rewards/chosen": -0.027737725526094437, "rewards/margins": 0.02691362239420414, "rewards/rejected": -0.05465134233236313, "step": 290 }, { "epoch": 0.07197696737044146, "grad_norm": 4.058047991319342, "learning_rate": 3.597122302158273e-07, "logits/chosen": -1.6332571506500244, "logits/rejected": -1.6581714153289795, "logps/chosen": -276.0576171875, "logps/rejected": -290.2926940917969, "loss": 0.6784, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04013335332274437, "rewards/margins": 0.030186835676431656, "rewards/rejected": -0.07032018899917603, "step": 300 }, { "epoch": 0.07437619961612284, "grad_norm": 3.4101110193570454, "learning_rate": 3.7170263788968827e-07, "logits/chosen": -1.6190553903579712, "logits/rejected": -1.6291663646697998, "logps/chosen": -292.63751220703125, "logps/rejected": -243.106201171875, "loss": 0.6808, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.040502794086933136, "rewards/margins": 0.03982759267091751, "rewards/rejected": -0.08033039420843124, "step": 310 }, { "epoch": 0.07677543186180422, "grad_norm": 3.351913067776505, "learning_rate": 3.836930455635491e-07, "logits/chosen": -1.6056486368179321, "logits/rejected": -1.5875630378723145, "logps/chosen": -295.599853515625, "logps/rejected": -259.6369934082031, "loss": 0.6774, "rewards/accuracies": 0.625, "rewards/chosen": -0.0446627140045166, "rewards/margins": 0.02809850312769413, "rewards/rejected": -0.07276121526956558, "step": 320 }, { "epoch": 0.07917466410748561, "grad_norm": 3.4017416105630573, "learning_rate": 3.9568345323741003e-07, "logits/chosen": -1.5247066020965576, "logits/rejected": -1.5482518672943115, "logps/chosen": -268.7400207519531, "logps/rejected": -309.99786376953125, "loss": 0.6729, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0597003698348999, "rewards/margins": 0.05082261562347412, "rewards/rejected": -0.11052300035953522, "step": 330 }, { "epoch": 0.08157389635316699, "grad_norm": 3.3527499070225244, "learning_rate": 4.07673860911271e-07, "logits/chosen": -1.6196010112762451, "logits/rejected": -1.595496416091919, "logps/chosen": -257.72515869140625, "logps/rejected": -282.81781005859375, "loss": 0.6716, "rewards/accuracies": 0.75, "rewards/chosen": -0.05155477672815323, "rewards/margins": 0.06573888659477234, "rewards/rejected": -0.11729365587234497, "step": 340 }, { "epoch": 0.08397312859884837, "grad_norm": 3.8502979336068717, "learning_rate": 4.1966426858513185e-07, "logits/chosen": -1.6366294622421265, "logits/rejected": -1.6474313735961914, "logps/chosen": -299.79010009765625, "logps/rejected": -299.626220703125, "loss": 0.6741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09185166656970978, "rewards/margins": 0.03506358712911606, "rewards/rejected": -0.12691523134708405, "step": 350 }, { "epoch": 0.08637236084452975, "grad_norm": 4.087127125149584, "learning_rate": 4.3165467625899276e-07, "logits/chosen": -1.6223100423812866, "logits/rejected": -1.5971088409423828, "logps/chosen": -292.7254333496094, "logps/rejected": -240.09213256835938, "loss": 0.6693, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10348290205001831, "rewards/margins": 0.053151585161685944, "rewards/rejected": -0.15663447976112366, "step": 360 }, { "epoch": 0.08877159309021113, "grad_norm": 3.962698259273254, "learning_rate": 4.436450839328537e-07, "logits/chosen": -1.5964925289154053, "logits/rejected": -1.617010474205017, "logps/chosen": -265.0798034667969, "logps/rejected": -278.52117919921875, "loss": 0.6651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1335074007511139, "rewards/margins": 0.0590825080871582, "rewards/rejected": -0.1925898939371109, "step": 370 }, { "epoch": 0.09117082533589252, "grad_norm": 3.5033230698831668, "learning_rate": 4.556354916067146e-07, "logits/chosen": -1.5588051080703735, "logits/rejected": -1.5864968299865723, "logps/chosen": -267.39471435546875, "logps/rejected": -266.3304443359375, "loss": 0.6546, "rewards/accuracies": 0.75, "rewards/chosen": -0.09075454622507095, "rewards/margins": 0.07683941721916199, "rewards/rejected": -0.16759395599365234, "step": 380 }, { "epoch": 0.0935700575815739, "grad_norm": 3.9053311567068993, "learning_rate": 4.676258992805755e-07, "logits/chosen": -1.5949430465698242, "logits/rejected": -1.5674622058868408, "logps/chosen": -301.2566833496094, "logps/rejected": -272.92877197265625, "loss": 0.6529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16490274667739868, "rewards/margins": 0.055955369025468826, "rewards/rejected": -0.2208581268787384, "step": 390 }, { "epoch": 0.09596928982725528, "grad_norm": 3.9140466546998827, "learning_rate": 4.796163069544364e-07, "logits/chosen": -1.6508136987686157, "logits/rejected": -1.6695753335952759, "logps/chosen": -300.67315673828125, "logps/rejected": -291.95501708984375, "loss": 0.6473, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.21914562582969666, "rewards/margins": 0.147565096616745, "rewards/rejected": -0.36671072244644165, "step": 400 }, { "epoch": 0.09836852207293666, "grad_norm": 4.42678932238528, "learning_rate": 4.916067146282974e-07, "logits/chosen": -1.5839338302612305, "logits/rejected": -1.5947954654693604, "logps/chosen": -296.02362060546875, "logps/rejected": -332.89208984375, "loss": 0.6292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3386813700199127, "rewards/margins": 0.144867405295372, "rewards/rejected": -0.4835488200187683, "step": 410 }, { "epoch": 0.10076775431861804, "grad_norm": 5.2789138321700495, "learning_rate": 4.999992108529978e-07, "logits/chosen": -1.5680617094039917, "logits/rejected": -1.5661697387695312, "logps/chosen": -414.2063903808594, "logps/rejected": -393.57598876953125, "loss": 0.6447, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5759106874465942, "rewards/margins": 0.245382621884346, "rewards/rejected": -0.8212932348251343, "step": 420 }, { "epoch": 0.10316698656429943, "grad_norm": 4.983795366931148, "learning_rate": 4.999851817115532e-07, "logits/chosen": -1.634783387184143, "logits/rejected": -1.6127817630767822, "logps/chosen": -324.0008850097656, "logps/rejected": -326.49810791015625, "loss": 0.6439, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.47741881012916565, "rewards/margins": 0.19091394543647766, "rewards/rejected": -0.6683326959609985, "step": 430 }, { "epoch": 0.10556621880998081, "grad_norm": 3.631330415762284, "learning_rate": 4.999536171027889e-07, "logits/chosen": -1.6043802499771118, "logits/rejected": -1.597813367843628, "logps/chosen": -332.45391845703125, "logps/rejected": -329.4422912597656, "loss": 0.6432, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5105425119400024, "rewards/margins": 0.08223161846399307, "rewards/rejected": -0.5927742123603821, "step": 440 }, { "epoch": 0.10796545105566219, "grad_norm": 4.090138150318208, "learning_rate": 4.999045192408369e-07, "logits/chosen": -1.506775140762329, "logits/rejected": -1.4702590703964233, "logps/chosen": -296.05792236328125, "logps/rejected": -289.3249816894531, "loss": 0.6316, "rewards/accuracies": 0.625, "rewards/chosen": -0.3827100694179535, "rewards/margins": 0.15496531128883362, "rewards/rejected": -0.5376753807067871, "step": 450 }, { "epoch": 0.11036468330134357, "grad_norm": 4.139361867296913, "learning_rate": 4.998378915697171e-07, "logits/chosen": -1.569830298423767, "logits/rejected": -1.5698078870773315, "logps/chosen": -325.01458740234375, "logps/rejected": -337.7842712402344, "loss": 0.6148, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2572017014026642, "rewards/margins": 0.24531395733356476, "rewards/rejected": -0.5025156140327454, "step": 460 }, { "epoch": 0.11276391554702495, "grad_norm": 4.5407974858594455, "learning_rate": 4.997537387630958e-07, "logits/chosen": -1.53157639503479, "logits/rejected": -1.498214840888977, "logps/chosen": -273.22802734375, "logps/rejected": -292.3331298828125, "loss": 0.6057, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3938341736793518, "rewards/margins": 0.17262205481529236, "rewards/rejected": -0.5664561986923218, "step": 470 }, { "epoch": 0.11516314779270634, "grad_norm": 4.783089704109638, "learning_rate": 4.996520667239582e-07, "logits/chosen": -1.5579659938812256, "logits/rejected": -1.5760862827301025, "logps/chosen": -306.2027282714844, "logps/rejected": -380.0113525390625, "loss": 0.6096, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.42028918862342834, "rewards/margins": 0.2578314542770386, "rewards/rejected": -0.6781206727027893, "step": 480 }, { "epoch": 0.11756238003838772, "grad_norm": 4.8302096742139105, "learning_rate": 4.995328825841939e-07, "logits/chosen": -1.5267994403839111, "logits/rejected": -1.5366883277893066, "logps/chosen": -285.08990478515625, "logps/rejected": -315.9127197265625, "loss": 0.5931, "rewards/accuracies": 0.75, "rewards/chosen": -0.34838151931762695, "rewards/margins": 0.4001844525337219, "rewards/rejected": -0.7485659122467041, "step": 490 }, { "epoch": 0.1199616122840691, "grad_norm": 4.655267788061174, "learning_rate": 4.993961947040967e-07, "logits/chosen": -1.5378767251968384, "logits/rejected": -1.5352329015731812, "logps/chosen": -361.957275390625, "logps/rejected": -334.80194091796875, "loss": 0.609, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6517360806465149, "rewards/margins": 0.10645874589681625, "rewards/rejected": -0.7581947445869446, "step": 500 }, { "epoch": 0.12236084452975048, "grad_norm": 4.03890292179432, "learning_rate": 4.992420126717784e-07, "logits/chosen": -1.5456221103668213, "logits/rejected": -1.5595461130142212, "logps/chosen": -313.9156188964844, "logps/rejected": -357.70074462890625, "loss": 0.607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3863525092601776, "rewards/margins": 0.5194737911224365, "rewards/rejected": -0.9058262705802917, "step": 510 }, { "epoch": 0.12476007677543186, "grad_norm": 4.515523592913458, "learning_rate": 4.990703473024958e-07, "logits/chosen": -1.5077584981918335, "logits/rejected": -1.508772611618042, "logps/chosen": -363.2955322265625, "logps/rejected": -380.5810852050781, "loss": 0.6185, "rewards/accuracies": 0.75, "rewards/chosen": -0.5841118097305298, "rewards/margins": 0.2735467553138733, "rewards/rejected": -0.8576586842536926, "step": 520 }, { "epoch": 0.12715930902111325, "grad_norm": 4.68328497340501, "learning_rate": 4.98881210637893e-07, "logits/chosen": -1.4954583644866943, "logits/rejected": -1.5232939720153809, "logps/chosen": -274.912353515625, "logps/rejected": -337.65228271484375, "loss": 0.6211, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.3999376893043518, "rewards/margins": 0.3088452219963074, "rewards/rejected": -0.7087828516960144, "step": 530 }, { "epoch": 0.1295585412667946, "grad_norm": 5.719715088463468, "learning_rate": 4.986746159451553e-07, "logits/chosen": -1.5417931079864502, "logits/rejected": -1.5638864040374756, "logps/chosen": -317.2779846191406, "logps/rejected": -345.56207275390625, "loss": 0.6037, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.46498027443885803, "rewards/margins": 0.4305883049964905, "rewards/rejected": -0.8955684900283813, "step": 540 }, { "epoch": 0.131957773512476, "grad_norm": 3.924618462296512, "learning_rate": 4.984505777160795e-07, "logits/chosen": -1.4803025722503662, "logits/rejected": -1.5034992694854736, "logps/chosen": -383.60009765625, "logps/rejected": -410.09637451171875, "loss": 0.6196, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.622631847858429, "rewards/margins": 0.29748308658599854, "rewards/rejected": -0.9201149940490723, "step": 550 }, { "epoch": 0.1343570057581574, "grad_norm": 5.000411140737786, "learning_rate": 4.982091116660574e-07, "logits/chosen": -1.5640392303466797, "logits/rejected": -1.5526823997497559, "logps/chosen": -262.2401123046875, "logps/rejected": -251.67489624023438, "loss": 0.6329, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4863004684448242, "rewards/margins": 0.13453409075737, "rewards/rejected": -0.6208345890045166, "step": 560 }, { "epoch": 0.13675623800383876, "grad_norm": 5.426495125083168, "learning_rate": 4.979502347329732e-07, "logits/chosen": -1.545018196105957, "logits/rejected": -1.5341848134994507, "logps/chosen": -359.99212646484375, "logps/rejected": -430.8915100097656, "loss": 0.61, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6372441053390503, "rewards/margins": 0.4039608836174011, "rewards/rejected": -1.0412050485610962, "step": 570 }, { "epoch": 0.13915547024952016, "grad_norm": 4.977101807860706, "learning_rate": 4.976739650760151e-07, "logits/chosen": -1.6066020727157593, "logits/rejected": -1.5763802528381348, "logps/chosen": -331.3481750488281, "logps/rejected": -358.0259704589844, "loss": 0.5891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5530639886856079, "rewards/margins": 0.3901999890804291, "rewards/rejected": -0.9432638883590698, "step": 580 }, { "epoch": 0.14155470249520152, "grad_norm": 6.414020548691111, "learning_rate": 4.97380322074402e-07, "logits/chosen": -1.5697975158691406, "logits/rejected": -1.545709252357483, "logps/chosen": -320.78094482421875, "logps/rejected": -346.2607116699219, "loss": 0.6312, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8397436141967773, "rewards/margins": 0.30745047330856323, "rewards/rejected": -1.1471941471099854, "step": 590 }, { "epoch": 0.14395393474088292, "grad_norm": 5.342440443936391, "learning_rate": 4.970693263260237e-07, "logits/chosen": -1.4645593166351318, "logits/rejected": -1.4802438020706177, "logps/chosen": -373.068359375, "logps/rejected": -366.2574157714844, "loss": 0.6067, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.551115870475769, "rewards/margins": 0.47067826986312866, "rewards/rejected": -1.021794080734253, "step": 600 }, { "epoch": 0.1463531669865643, "grad_norm": 5.144967895606553, "learning_rate": 4.967409996459966e-07, "logits/chosen": -1.6260372400283813, "logits/rejected": -1.6305965185165405, "logps/chosen": -339.40875244140625, "logps/rejected": -351.1748046875, "loss": 0.593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5743804574012756, "rewards/margins": 0.37975504994392395, "rewards/rejected": -0.954135537147522, "step": 610 }, { "epoch": 0.14875239923224567, "grad_norm": 5.888969372392584, "learning_rate": 4.963953650651326e-07, "logits/chosen": -1.5199607610702515, "logits/rejected": -1.4783246517181396, "logps/chosen": -454.8645935058594, "logps/rejected": -392.8035888671875, "loss": 0.5956, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8411262631416321, "rewards/margins": 0.4142216742038727, "rewards/rejected": -1.2553479671478271, "step": 620 }, { "epoch": 0.15115163147792707, "grad_norm": 5.241290267394913, "learning_rate": 4.960324468283248e-07, "logits/chosen": -1.5408384799957275, "logits/rejected": -1.5691133737564087, "logps/chosen": -291.14508056640625, "logps/rejected": -330.90277099609375, "loss": 0.5691, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5521666407585144, "rewards/margins": 0.4039185047149658, "rewards/rejected": -0.956085205078125, "step": 630 }, { "epoch": 0.15355086372360843, "grad_norm": 6.034628988888004, "learning_rate": 4.956522703928451e-07, "logits/chosen": -1.5112297534942627, "logits/rejected": -1.5096272230148315, "logps/chosen": -317.29400634765625, "logps/rejected": -367.8897705078125, "loss": 0.5808, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6753019690513611, "rewards/margins": 0.42325735092163086, "rewards/rejected": -1.0985593795776367, "step": 640 }, { "epoch": 0.15595009596928983, "grad_norm": 6.06042461193669, "learning_rate": 4.952548624265606e-07, "logits/chosen": -1.462241530418396, "logits/rejected": -1.4188714027404785, "logps/chosen": -373.4043884277344, "logps/rejected": -387.77923583984375, "loss": 0.6217, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8461149334907532, "rewards/margins": 0.27142664790153503, "rewards/rejected": -1.1175415515899658, "step": 650 }, { "epoch": 0.15834932821497122, "grad_norm": 4.638041821009946, "learning_rate": 4.948402508060607e-07, "logits/chosen": -1.532447338104248, "logits/rejected": -1.5247899293899536, "logps/chosen": -303.63494873046875, "logps/rejected": -343.4326171875, "loss": 0.6065, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5687133073806763, "rewards/margins": 0.5383566617965698, "rewards/rejected": -1.107069969177246, "step": 660 }, { "epoch": 0.16074856046065258, "grad_norm": 4.7563938394985525, "learning_rate": 4.944084646147038e-07, "logits/chosen": -1.5980417728424072, "logits/rejected": -1.5698829889297485, "logps/chosen": -385.9518737792969, "logps/rejected": -372.51214599609375, "loss": 0.6372, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5248526930809021, "rewards/margins": 0.12505970895290375, "rewards/rejected": -0.649912416934967, "step": 670 }, { "epoch": 0.16314779270633398, "grad_norm": 4.759002323226384, "learning_rate": 4.939595341405754e-07, "logits/chosen": -1.5244510173797607, "logits/rejected": -1.5224545001983643, "logps/chosen": -313.8047180175781, "logps/rejected": -315.26263427734375, "loss": 0.6141, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4212771952152252, "rewards/margins": 0.25005480647087097, "rewards/rejected": -0.671332061290741, "step": 680 }, { "epoch": 0.16554702495201534, "grad_norm": 5.223029601556046, "learning_rate": 4.93493490874365e-07, "logits/chosen": -1.479612946510315, "logits/rejected": -1.4728684425354004, "logps/chosen": -311.44775390625, "logps/rejected": -350.54779052734375, "loss": 0.5745, "rewards/accuracies": 0.75, "rewards/chosen": -0.564383864402771, "rewards/margins": 0.3139537572860718, "rewards/rejected": -0.8783376812934875, "step": 690 }, { "epoch": 0.16794625719769674, "grad_norm": 5.40070237050068, "learning_rate": 4.93010367507156e-07, "logits/chosen": -1.4617944955825806, "logits/rejected": -1.4599894285202026, "logps/chosen": -294.10797119140625, "logps/rejected": -335.54705810546875, "loss": 0.5637, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6340052485466003, "rewards/margins": 0.6616395711898804, "rewards/rejected": -1.2956448793411255, "step": 700 }, { "epoch": 0.17034548944337813, "grad_norm": 6.012195841421261, "learning_rate": 4.925101979281332e-07, "logits/chosen": -1.4645987749099731, "logits/rejected": -1.453375220298767, "logps/chosen": -395.05718994140625, "logps/rejected": -391.1450500488281, "loss": 0.6028, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7132852077484131, "rewards/margins": 0.5267443060874939, "rewards/rejected": -1.2400295734405518, "step": 710 }, { "epoch": 0.1727447216890595, "grad_norm": 5.2059132091980596, "learning_rate": 4.919930172222054e-07, "logits/chosen": -1.5603594779968262, "logits/rejected": -1.5616681575775146, "logps/chosen": -337.87713623046875, "logps/rejected": -388.90130615234375, "loss": 0.5578, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7377550005912781, "rewards/margins": 0.5399189591407776, "rewards/rejected": -1.2776739597320557, "step": 720 }, { "epoch": 0.1751439539347409, "grad_norm": 11.822921470671519, "learning_rate": 4.914588616675445e-07, "logits/chosen": -1.5943918228149414, "logits/rejected": -1.606041669845581, "logps/chosen": -346.92205810546875, "logps/rejected": -378.21240234375, "loss": 0.6279, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8934835195541382, "rewards/margins": 0.5335273146629333, "rewards/rejected": -1.4270107746124268, "step": 730 }, { "epoch": 0.17754318618042225, "grad_norm": 5.211631403180053, "learning_rate": 4.909077687330404e-07, "logits/chosen": -1.5509735345840454, "logits/rejected": -1.4975712299346924, "logps/chosen": -349.6174011230469, "logps/rejected": -349.61492919921875, "loss": 0.5735, "rewards/accuracies": 0.75, "rewards/chosen": -0.6521992087364197, "rewards/margins": 0.30101969838142395, "rewards/rejected": -0.9532188177108765, "step": 740 }, { "epoch": 0.17994241842610365, "grad_norm": 5.432660278163866, "learning_rate": 4.903397770756729e-07, "logits/chosen": -1.5139765739440918, "logits/rejected": -1.5181429386138916, "logps/chosen": -341.566162109375, "logps/rejected": -375.05828857421875, "loss": 0.5879, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6539328098297119, "rewards/margins": 0.38768962025642395, "rewards/rejected": -1.0416224002838135, "step": 750 }, { "epoch": 0.18234165067178504, "grad_norm": 5.539798409070412, "learning_rate": 4.897549265378004e-07, "logits/chosen": -1.5405504703521729, "logits/rejected": -1.4953763484954834, "logps/chosen": -427.6753845214844, "logps/rejected": -441.68365478515625, "loss": 0.5945, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8601313829421997, "rewards/margins": 0.2185048609972, "rewards/rejected": -1.0786362886428833, "step": 760 }, { "epoch": 0.1847408829174664, "grad_norm": 6.131059558759629, "learning_rate": 4.891532581443643e-07, "logits/chosen": -1.4995934963226318, "logits/rejected": -1.4943822622299194, "logps/chosen": -385.32073974609375, "logps/rejected": -454.878173828125, "loss": 0.5706, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6404799222946167, "rewards/margins": 0.7536662817001343, "rewards/rejected": -1.394146203994751, "step": 770 }, { "epoch": 0.1871401151631478, "grad_norm": 7.531135867694059, "learning_rate": 4.885348141000122e-07, "logits/chosen": -1.5458956956863403, "logits/rejected": -1.5822408199310303, "logps/chosen": -344.25244140625, "logps/rejected": -396.47247314453125, "loss": 0.5803, "rewards/accuracies": 0.75, "rewards/chosen": -0.8405560255050659, "rewards/margins": 0.45578351616859436, "rewards/rejected": -1.296339511871338, "step": 780 }, { "epoch": 0.18953934740882916, "grad_norm": 6.102690327063579, "learning_rate": 4.878996377861367e-07, "logits/chosen": -1.5642445087432861, "logits/rejected": -1.5695910453796387, "logps/chosen": -318.0394592285156, "logps/rejected": -365.24224853515625, "loss": 0.5556, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8526245951652527, "rewards/margins": 0.4370867609977722, "rewards/rejected": -1.2897112369537354, "step": 790 }, { "epoch": 0.19193857965451055, "grad_norm": 6.832280804361678, "learning_rate": 4.872477737578327e-07, "logits/chosen": -1.4795372486114502, "logits/rejected": -1.4752862453460693, "logps/chosen": -411.67425537109375, "logps/rejected": -499.11505126953125, "loss": 0.5499, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0738346576690674, "rewards/margins": 1.004138708114624, "rewards/rejected": -2.0779738426208496, "step": 800 }, { "epoch": 0.19433781190019195, "grad_norm": 8.284630220336107, "learning_rate": 4.865792677407718e-07, "logits/chosen": -1.593660593032837, "logits/rejected": -1.5664924383163452, "logps/chosen": -356.3196105957031, "logps/rejected": -367.1153564453125, "loss": 0.5915, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9886842966079712, "rewards/margins": 0.3350405991077423, "rewards/rejected": -1.3237249851226807, "step": 810 }, { "epoch": 0.1967370441458733, "grad_norm": 5.96781876097138, "learning_rate": 4.858941666279955e-07, "logits/chosen": -1.6477540731430054, "logits/rejected": -1.6294025182724, "logps/chosen": -371.42059326171875, "logps/rejected": -375.2044982910156, "loss": 0.5992, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.831330418586731, "rewards/margins": 0.20967264473438263, "rewards/rejected": -1.0410031080245972, "step": 820 }, { "epoch": 0.1991362763915547, "grad_norm": 5.266959588580237, "learning_rate": 4.851925184766247e-07, "logits/chosen": -1.5675857067108154, "logits/rejected": -1.504521131515503, "logps/chosen": -352.08807373046875, "logps/rejected": -370.45294189453125, "loss": 0.5815, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.783744215965271, "rewards/margins": 0.4445928931236267, "rewards/rejected": -1.2283371686935425, "step": 830 }, { "epoch": 0.20153550863723607, "grad_norm": 7.5633333882780365, "learning_rate": 4.844743725044897e-07, "logits/chosen": -1.6219160556793213, "logits/rejected": -1.5205755233764648, "logps/chosen": -343.59161376953125, "logps/rejected": -349.74249267578125, "loss": 0.5914, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7424141764640808, "rewards/margins": 0.3212757408618927, "rewards/rejected": -1.0636898279190063, "step": 840 }, { "epoch": 0.20393474088291746, "grad_norm": 5.7426817744180045, "learning_rate": 4.837397790866774e-07, "logits/chosen": -1.511488914489746, "logits/rejected": -1.5107842683792114, "logps/chosen": -362.03057861328125, "logps/rejected": -407.59161376953125, "loss": 0.584, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5177355408668518, "rewards/margins": 0.7411140203475952, "rewards/rejected": -1.2588495016098022, "step": 850 }, { "epoch": 0.20633397312859886, "grad_norm": 6.495429597968071, "learning_rate": 4.829887897519974e-07, "logits/chosen": -1.4964102506637573, "logits/rejected": -1.5037615299224854, "logps/chosen": -321.86236572265625, "logps/rejected": -395.5457763671875, "loss": 0.5886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7702068090438843, "rewards/margins": 0.5501433610916138, "rewards/rejected": -1.3203500509262085, "step": 860 }, { "epoch": 0.20873320537428022, "grad_norm": 5.5808817971322116, "learning_rate": 4.82221457179368e-07, "logits/chosen": -1.488948941230774, "logits/rejected": -1.5117526054382324, "logps/chosen": -384.2512512207031, "logps/rejected": -485.3260803222656, "loss": 0.5547, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9392452239990234, "rewards/margins": 1.2166972160339355, "rewards/rejected": -2.155942440032959, "step": 870 }, { "epoch": 0.21113243761996162, "grad_norm": 7.0272013657172625, "learning_rate": 4.814378351941206e-07, "logits/chosen": -1.520959734916687, "logits/rejected": -1.5359563827514648, "logps/chosen": -357.6517333984375, "logps/rejected": -377.56085205078125, "loss": 0.5854, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8206228017807007, "rewards/margins": 0.3496705889701843, "rewards/rejected": -1.1702934503555298, "step": 880 }, { "epoch": 0.21353166986564298, "grad_norm": 6.3594913416138885, "learning_rate": 4.806379787642241e-07, "logits/chosen": -1.466625452041626, "logits/rejected": -1.4915995597839355, "logps/chosen": -374.0470886230469, "logps/rejected": -476.73828125, "loss": 0.6061, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0663881301879883, "rewards/margins": 1.0952132940292358, "rewards/rejected": -2.1616015434265137, "step": 890 }, { "epoch": 0.21593090211132437, "grad_norm": 6.116338715106648, "learning_rate": 4.798219439964293e-07, "logits/chosen": -1.5042366981506348, "logits/rejected": -1.5534061193466187, "logps/chosen": -364.03472900390625, "logps/rejected": -392.703857421875, "loss": 0.5509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9239017367362976, "rewards/margins": 0.20848917961120605, "rewards/rejected": -1.1323908567428589, "step": 900 }, { "epoch": 0.21833013435700577, "grad_norm": 34.61975290546189, "learning_rate": 4.78989788132333e-07, "logits/chosen": -1.5566984415054321, "logits/rejected": -1.5171282291412354, "logps/chosen": -326.6298828125, "logps/rejected": -439.526123046875, "loss": 0.5439, "rewards/accuracies": 0.75, "rewards/chosen": -0.8856453895568848, "rewards/margins": 1.1186928749084473, "rewards/rejected": -2.004338264465332, "step": 910 }, { "epoch": 0.22072936660268713, "grad_norm": 5.399099459014794, "learning_rate": 4.781415695443631e-07, "logits/chosen": -1.4662867784500122, "logits/rejected": -1.3914134502410889, "logps/chosen": -505.794921875, "logps/rejected": -595.918701171875, "loss": 0.5759, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.901526689529419, "rewards/margins": 0.8914464116096497, "rewards/rejected": -2.792973041534424, "step": 920 }, { "epoch": 0.22312859884836853, "grad_norm": 5.819110199820003, "learning_rate": 4.772773477316836e-07, "logits/chosen": -1.547885537147522, "logits/rejected": -1.5537118911743164, "logps/chosen": -381.8018493652344, "logps/rejected": -429.70758056640625, "loss": 0.5617, "rewards/accuracies": 0.75, "rewards/chosen": -0.960909366607666, "rewards/margins": 0.4732838273048401, "rewards/rejected": -1.4341931343078613, "step": 930 }, { "epoch": 0.2255278310940499, "grad_norm": 7.075329262381567, "learning_rate": 4.7639718331602117e-07, "logits/chosen": -1.4385401010513306, "logits/rejected": -1.4047901630401611, "logps/chosen": -439.00054931640625, "logps/rejected": -563.8079833984375, "loss": 0.5454, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4129924774169922, "rewards/margins": 1.386842966079712, "rewards/rejected": -2.799834966659546, "step": 940 }, { "epoch": 0.22792706333973128, "grad_norm": 117.88806086127256, "learning_rate": 4.7550113803741275e-07, "logits/chosen": -1.4240310192108154, "logits/rejected": -1.3947417736053467, "logps/chosen": -479.65008544921875, "logps/rejected": -550.0196533203125, "loss": 0.6165, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7085025310516357, "rewards/margins": 1.4387762546539307, "rewards/rejected": -3.1472787857055664, "step": 950 }, { "epoch": 0.23032629558541268, "grad_norm": 7.753373606542773, "learning_rate": 4.7458927474987454e-07, "logits/chosen": -1.4834281206130981, "logits/rejected": -1.4571282863616943, "logps/chosen": -435.71551513671875, "logps/rejected": -423.3853454589844, "loss": 0.5621, "rewards/accuracies": 0.625, "rewards/chosen": -1.0432417392730713, "rewards/margins": 0.4899858832359314, "rewards/rejected": -1.533227562904358, "step": 960 }, { "epoch": 0.23272552783109404, "grad_norm": 35.84391506181939, "learning_rate": 4.7366165741699347e-07, "logits/chosen": -1.540824055671692, "logits/rejected": -1.5162460803985596, "logps/chosen": -473.71514892578125, "logps/rejected": -511.4437561035156, "loss": 0.5601, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2902333736419678, "rewards/margins": 0.7524299025535583, "rewards/rejected": -2.042663097381592, "step": 970 }, { "epoch": 0.23512476007677544, "grad_norm": 6.711770568150169, "learning_rate": 4.727183511074401e-07, "logits/chosen": -1.5611612796783447, "logits/rejected": -1.5459405183792114, "logps/chosen": -390.70880126953125, "logps/rejected": -402.97979736328125, "loss": 0.5557, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9223060607910156, "rewards/margins": 0.2812274992465973, "rewards/rejected": -1.20353364944458, "step": 980 }, { "epoch": 0.2375239923224568, "grad_norm": 7.356819549245005, "learning_rate": 4.717594219904043e-07, "logits/chosen": -1.5465433597564697, "logits/rejected": -1.4800186157226562, "logps/chosen": -394.327880859375, "logps/rejected": -407.1432189941406, "loss": 0.5799, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.086284875869751, "rewards/margins": 0.5457164645195007, "rewards/rejected": -1.632001519203186, "step": 990 }, { "epoch": 0.2399232245681382, "grad_norm": 6.180011768389308, "learning_rate": 4.7078493733095393e-07, "logits/chosen": -1.4510798454284668, "logits/rejected": -1.4394136667251587, "logps/chosen": -414.8174743652344, "logps/rejected": -487.3868713378906, "loss": 0.5292, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4004895687103271, "rewards/margins": 0.7286332845687866, "rewards/rejected": -2.1291232109069824, "step": 1000 }, { "epoch": 0.2423224568138196, "grad_norm": 8.973107903148879, "learning_rate": 4.6979496548531614e-07, "logits/chosen": -1.4409250020980835, "logits/rejected": -1.4732897281646729, "logps/chosen": -402.0752258300781, "logps/rejected": -518.8154907226562, "loss": 0.5541, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2081267833709717, "rewards/margins": 0.7229002714157104, "rewards/rejected": -1.9310270547866821, "step": 1010 }, { "epoch": 0.24472168905950095, "grad_norm": 6.834796795848274, "learning_rate": 4.6878957589608293e-07, "logits/chosen": -1.385780930519104, "logits/rejected": -1.3392270803451538, "logps/chosen": -520.9588012695312, "logps/rejected": -768.1580200195312, "loss": 0.5945, "rewards/accuracies": 0.75, "rewards/chosen": -2.352682590484619, "rewards/margins": 2.0763027667999268, "rewards/rejected": -4.428984642028809, "step": 1020 }, { "epoch": 0.24712092130518235, "grad_norm": 6.338350339612117, "learning_rate": 4.6776883908733956e-07, "logits/chosen": -1.4341570138931274, "logits/rejected": -1.3487894535064697, "logps/chosen": -448.89727783203125, "logps/rejected": -498.39654541015625, "loss": 0.5344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3556662797927856, "rewards/margins": 1.197887897491455, "rewards/rejected": -2.553554058074951, "step": 1030 }, { "epoch": 0.2495201535508637, "grad_norm": 9.762031523337113, "learning_rate": 4.667328266597178e-07, "logits/chosen": -1.5152060985565186, "logits/rejected": -1.4758002758026123, "logps/chosen": -388.54815673828125, "logps/rejected": -449.3814392089844, "loss": 0.5325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1459699869155884, "rewards/margins": 0.7172293663024902, "rewards/rejected": -1.863199234008789, "step": 1040 }, { "epoch": 0.2519193857965451, "grad_norm": 6.587552980986566, "learning_rate": 4.6568161128537354e-07, "logits/chosen": -1.5234332084655762, "logits/rejected": -1.397935152053833, "logps/chosen": -427.96697998046875, "logps/rejected": -441.1510314941406, "loss": 0.548, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.4824414253234863, "rewards/margins": 0.7462078332901001, "rewards/rejected": -2.228649139404297, "step": 1050 }, { "epoch": 0.2543186180422265, "grad_norm": 7.250799528397824, "learning_rate": 4.6461526670288877e-07, "logits/chosen": -1.513885259628296, "logits/rejected": -1.4797332286834717, "logps/chosen": -404.1684265136719, "logps/rejected": -435.52703857421875, "loss": 0.5982, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1321500539779663, "rewards/margins": 0.6284357905387878, "rewards/rejected": -1.7605857849121094, "step": 1060 }, { "epoch": 0.2567178502879079, "grad_norm": 6.968262543987022, "learning_rate": 4.635338677120994e-07, "logits/chosen": -1.4521477222442627, "logits/rejected": -1.448061466217041, "logps/chosen": -391.4552917480469, "logps/rejected": -503.6328125, "loss": 0.5347, "rewards/accuracies": 0.75, "rewards/chosen": -1.092247486114502, "rewards/margins": 0.9845672845840454, "rewards/rejected": -2.076814889907837, "step": 1070 }, { "epoch": 0.2591170825335892, "grad_norm": 6.982005120781178, "learning_rate": 4.6243749016884835e-07, "logits/chosen": -1.3785518407821655, "logits/rejected": -1.3828446865081787, "logps/chosen": -446.35888671875, "logps/rejected": -595.894775390625, "loss": 0.5645, "rewards/accuracies": 0.75, "rewards/chosen": -1.5676217079162598, "rewards/margins": 1.0973035097122192, "rewards/rejected": -2.6649250984191895, "step": 1080 }, { "epoch": 0.2615163147792706, "grad_norm": 10.40340346607264, "learning_rate": 4.613262109796645e-07, "logits/chosen": -1.5188038349151611, "logits/rejected": -1.5022757053375244, "logps/chosen": -400.90130615234375, "logps/rejected": -553.9241943359375, "loss": 0.5458, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2826495170593262, "rewards/margins": 1.2281033992767334, "rewards/rejected": -2.5107529163360596, "step": 1090 }, { "epoch": 0.263915547024952, "grad_norm": 6.972298855628279, "learning_rate": 4.602001080963678e-07, "logits/chosen": -1.5422290563583374, "logits/rejected": -1.467437505722046, "logps/chosen": -410.7484436035156, "logps/rejected": -512.1973876953125, "loss": 0.5535, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1985065937042236, "rewards/margins": 1.2838784456253052, "rewards/rejected": -2.4823849201202393, "step": 1100 }, { "epoch": 0.2663147792706334, "grad_norm": 8.549949049117258, "learning_rate": 4.590592605106017e-07, "logits/chosen": -1.5908862352371216, "logits/rejected": -1.5656800270080566, "logps/chosen": -441.91448974609375, "logps/rejected": -512.890869140625, "loss": 0.5669, "rewards/accuracies": 0.875, "rewards/chosen": -1.3182880878448486, "rewards/margins": 1.022831678390503, "rewards/rejected": -2.3411195278167725, "step": 1110 }, { "epoch": 0.2687140115163148, "grad_norm": 7.404227636225907, "learning_rate": 4.5790374824829165e-07, "logits/chosen": -1.3316807746887207, "logits/rejected": -1.3402780294418335, "logps/chosen": -334.657958984375, "logps/rejected": -431.4007873535156, "loss": 0.561, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3125879764556885, "rewards/margins": 0.8587444424629211, "rewards/rejected": -2.1713321208953857, "step": 1120 }, { "epoch": 0.27111324376199614, "grad_norm": 9.086124099951459, "learning_rate": 4.5673365236403216e-07, "logits/chosen": -1.373974323272705, "logits/rejected": -1.3391624689102173, "logps/chosen": -451.37176513671875, "logps/rejected": -618.1118774414062, "loss": 0.5369, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3380093574523926, "rewards/margins": 1.4104362726211548, "rewards/rejected": -3.748445987701416, "step": 1130 }, { "epoch": 0.27351247600767753, "grad_norm": 8.245489550138963, "learning_rate": 4.5554905493540075e-07, "logits/chosen": -1.3963029384613037, "logits/rejected": -1.3131446838378906, "logps/chosen": -397.52484130859375, "logps/rejected": -545.176025390625, "loss": 0.5366, "rewards/accuracies": 0.75, "rewards/chosen": -1.6280912160873413, "rewards/margins": 1.4453803300857544, "rewards/rejected": -3.0734715461730957, "step": 1140 }, { "epoch": 0.2759117082533589, "grad_norm": 7.262761099946982, "learning_rate": 4.5435003905720074e-07, "logits/chosen": -1.4503087997436523, "logits/rejected": -1.3999178409576416, "logps/chosen": -486.7837829589844, "logps/rejected": -546.538818359375, "loss": 0.5492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.865114450454712, "rewards/margins": 0.9383285641670227, "rewards/rejected": -2.803443193435669, "step": 1150 }, { "epoch": 0.2783109404990403, "grad_norm": 10.467347582561494, "learning_rate": 4.531366888356324e-07, "logits/chosen": -1.413732886314392, "logits/rejected": -1.3654654026031494, "logps/chosen": -368.93231201171875, "logps/rejected": -577.9190673828125, "loss": 0.527, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6492884159088135, "rewards/margins": 1.7214257717132568, "rewards/rejected": -3.370713710784912, "step": 1160 }, { "epoch": 0.2807101727447217, "grad_norm": 7.656835918634615, "learning_rate": 4.519090893823931e-07, "logits/chosen": -1.4396826028823853, "logits/rejected": -1.4221911430358887, "logps/chosen": -438.3218688964844, "logps/rejected": -529.28759765625, "loss": 0.5347, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.745650291442871, "rewards/margins": 1.0542502403259277, "rewards/rejected": -2.799900531768799, "step": 1170 }, { "epoch": 0.28310940499040305, "grad_norm": 10.339237287269262, "learning_rate": 4.5066732680870734e-07, "logits/chosen": -1.3462697267532349, "logits/rejected": -1.3036195039749146, "logps/chosen": -428.9185485839844, "logps/rejected": -522.67236328125, "loss": 0.5053, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5868905782699585, "rewards/margins": 1.3584994077682495, "rewards/rejected": -2.945390224456787, "step": 1180 }, { "epoch": 0.28550863723608444, "grad_norm": 10.424964890520823, "learning_rate": 4.494114882192862e-07, "logits/chosen": -1.4587008953094482, "logits/rejected": -1.3789886236190796, "logps/chosen": -418.345458984375, "logps/rejected": -561.0228271484375, "loss": 0.5412, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4498519897460938, "rewards/margins": 1.7318111658096313, "rewards/rejected": -3.1816630363464355, "step": 1190 }, { "epoch": 0.28790786948176583, "grad_norm": 9.033374903725427, "learning_rate": 4.4814166170621735e-07, "logits/chosen": -1.4350335597991943, "logits/rejected": -1.4181472063064575, "logps/chosen": -448.14141845703125, "logps/rejected": -524.2359619140625, "loss": 0.5488, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7904564142227173, "rewards/margins": 1.0193628072738647, "rewards/rejected": -2.809818983078003, "step": 1200 }, { "epoch": 0.2903071017274472, "grad_norm": 12.051469646958232, "learning_rate": 4.468579363427858e-07, "logits/chosen": -1.5547723770141602, "logits/rejected": -1.5221188068389893, "logps/chosen": -415.83349609375, "logps/rejected": -476.7373962402344, "loss": 0.5325, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3881736993789673, "rewards/margins": 0.9286211133003235, "rewards/rejected": -2.3167946338653564, "step": 1210 }, { "epoch": 0.2927063339731286, "grad_norm": 8.844148732841079, "learning_rate": 4.4556040217722555e-07, "logits/chosen": -1.574505090713501, "logits/rejected": -1.5706756114959717, "logps/chosen": -370.81610107421875, "logps/rejected": -481.24993896484375, "loss": 0.5232, "rewards/accuracies": 0.75, "rewards/chosen": -1.1100866794586182, "rewards/margins": 0.9274276494979858, "rewards/rejected": -2.0375144481658936, "step": 1220 }, { "epoch": 0.29510556621880996, "grad_norm": 9.594644147905797, "learning_rate": 4.442491502264033e-07, "logits/chosen": -1.4629461765289307, "logits/rejected": -1.4520565271377563, "logps/chosen": -367.5827331542969, "logps/rejected": -431.8439025878906, "loss": 0.5347, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2434320449829102, "rewards/margins": 0.7836133241653442, "rewards/rejected": -2.0270450115203857, "step": 1230 }, { "epoch": 0.29750479846449135, "grad_norm": 6.780108062586533, "learning_rate": 4.429242724694338e-07, "logits/chosen": -1.5323419570922852, "logits/rejected": -1.5191363096237183, "logps/chosen": -375.8507995605469, "logps/rejected": -497.87921142578125, "loss": 0.5549, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.157468557357788, "rewards/margins": 1.0888302326202393, "rewards/rejected": -2.2462985515594482, "step": 1240 }, { "epoch": 0.29990403071017274, "grad_norm": 7.668018750014516, "learning_rate": 4.4158586184122817e-07, "logits/chosen": -1.468441128730774, "logits/rejected": -1.3979756832122803, "logps/chosen": -467.69976806640625, "logps/rejected": -525.319091796875, "loss": 0.5323, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5909717082977295, "rewards/margins": 0.947952926158905, "rewards/rejected": -2.5389246940612793, "step": 1250 }, { "epoch": 0.30230326295585414, "grad_norm": 8.293797053598022, "learning_rate": 4.4023401222597443e-07, "logits/chosen": -1.4755961894989014, "logits/rejected": -1.4515053033828735, "logps/chosen": -433.07147216796875, "logps/rejected": -502.43963623046875, "loss": 0.5421, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.423656702041626, "rewards/margins": 0.8972675204277039, "rewards/rejected": -2.3209242820739746, "step": 1260 }, { "epoch": 0.30470249520153553, "grad_norm": 11.368693082147297, "learning_rate": 4.3886881845055235e-07, "logits/chosen": -1.4548556804656982, "logits/rejected": -1.4107099771499634, "logps/chosen": -393.0867614746094, "logps/rejected": -568.4356689453125, "loss": 0.5143, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3003679513931274, "rewards/margins": 1.8714444637298584, "rewards/rejected": -3.1718125343322754, "step": 1270 }, { "epoch": 0.30710172744721687, "grad_norm": 8.372672642590855, "learning_rate": 4.374903762778814e-07, "logits/chosen": -1.449552297592163, "logits/rejected": -1.375659704208374, "logps/chosen": -478.12615966796875, "logps/rejected": -589.36376953125, "loss": 0.5178, "rewards/accuracies": 0.75, "rewards/chosen": -2.0235326290130615, "rewards/margins": 1.4111472368240356, "rewards/rejected": -3.4346795082092285, "step": 1280 }, { "epoch": 0.30950095969289826, "grad_norm": 10.643231523467081, "learning_rate": 4.3609878240020356e-07, "logits/chosen": -1.5441666841506958, "logits/rejected": -1.4388468265533447, "logps/chosen": -532.8125, "logps/rejected": -604.3128662109375, "loss": 0.5416, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.138451337814331, "rewards/margins": 1.39248788356781, "rewards/rejected": -3.5309395790100098, "step": 1290 }, { "epoch": 0.31190019193857965, "grad_norm": 8.214989448750508, "learning_rate": 4.346941344323005e-07, "logits/chosen": -1.5427402257919312, "logits/rejected": -1.4345190525054932, "logps/chosen": -433.6434631347656, "logps/rejected": -459.6392517089844, "loss": 0.5528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6685816049575806, "rewards/margins": 0.8387784957885742, "rewards/rejected": -2.5073604583740234, "step": 1300 }, { "epoch": 0.31429942418426104, "grad_norm": 7.657845271019461, "learning_rate": 4.332765309046467e-07, "logits/chosen": -1.3856598138809204, "logits/rejected": -1.3218709230422974, "logps/chosen": -462.48175048828125, "logps/rejected": -564.1286010742188, "loss": 0.5528, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8233705759048462, "rewards/margins": 1.4314063787460327, "rewards/rejected": -3.2547767162323, "step": 1310 }, { "epoch": 0.31669865642994244, "grad_norm": 12.669727176995098, "learning_rate": 4.3184607125649754e-07, "logits/chosen": -1.5354691743850708, "logits/rejected": -1.452030062675476, "logps/chosen": -461.422607421875, "logps/rejected": -695.4813842773438, "loss": 0.5574, "rewards/accuracies": 0.75, "rewards/chosen": -1.5948652029037476, "rewards/margins": 2.2517476081848145, "rewards/rejected": -3.8466124534606934, "step": 1320 }, { "epoch": 0.3190978886756238, "grad_norm": 7.428351039897863, "learning_rate": 4.304028558289141e-07, "logits/chosen": -1.5226895809173584, "logits/rejected": -1.4397214651107788, "logps/chosen": -448.7567443847656, "logps/rejected": -654.158447265625, "loss": 0.5026, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4266167879104614, "rewards/margins": 2.2771382331848145, "rewards/rejected": -3.7037551403045654, "step": 1330 }, { "epoch": 0.32149712092130517, "grad_norm": 8.35394394188628, "learning_rate": 4.28946985857725e-07, "logits/chosen": -1.5027117729187012, "logits/rejected": -1.4603252410888672, "logps/chosen": -464.45770263671875, "logps/rejected": -600.5919189453125, "loss": 0.517, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7185165882110596, "rewards/margins": 1.4774185419082642, "rewards/rejected": -3.195935010910034, "step": 1340 }, { "epoch": 0.32389635316698656, "grad_norm": 8.547450000172507, "learning_rate": 4.2747856346642445e-07, "logits/chosen": -1.6239745616912842, "logits/rejected": -1.5768134593963623, "logps/chosen": -385.5645751953125, "logps/rejected": -461.92413330078125, "loss": 0.498, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4032585620880127, "rewards/margins": 0.9687395095825195, "rewards/rejected": -2.3719983100891113, "step": 1350 }, { "epoch": 0.32629558541266795, "grad_norm": 10.46357929901763, "learning_rate": 4.2599769165900933e-07, "logits/chosen": -1.5189791917800903, "logits/rejected": -1.4443309307098389, "logps/chosen": -452.8273010253906, "logps/rejected": -588.3678588867188, "loss": 0.5361, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9200900793075562, "rewards/margins": 1.5941896438598633, "rewards/rejected": -3.51427960395813, "step": 1360 }, { "epoch": 0.32869481765834935, "grad_norm": 7.921189286394596, "learning_rate": 4.245044743127535e-07, "logits/chosen": -1.5018115043640137, "logits/rejected": -1.5035383701324463, "logps/chosen": -435.9285583496094, "logps/rejected": -535.82275390625, "loss": 0.5308, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5959597826004028, "rewards/margins": 1.0554875135421753, "rewards/rejected": -2.651447057723999, "step": 1370 }, { "epoch": 0.3310940499040307, "grad_norm": 9.780614080293216, "learning_rate": 4.229990161709214e-07, "logits/chosen": -1.4760569334030151, "logits/rejected": -1.4197697639465332, "logps/chosen": -389.590087890625, "logps/rejected": -585.2882690429688, "loss": 0.5347, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.401724100112915, "rewards/margins": 1.7929754257202148, "rewards/rejected": -3.1946990489959717, "step": 1380 }, { "epoch": 0.3334932821497121, "grad_norm": 8.139127977792159, "learning_rate": 4.214814228354204e-07, "logits/chosen": -1.5709788799285889, "logits/rejected": -1.4670157432556152, "logps/chosen": -426.033203125, "logps/rejected": -606.0979614257812, "loss": 0.5143, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.373948335647583, "rewards/margins": 1.9848015308380127, "rewards/rejected": -3.358750104904175, "step": 1390 }, { "epoch": 0.33589251439539347, "grad_norm": 8.460928737411663, "learning_rate": 4.1995180075939375e-07, "logits/chosen": -1.550615668296814, "logits/rejected": -1.5216505527496338, "logps/chosen": -427.553955078125, "logps/rejected": -545.8343505859375, "loss": 0.4946, "rewards/accuracies": 0.75, "rewards/chosen": -1.3485326766967773, "rewards/margins": 1.4248372316360474, "rewards/rejected": -2.773369789123535, "step": 1400 }, { "epoch": 0.33829174664107486, "grad_norm": 8.81270929184022, "learning_rate": 4.1841025723975297e-07, "logits/chosen": -1.5789110660552979, "logits/rejected": -1.5154634714126587, "logps/chosen": -434.64849853515625, "logps/rejected": -600.957763671875, "loss": 0.5008, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3257498741149902, "rewards/margins": 1.8993675708770752, "rewards/rejected": -3.2251172065734863, "step": 1410 }, { "epoch": 0.34069097888675626, "grad_norm": 10.770138588745654, "learning_rate": 4.168569004096516e-07, "logits/chosen": -1.4905095100402832, "logits/rejected": -1.4610494375228882, "logps/chosen": -411.388427734375, "logps/rejected": -600.2252807617188, "loss": 0.5136, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6003555059432983, "rewards/margins": 1.7819187641143799, "rewards/rejected": -3.3822741508483887, "step": 1420 }, { "epoch": 0.3430902111324376, "grad_norm": 7.293720682350634, "learning_rate": 4.152918392308997e-07, "logits/chosen": -1.4742928743362427, "logits/rejected": -1.4396989345550537, "logps/chosen": -440.1182556152344, "logps/rejected": -565.8653564453125, "loss": 0.5199, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.718984603881836, "rewards/margins": 1.4051283597946167, "rewards/rejected": -3.124112606048584, "step": 1430 }, { "epoch": 0.345489443378119, "grad_norm": 12.227998192379331, "learning_rate": 4.137151834863213e-07, "logits/chosen": -1.5375521183013916, "logits/rejected": -1.4570263624191284, "logps/chosen": -437.3187561035156, "logps/rejected": -679.8615112304688, "loss": 0.5442, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.817325234413147, "rewards/margins": 2.153637409210205, "rewards/rejected": -3.9709632396698, "step": 1440 }, { "epoch": 0.3478886756238004, "grad_norm": 8.40519372330152, "learning_rate": 4.121270437720526e-07, "logits/chosen": -1.6291801929473877, "logits/rejected": -1.6403745412826538, "logps/chosen": -378.547119140625, "logps/rejected": -465.722412109375, "loss": 0.5377, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5566104650497437, "rewards/margins": 0.5313079953193665, "rewards/rejected": -2.087918519973755, "step": 1450 }, { "epoch": 0.3502879078694818, "grad_norm": 7.576024138002559, "learning_rate": 4.105275314897852e-07, "logits/chosen": -1.4507944583892822, "logits/rejected": -1.3628849983215332, "logps/chosen": -424.55401611328125, "logps/rejected": -754.7704467773438, "loss": 0.5183, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7864032983779907, "rewards/margins": 3.053657293319702, "rewards/rejected": -4.840060710906982, "step": 1460 }, { "epoch": 0.35268714011516317, "grad_norm": 9.120638907797822, "learning_rate": 4.089167588389508e-07, "logits/chosen": -1.6330862045288086, "logits/rejected": -1.5513877868652344, "logps/chosen": -524.4133911132812, "logps/rejected": -615.8099975585938, "loss": 0.5284, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.665891408920288, "rewards/margins": 1.4321410655975342, "rewards/rejected": -3.0980327129364014, "step": 1470 }, { "epoch": 0.3550863723608445, "grad_norm": 8.831999384046886, "learning_rate": 4.072948388088515e-07, "logits/chosen": -1.5293561220169067, "logits/rejected": -1.4742053747177124, "logps/chosen": -455.7305603027344, "logps/rejected": -572.6296997070312, "loss": 0.5715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.669014573097229, "rewards/margins": 1.1838295459747314, "rewards/rejected": -2.852843999862671, "step": 1480 }, { "epoch": 0.3574856046065259, "grad_norm": 8.125729954588683, "learning_rate": 4.056618851707334e-07, "logits/chosen": -1.5556094646453857, "logits/rejected": -1.544398307800293, "logps/chosen": -411.40631103515625, "logps/rejected": -525.8662719726562, "loss": 0.4887, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.1296329498291016, "rewards/margins": 1.2118812799453735, "rewards/rejected": -2.3415141105651855, "step": 1490 }, { "epoch": 0.3598848368522073, "grad_norm": 7.32928329168995, "learning_rate": 4.0401801246980675e-07, "logits/chosen": -1.5943670272827148, "logits/rejected": -1.550929307937622, "logps/chosen": -381.600830078125, "logps/rejected": -475.3746643066406, "loss": 0.5565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4997680187225342, "rewards/margins": 1.1621274948120117, "rewards/rejected": -2.661895513534546, "step": 1500 }, { "epoch": 0.3622840690978887, "grad_norm": 8.147323008767865, "learning_rate": 4.0236333601721043e-07, "logits/chosen": -1.5589003562927246, "logits/rejected": -1.6057322025299072, "logps/chosen": -448.15496826171875, "logps/rejected": -532.5565185546875, "loss": 0.5517, "rewards/accuracies": 0.875, "rewards/chosen": -1.489591121673584, "rewards/margins": 0.731250524520874, "rewards/rejected": -2.220841646194458, "step": 1510 }, { "epoch": 0.3646833013435701, "grad_norm": 7.889436233563884, "learning_rate": 4.0069797188192364e-07, "logits/chosen": -1.482542872428894, "logits/rejected": -1.3949836492538452, "logps/chosen": -495.69140625, "logps/rejected": -660.7445678710938, "loss": 0.5172, "rewards/accuracies": 0.75, "rewards/chosen": -1.8629268407821655, "rewards/margins": 2.0722007751464844, "rewards/rejected": -3.9351277351379395, "step": 1520 }, { "epoch": 0.3670825335892514, "grad_norm": 10.641539880656538, "learning_rate": 3.9902203688262417e-07, "logits/chosen": -1.554213285446167, "logits/rejected": -1.5207319259643555, "logps/chosen": -438.8204650878906, "logps/rejected": -519.6439208984375, "loss": 0.5133, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5820989608764648, "rewards/margins": 0.9530105590820312, "rewards/rejected": -2.535109281539917, "step": 1530 }, { "epoch": 0.3694817658349328, "grad_norm": 8.725526436669323, "learning_rate": 3.9733564857949365e-07, "logits/chosen": -1.5051288604736328, "logits/rejected": -1.4673130512237549, "logps/chosen": -474.5401306152344, "logps/rejected": -572.7054443359375, "loss": 0.4857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5164769887924194, "rewards/margins": 1.4391937255859375, "rewards/rejected": -2.9556708335876465, "step": 1540 }, { "epoch": 0.3718809980806142, "grad_norm": 19.17908292592815, "learning_rate": 3.9563892526597177e-07, "logits/chosen": -1.5526840686798096, "logits/rejected": -1.5369585752487183, "logps/chosen": -372.50921630859375, "logps/rejected": -465.1087951660156, "loss": 0.5281, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3534399271011353, "rewards/margins": 0.4993162155151367, "rewards/rejected": -1.852756142616272, "step": 1550 }, { "epoch": 0.3742802303262956, "grad_norm": 7.678163461265528, "learning_rate": 3.9393198596045795e-07, "logits/chosen": -1.5254640579223633, "logits/rejected": -1.5481865406036377, "logps/chosen": -403.49542236328125, "logps/rejected": -521.7468872070312, "loss": 0.5212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5229865312576294, "rewards/margins": 1.0962460041046143, "rewards/rejected": -2.619232416152954, "step": 1560 }, { "epoch": 0.376679462571977, "grad_norm": 6.910401313727213, "learning_rate": 3.922149503979628e-07, "logits/chosen": -1.4570752382278442, "logits/rejected": -1.3976576328277588, "logps/chosen": -587.8316040039062, "logps/rejected": -922.5247802734375, "loss": 0.4951, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9414281845092773, "rewards/margins": 3.3815834522247314, "rewards/rejected": -6.323011875152588, "step": 1570 }, { "epoch": 0.3790786948176583, "grad_norm": 10.822868640544474, "learning_rate": 3.904879390217095e-07, "logits/chosen": -1.605613112449646, "logits/rejected": -1.5582932233810425, "logps/chosen": -432.2767028808594, "logps/rejected": -528.9737548828125, "loss": 0.5277, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6335880756378174, "rewards/margins": 1.2116552591323853, "rewards/rejected": -2.845243215560913, "step": 1580 }, { "epoch": 0.3814779270633397, "grad_norm": 9.531595564715918, "learning_rate": 3.8875107297468463e-07, "logits/chosen": -1.5569547414779663, "logits/rejected": -1.5293956995010376, "logps/chosen": -408.19317626953125, "logps/rejected": -679.7545166015625, "loss": 0.5166, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.506165862083435, "rewards/margins": 2.2621097564697266, "rewards/rejected": -3.768275499343872, "step": 1590 }, { "epoch": 0.3838771593090211, "grad_norm": 9.016181079029822, "learning_rate": 3.87004474091141e-07, "logits/chosen": -1.5378257036209106, "logits/rejected": -1.5579806566238403, "logps/chosen": -399.8911437988281, "logps/rejected": -520.0226440429688, "loss": 0.5094, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.619594931602478, "rewards/margins": 1.0523192882537842, "rewards/rejected": -2.6719141006469727, "step": 1600 }, { "epoch": 0.3862763915547025, "grad_norm": 9.569743956165809, "learning_rate": 3.8524826488805114e-07, "logits/chosen": -1.5677305459976196, "logits/rejected": -1.5042650699615479, "logps/chosen": -488.653564453125, "logps/rejected": -553.4172973632812, "loss": 0.57, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8398549556732178, "rewards/margins": 1.1457983255386353, "rewards/rejected": -2.9856534004211426, "step": 1610 }, { "epoch": 0.3886756238003839, "grad_norm": 7.953020161460846, "learning_rate": 3.834825685565133e-07, "logits/chosen": -1.5813748836517334, "logits/rejected": -1.5757431983947754, "logps/chosen": -361.91192626953125, "logps/rejected": -377.3770446777344, "loss": 0.4998, "rewards/accuracies": 0.75, "rewards/chosen": -1.096151351928711, "rewards/margins": 0.5215452313423157, "rewards/rejected": -1.6176965236663818, "step": 1620 }, { "epoch": 0.39107485604606523, "grad_norm": 12.112559070938909, "learning_rate": 3.8170750895311007e-07, "logits/chosen": -1.661161184310913, "logits/rejected": -1.604376196861267, "logps/chosen": -426.1370544433594, "logps/rejected": -517.6278076171875, "loss": 0.4749, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2490683794021606, "rewards/margins": 1.1713922023773193, "rewards/rejected": -2.4204607009887695, "step": 1630 }, { "epoch": 0.3934740882917466, "grad_norm": 10.613017352814472, "learning_rate": 3.7992321059122045e-07, "logits/chosen": -1.477107286453247, "logits/rejected": -1.435372233390808, "logps/chosen": -473.62750244140625, "logps/rejected": -581.3477172851562, "loss": 0.5086, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0801384449005127, "rewards/margins": 1.3069989681243896, "rewards/rejected": -3.3871371746063232, "step": 1640 }, { "epoch": 0.395873320537428, "grad_norm": 8.03177266316891, "learning_rate": 3.7812979863228576e-07, "logits/chosen": -1.4891306161880493, "logits/rejected": -1.524371862411499, "logps/chosen": -451.3934631347656, "logps/rejected": -560.3739013671875, "loss": 0.4753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.225372314453125, "rewards/margins": 0.9865644574165344, "rewards/rejected": -3.2119364738464355, "step": 1650 }, { "epoch": 0.3982725527831094, "grad_norm": 14.176933508927005, "learning_rate": 3.763273988770296e-07, "logits/chosen": -1.4202697277069092, "logits/rejected": -1.3632880449295044, "logps/chosen": -510.245361328125, "logps/rejected": -665.1488037109375, "loss": 0.4931, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.517984628677368, "rewards/margins": 1.563971757888794, "rewards/rejected": -4.081956386566162, "step": 1660 }, { "epoch": 0.4006717850287908, "grad_norm": 10.323789347042394, "learning_rate": 3.7451613775663405e-07, "logits/chosen": -1.5507957935333252, "logits/rejected": -1.4645434617996216, "logps/chosen": -424.00042724609375, "logps/rejected": -677.9902954101562, "loss": 0.5348, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6596157550811768, "rewards/margins": 2.5375232696533203, "rewards/rejected": -4.197138786315918, "step": 1670 }, { "epoch": 0.40307101727447214, "grad_norm": 13.602368967903537, "learning_rate": 3.726961423238706e-07, "logits/chosen": -1.5984694957733154, "logits/rejected": -1.5915277004241943, "logps/chosen": -382.999755859375, "logps/rejected": -552.7100830078125, "loss": 0.5241, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.413910150527954, "rewards/margins": 1.5153096914291382, "rewards/rejected": -2.9292197227478027, "step": 1680 }, { "epoch": 0.40547024952015354, "grad_norm": 7.502258208680551, "learning_rate": 3.708675402441882e-07, "logits/chosen": -1.6568260192871094, "logits/rejected": -1.5876600742340088, "logps/chosen": -440.16357421875, "logps/rejected": -474.075439453125, "loss": 0.5576, "rewards/accuracies": 0.625, "rewards/chosen": -1.4147294759750366, "rewards/margins": 0.780114471912384, "rewards/rejected": -2.1948440074920654, "step": 1690 }, { "epoch": 0.40786948176583493, "grad_norm": 8.66494523712904, "learning_rate": 3.6903045978675775e-07, "logits/chosen": -1.562082052230835, "logits/rejected": -1.4948246479034424, "logps/chosen": -393.52508544921875, "logps/rejected": -548.5087280273438, "loss": 0.5081, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3889000415802002, "rewards/margins": 1.800432562828064, "rewards/rejected": -3.1893324851989746, "step": 1700 }, { "epoch": 0.4102687140115163, "grad_norm": 8.12061625541492, "learning_rate": 3.6718502981547474e-07, "logits/chosen": -1.6216672658920288, "logits/rejected": -1.6438817977905273, "logps/chosen": -408.8639221191406, "logps/rejected": -543.5513916015625, "loss": 0.5151, "rewards/accuracies": 0.75, "rewards/chosen": -1.3934214115142822, "rewards/margins": 0.9635807275772095, "rewards/rejected": -2.357002019882202, "step": 1710 }, { "epoch": 0.4126679462571977, "grad_norm": 7.83467861319349, "learning_rate": 3.6533137977991986e-07, "logits/chosen": -1.6941850185394287, "logits/rejected": -1.6553080081939697, "logps/chosen": -461.6195373535156, "logps/rejected": -582.6620483398438, "loss": 0.5341, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6303510665893555, "rewards/margins": 0.9588174819946289, "rewards/rejected": -2.5891685485839844, "step": 1720 }, { "epoch": 0.41506717850287905, "grad_norm": 7.244089308316078, "learning_rate": 3.6346963970627865e-07, "logits/chosen": -1.5541958808898926, "logits/rejected": -1.5294215679168701, "logps/chosen": -429.1705017089844, "logps/rejected": -555.658447265625, "loss": 0.5078, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.70223867893219, "rewards/margins": 1.1579748392105103, "rewards/rejected": -2.8602135181427, "step": 1730 }, { "epoch": 0.41746641074856045, "grad_norm": 22.58474812752272, "learning_rate": 3.615999401882207e-07, "logits/chosen": -1.4961183071136475, "logits/rejected": -1.4009946584701538, "logps/chosen": -451.9063415527344, "logps/rejected": -685.244140625, "loss": 0.5014, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.072859764099121, "rewards/margins": 2.2618536949157715, "rewards/rejected": -4.334712982177734, "step": 1740 }, { "epoch": 0.41986564299424184, "grad_norm": 8.32162712301176, "learning_rate": 3.597224123777389e-07, "logits/chosen": -1.5286868810653687, "logits/rejected": -1.387584924697876, "logps/chosen": -498.414306640625, "logps/rejected": -741.4631958007812, "loss": 0.5284, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.277275800704956, "rewards/margins": 2.3793492317199707, "rewards/rejected": -4.656625747680664, "step": 1750 }, { "epoch": 0.42226487523992323, "grad_norm": 8.974293009829895, "learning_rate": 3.5783718797595e-07, "logits/chosen": -1.6080338954925537, "logits/rejected": -1.4449987411499023, "logps/chosen": -483.98486328125, "logps/rejected": -583.08056640625, "loss": 0.4899, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6723836660385132, "rewards/margins": 1.4705579280853271, "rewards/rejected": -3.1429412364959717, "step": 1760 }, { "epoch": 0.4246641074856046, "grad_norm": 9.046062404092567, "learning_rate": 3.559443992238558e-07, "logits/chosen": -1.6072918176651, "logits/rejected": -1.5334550142288208, "logps/chosen": -404.91387939453125, "logps/rejected": -659.3243408203125, "loss": 0.5256, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4262123107910156, "rewards/margins": 2.3509132862091064, "rewards/rejected": -3.777125835418701, "step": 1770 }, { "epoch": 0.42706333973128596, "grad_norm": 7.551979886612425, "learning_rate": 3.540441788930673e-07, "logits/chosen": -1.5570323467254639, "logits/rejected": -1.5010316371917725, "logps/chosen": -499.6627502441406, "logps/rejected": -655.69775390625, "loss": 0.4806, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.816043496131897, "rewards/margins": 2.008720636367798, "rewards/rejected": -3.8247642517089844, "step": 1780 }, { "epoch": 0.42946257197696736, "grad_norm": 8.307725059400688, "learning_rate": 3.5213666027649123e-07, "logits/chosen": -1.5870680809020996, "logits/rejected": -1.5261785984039307, "logps/chosen": -484.54364013671875, "logps/rejected": -549.2987060546875, "loss": 0.5261, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8930203914642334, "rewards/margins": 1.094689965248108, "rewards/rejected": -2.987710475921631, "step": 1790 }, { "epoch": 0.43186180422264875, "grad_norm": 8.582762739992743, "learning_rate": 3.5022197717898017e-07, "logits/chosen": -1.580644130706787, "logits/rejected": -1.3994085788726807, "logps/chosen": -401.8323059082031, "logps/rejected": -603.8817138671875, "loss": 0.4704, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.661513328552246, "rewards/margins": 2.428684949874878, "rewards/rejected": -4.090198516845703, "step": 1800 }, { "epoch": 0.43426103646833014, "grad_norm": 10.93719877269778, "learning_rate": 3.4830026390794633e-07, "logits/chosen": -1.607084035873413, "logits/rejected": -1.5272046327590942, "logps/chosen": -510.1648864746094, "logps/rejected": -667.7628173828125, "loss": 0.4909, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0123143196105957, "rewards/margins": 2.094109535217285, "rewards/rejected": -4.106423377990723, "step": 1810 }, { "epoch": 0.43666026871401153, "grad_norm": 8.551853777479481, "learning_rate": 3.4637165526394104e-07, "logits/chosen": -1.6525242328643799, "logits/rejected": -1.6364984512329102, "logps/chosen": -405.4945983886719, "logps/rejected": -535.1358032226562, "loss": 0.4973, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5821750164031982, "rewards/margins": 1.2321652173995972, "rewards/rejected": -2.814340114593506, "step": 1820 }, { "epoch": 0.43905950095969287, "grad_norm": 7.385716657081044, "learning_rate": 3.4443628653119814e-07, "logits/chosen": -1.517671823501587, "logits/rejected": -1.4017354249954224, "logps/chosen": -517.3212890625, "logps/rejected": -859.2703247070312, "loss": 0.5639, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.307370662689209, "rewards/margins": 3.0285544395446777, "rewards/rejected": -5.335925102233887, "step": 1830 }, { "epoch": 0.44145873320537427, "grad_norm": 9.147598141998557, "learning_rate": 3.424942934681453e-07, "logits/chosen": -1.603632926940918, "logits/rejected": -1.50962233543396, "logps/chosen": -405.5295715332031, "logps/rejected": -603.0396728515625, "loss": 0.4915, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5191326141357422, "rewards/margins": 2.0283052921295166, "rewards/rejected": -3.5474376678466797, "step": 1840 }, { "epoch": 0.44385796545105566, "grad_norm": 10.988746391986139, "learning_rate": 3.405458122978804e-07, "logits/chosen": -1.5768574476242065, "logits/rejected": -1.5998786687850952, "logps/chosen": -428.5262756347656, "logps/rejected": -487.67987060546875, "loss": 0.4836, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3609421253204346, "rewards/margins": 0.9485089182853699, "rewards/rejected": -2.309451103210449, "step": 1850 }, { "epoch": 0.44625719769673705, "grad_norm": 15.944164556320738, "learning_rate": 3.3859097969861633e-07, "logits/chosen": -1.625605583190918, "logits/rejected": -1.5874695777893066, "logps/chosen": -494.60919189453125, "logps/rejected": -607.7193603515625, "loss": 0.5338, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8808681964874268, "rewards/margins": 1.5208300352096558, "rewards/rejected": -3.401698350906372, "step": 1860 }, { "epoch": 0.44865642994241844, "grad_norm": 8.268100089151192, "learning_rate": 3.366299327940936e-07, "logits/chosen": -1.6645658016204834, "logits/rejected": -1.648751974105835, "logps/chosen": -499.26336669921875, "logps/rejected": -698.3704833984375, "loss": 0.4915, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9141197204589844, "rewards/margins": 1.823472261428833, "rewards/rejected": -3.7375919818878174, "step": 1870 }, { "epoch": 0.4510556621880998, "grad_norm": 8.804078070830371, "learning_rate": 3.3466280914396117e-07, "logits/chosen": -1.5872663259506226, "logits/rejected": -1.5196198225021362, "logps/chosen": -456.85992431640625, "logps/rejected": -658.8201904296875, "loss": 0.4869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.035572052001953, "rewards/margins": 1.8471441268920898, "rewards/rejected": -3.882716417312622, "step": 1880 }, { "epoch": 0.4534548944337812, "grad_norm": 11.092976917744208, "learning_rate": 3.326897467341281e-07, "logits/chosen": -1.5509458780288696, "logits/rejected": -1.4662238359451294, "logps/chosen": -485.05230712890625, "logps/rejected": -712.2886962890625, "loss": 0.4878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.6332812309265137, "rewards/margins": 2.179011821746826, "rewards/rejected": -4.81229305267334, "step": 1890 }, { "epoch": 0.45585412667946257, "grad_norm": 12.839651291896095, "learning_rate": 3.3071088396708335e-07, "logits/chosen": -1.532083511352539, "logits/rejected": -1.428544282913208, "logps/chosen": -468.669921875, "logps/rejected": -751.4425048828125, "loss": 0.5366, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.547771692276001, "rewards/margins": 2.576319932937622, "rewards/rejected": -5.124091148376465, "step": 1900 }, { "epoch": 0.45825335892514396, "grad_norm": 10.082888820884584, "learning_rate": 3.2872635965218824e-07, "logits/chosen": -1.407777190208435, "logits/rejected": -1.3572627305984497, "logps/chosen": -552.7352294921875, "logps/rejected": -779.3690185546875, "loss": 0.5304, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7334177494049072, "rewards/margins": 2.2001953125, "rewards/rejected": -4.93361234664917, "step": 1910 }, { "epoch": 0.46065259117082535, "grad_norm": 8.22503563553893, "learning_rate": 3.2673631299593905e-07, "logits/chosen": -1.5405181646347046, "logits/rejected": -1.3945229053497314, "logps/chosen": -509.95916748046875, "logps/rejected": -725.9969482421875, "loss": 0.4924, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.202409267425537, "rewards/margins": 2.3135194778442383, "rewards/rejected": -4.515929222106934, "step": 1920 }, { "epoch": 0.4630518234165067, "grad_norm": 11.990528680107209, "learning_rate": 3.247408835922024e-07, "logits/chosen": -1.4919646978378296, "logits/rejected": -1.400061845779419, "logps/chosen": -584.2794189453125, "logps/rejected": -807.6492919921875, "loss": 0.5074, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.5771946907043457, "rewards/margins": 2.1979637145996094, "rewards/rejected": -4.775158882141113, "step": 1930 }, { "epoch": 0.4654510556621881, "grad_norm": 11.277957093878648, "learning_rate": 3.2274021141242306e-07, "logits/chosen": -1.4834940433502197, "logits/rejected": -1.4324702024459839, "logps/chosen": -476.18499755859375, "logps/rejected": -635.02783203125, "loss": 0.501, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0429248809814453, "rewards/margins": 1.4896948337554932, "rewards/rejected": -3.5326199531555176, "step": 1940 }, { "epoch": 0.4678502879078695, "grad_norm": 15.301957388985223, "learning_rate": 3.2073443679580613e-07, "logits/chosen": -1.6484432220458984, "logits/rejected": -1.6361116170883179, "logps/chosen": -451.64276123046875, "logps/rejected": -508.2003479003906, "loss": 0.5063, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6862659454345703, "rewards/margins": 0.613060712814331, "rewards/rejected": -2.2993264198303223, "step": 1950 }, { "epoch": 0.47024952015355087, "grad_norm": 9.017778030476915, "learning_rate": 3.1872370043947194e-07, "logits/chosen": -1.6414830684661865, "logits/rejected": -1.564360499382019, "logps/chosen": -421.6163024902344, "logps/rejected": -599.665283203125, "loss": 0.4565, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4122569561004639, "rewards/margins": 1.8497397899627686, "rewards/rejected": -3.2619965076446533, "step": 1960 }, { "epoch": 0.47264875239923226, "grad_norm": 13.966059992110349, "learning_rate": 3.167081433885874e-07, "logits/chosen": -1.4791144132614136, "logits/rejected": -1.4647550582885742, "logps/chosen": -573.87353515625, "logps/rejected": -760.268798828125, "loss": 0.4637, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4180257320404053, "rewards/margins": 1.517680287361145, "rewards/rejected": -3.935706377029419, "step": 1970 }, { "epoch": 0.4750479846449136, "grad_norm": 14.922819489110463, "learning_rate": 3.14687907026472e-07, "logits/chosen": -1.5097310543060303, "logits/rejected": -1.4952431917190552, "logps/chosen": -426.15045166015625, "logps/rejected": -578.3543090820312, "loss": 0.4975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8940229415893555, "rewards/margins": 1.3460915088653564, "rewards/rejected": -3.240114212036133, "step": 1980 }, { "epoch": 0.477447216890595, "grad_norm": 8.988836306290604, "learning_rate": 3.126631330646801e-07, "logits/chosen": -1.6298195123672485, "logits/rejected": -1.6404377222061157, "logps/chosen": -538.1989135742188, "logps/rejected": -648.9851684570312, "loss": 0.5374, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.212888240814209, "rewards/margins": 1.0854294300079346, "rewards/rejected": -3.2983174324035645, "step": 1990 }, { "epoch": 0.4798464491362764, "grad_norm": 8.750744135615523, "learning_rate": 3.1063396353306097e-07, "logits/chosen": -1.643333077430725, "logits/rejected": -1.610515832901001, "logps/chosen": -407.2403869628906, "logps/rejected": -473.98858642578125, "loss": 0.5034, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2611894607543945, "rewards/margins": 1.0861375331878662, "rewards/rejected": -2.3473267555236816, "step": 2000 }, { "epoch": 0.4798464491362764, "eval_logits/chosen": -1.5855597257614136, "eval_logits/rejected": -1.54659903049469, "eval_logps/chosen": -422.9024963378906, "eval_logps/rejected": -588.136474609375, "eval_loss": 0.4988311231136322, "eval_rewards/accuracies": 0.7982142567634583, "eval_rewards/chosen": -1.505989670753479, "eval_rewards/margins": 1.6387678384780884, "eval_rewards/rejected": -3.1447572708129883, "eval_runtime": 52.378, "eval_samples_per_second": 85.169, "eval_steps_per_second": 1.336, "step": 2000 }, { "epoch": 0.4822456813819578, "grad_norm": 13.733129427049343, "learning_rate": 3.0860054076979535e-07, "logits/chosen": -1.6123840808868408, "logits/rejected": -1.5536229610443115, "logps/chosen": -480.535888671875, "logps/rejected": -624.6920166015625, "loss": 0.4772, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8841956853866577, "rewards/margins": 1.7770442962646484, "rewards/rejected": -3.6612396240234375, "step": 2010 }, { "epoch": 0.4846449136276392, "grad_norm": 9.057968429745728, "learning_rate": 3.065630074114115e-07, "logits/chosen": -1.6468747854232788, "logits/rejected": -1.5576668977737427, "logps/chosen": -458.7217712402344, "logps/rejected": -609.1342163085938, "loss": 0.5354, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5887041091918945, "rewards/margins": 1.9728997945785522, "rewards/rejected": -3.5616040229797363, "step": 2020 }, { "epoch": 0.4870441458733205, "grad_norm": 9.014043112611292, "learning_rate": 3.0452150638277947e-07, "logits/chosen": -1.602269172668457, "logits/rejected": -1.5746511220932007, "logps/chosen": -426.79180908203125, "logps/rejected": -550.3716430664062, "loss": 0.4984, "rewards/accuracies": 0.75, "rewards/chosen": -1.901236891746521, "rewards/margins": 1.115099310874939, "rewards/rejected": -3.016335964202881, "step": 2030 }, { "epoch": 0.4894433781190019, "grad_norm": 10.239650017211535, "learning_rate": 3.024761808870856e-07, "logits/chosen": -1.537592887878418, "logits/rejected": -1.5135307312011719, "logps/chosen": -394.8808898925781, "logps/rejected": -632.0445556640625, "loss": 0.4599, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3750698566436768, "rewards/margins": 2.4639463424682617, "rewards/rejected": -3.8390164375305176, "step": 2040 }, { "epoch": 0.4918426103646833, "grad_norm": 15.750043165793306, "learning_rate": 3.004271743957875e-07, "logits/chosen": -1.6159226894378662, "logits/rejected": -1.655556321144104, "logps/chosen": -525.2373657226562, "logps/rejected": -622.0975341796875, "loss": 0.5105, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.482973098754883, "rewards/margins": 0.6949129700660706, "rewards/rejected": -3.1778860092163086, "step": 2050 }, { "epoch": 0.4942418426103647, "grad_norm": 8.499109074782012, "learning_rate": 2.983746306385499e-07, "logits/chosen": -1.5946094989776611, "logits/rejected": -1.534623622894287, "logps/chosen": -453.0174255371094, "logps/rejected": -686.3258056640625, "loss": 0.521, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9426515102386475, "rewards/margins": 2.2058510780334473, "rewards/rejected": -4.148502349853516, "step": 2060 }, { "epoch": 0.4966410748560461, "grad_norm": 10.800111792066579, "learning_rate": 2.963186935931628e-07, "logits/chosen": -1.6709175109863281, "logits/rejected": -1.6887776851654053, "logps/chosen": -438.7301330566406, "logps/rejected": -554.9039306640625, "loss": 0.4837, "rewards/accuracies": 0.875, "rewards/chosen": -1.6398264169692993, "rewards/margins": 1.2201743125915527, "rewards/rejected": -2.8600010871887207, "step": 2070 }, { "epoch": 0.4990403071017274, "grad_norm": 8.988674035333007, "learning_rate": 2.9425950747544176e-07, "logits/chosen": -1.5756020545959473, "logits/rejected": -1.5436617136001587, "logps/chosen": -561.7062377929688, "logps/rejected": -795.3497314453125, "loss": 0.4775, "rewards/accuracies": 0.75, "rewards/chosen": -2.4821224212646484, "rewards/margins": 2.639726161956787, "rewards/rejected": -5.121848106384277, "step": 2080 }, { "epoch": 0.5014395393474088, "grad_norm": 10.085891400781119, "learning_rate": 2.921972167291119e-07, "logits/chosen": -1.6471210718154907, "logits/rejected": -1.6008189916610718, "logps/chosen": -478.99835205078125, "logps/rejected": -686.2216186523438, "loss": 0.4919, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7925002574920654, "rewards/margins": 1.8971786499023438, "rewards/rejected": -3.689678192138672, "step": 2090 }, { "epoch": 0.5038387715930902, "grad_norm": 10.421882065221322, "learning_rate": 2.9013196601567567e-07, "logits/chosen": -1.6900784969329834, "logits/rejected": -1.6556028127670288, "logps/chosen": -423.0419921875, "logps/rejected": -528.6862182617188, "loss": 0.5545, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5560649633407593, "rewards/margins": 0.938676655292511, "rewards/rejected": -2.494741916656494, "step": 2100 }, { "epoch": 0.5062380038387716, "grad_norm": 8.821766288439926, "learning_rate": 2.8806390020426555e-07, "logits/chosen": -1.662415862083435, "logits/rejected": -1.6548817157745361, "logps/chosen": -453.2157287597656, "logps/rejected": -591.32470703125, "loss": 0.5049, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6261285543441772, "rewards/margins": 1.4008034467697144, "rewards/rejected": -3.0269322395324707, "step": 2110 }, { "epoch": 0.508637236084453, "grad_norm": 12.186841933449609, "learning_rate": 2.8599316436148187e-07, "logits/chosen": -1.5856435298919678, "logits/rejected": -1.5153669118881226, "logps/chosen": -424.302490234375, "logps/rejected": -558.4691162109375, "loss": 0.4916, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5989717245101929, "rewards/margins": 1.3942369222640991, "rewards/rejected": -2.993208646774292, "step": 2120 }, { "epoch": 0.5110364683301344, "grad_norm": 9.950542312444133, "learning_rate": 2.8391990374121723e-07, "logits/chosen": -1.58291494846344, "logits/rejected": -1.506280541419983, "logps/chosen": -473.59130859375, "logps/rejected": -741.8995361328125, "loss": 0.4944, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1417229175567627, "rewards/margins": 2.387235641479492, "rewards/rejected": -4.528958320617676, "step": 2130 }, { "epoch": 0.5134357005758158, "grad_norm": 9.848738060883088, "learning_rate": 2.818442637744669e-07, "logits/chosen": -1.5625739097595215, "logits/rejected": -1.5479421615600586, "logps/chosen": -466.6238708496094, "logps/rejected": -644.1817626953125, "loss": 0.4903, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0364062786102295, "rewards/margins": 1.7278406620025635, "rewards/rejected": -3.764247417449951, "step": 2140 }, { "epoch": 0.5158349328214972, "grad_norm": 15.060228171309086, "learning_rate": 2.797663900591284e-07, "logits/chosen": -1.6554969549179077, "logits/rejected": -1.590504765510559, "logps/chosen": -499.38653564453125, "logps/rejected": -656.7408447265625, "loss": 0.4641, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2184956073760986, "rewards/margins": 1.8517191410064697, "rewards/rejected": -4.07021427154541, "step": 2150 }, { "epoch": 0.5182341650671785, "grad_norm": 11.311863584668101, "learning_rate": 2.776864283497874e-07, "logits/chosen": -1.6083641052246094, "logits/rejected": -1.4705779552459717, "logps/chosen": -442.3028869628906, "logps/rejected": -744.4788818359375, "loss": 0.4928, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9584490060806274, "rewards/margins": 3.1203114986419678, "rewards/rejected": -5.078760623931885, "step": 2160 }, { "epoch": 0.5206333973128598, "grad_norm": 10.28461247325414, "learning_rate": 2.756045245474943e-07, "logits/chosen": -1.7539141178131104, "logits/rejected": -1.7315905094146729, "logps/chosen": -480.287353515625, "logps/rejected": -616.8678588867188, "loss": 0.5098, "rewards/accuracies": 0.75, "rewards/chosen": -2.0116958618164062, "rewards/margins": 1.2387454509735107, "rewards/rejected": -3.250441312789917, "step": 2170 }, { "epoch": 0.5230326295585412, "grad_norm": 9.31477755841552, "learning_rate": 2.7352082468952977e-07, "logits/chosen": -1.5500714778900146, "logits/rejected": -1.497689962387085, "logps/chosen": -472.05908203125, "logps/rejected": -784.3190307617188, "loss": 0.5352, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0980050563812256, "rewards/margins": 2.975919485092163, "rewards/rejected": -5.073924541473389, "step": 2180 }, { "epoch": 0.5254318618042226, "grad_norm": 9.270464819675132, "learning_rate": 2.7143547493916e-07, "logits/chosen": -1.610926866531372, "logits/rejected": -1.4779975414276123, "logps/chosen": -438.76495361328125, "logps/rejected": -772.690673828125, "loss": 0.4729, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.654784917831421, "rewards/margins": 3.3569869995117188, "rewards/rejected": -5.011772632598877, "step": 2190 }, { "epoch": 0.527831094049904, "grad_norm": 11.881300504373025, "learning_rate": 2.693486215753853e-07, "logits/chosen": -1.6387016773223877, "logits/rejected": -1.5379244089126587, "logps/chosen": -487.00250244140625, "logps/rejected": -804.3506469726562, "loss": 0.5173, "rewards/accuracies": 0.75, "rewards/chosen": -2.203137159347534, "rewards/margins": 3.4220173358917236, "rewards/rejected": -5.625154495239258, "step": 2200 }, { "epoch": 0.5302303262955854, "grad_norm": 11.209078276132704, "learning_rate": 2.6726041098267805e-07, "logits/chosen": -1.7330009937286377, "logits/rejected": -1.610637903213501, "logps/chosen": -492.36883544921875, "logps/rejected": -618.7637329101562, "loss": 0.5552, "rewards/accuracies": 0.625, "rewards/chosen": -1.8613355159759521, "rewards/margins": 1.652645468711853, "rewards/rejected": -3.5139803886413574, "step": 2210 }, { "epoch": 0.5326295585412668, "grad_norm": 18.552236297372197, "learning_rate": 2.6517098964071507e-07, "logits/chosen": -1.6331803798675537, "logits/rejected": -1.648450493812561, "logps/chosen": -422.33184814453125, "logps/rejected": -502.978271484375, "loss": 0.5321, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5346616506576538, "rewards/margins": 0.6287662386894226, "rewards/rejected": -2.1634278297424316, "step": 2220 }, { "epoch": 0.5350287907869482, "grad_norm": 21.4582048998974, "learning_rate": 2.630805041141023e-07, "logits/chosen": -1.5773918628692627, "logits/rejected": -1.5001894235610962, "logps/chosen": -385.9847412109375, "logps/rejected": -727.9683837890625, "loss": 0.489, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4480630159378052, "rewards/margins": 3.2581539154052734, "rewards/rejected": -4.706217288970947, "step": 2230 }, { "epoch": 0.5374280230326296, "grad_norm": 26.843224623928677, "learning_rate": 2.609891010420941e-07, "logits/chosen": -1.6596715450286865, "logits/rejected": -1.5660836696624756, "logps/chosen": -512.3265380859375, "logps/rejected": -711.9317016601562, "loss": 0.4656, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2569899559020996, "rewards/margins": 1.9923429489135742, "rewards/rejected": -4.249333381652832, "step": 2240 }, { "epoch": 0.539827255278311, "grad_norm": 10.63742659021482, "learning_rate": 2.5889692712830674e-07, "logits/chosen": -1.7071069478988647, "logits/rejected": -1.6586761474609375, "logps/chosen": -430.1871643066406, "logps/rejected": -572.591064453125, "loss": 0.4918, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8596538305282593, "rewards/margins": 1.5033780336380005, "rewards/rejected": -3.363032102584839, "step": 2250 }, { "epoch": 0.5422264875239923, "grad_norm": 14.567934039227358, "learning_rate": 2.5680412913042843e-07, "logits/chosen": -1.5422347784042358, "logits/rejected": -1.4246468544006348, "logps/chosen": -479.8147888183594, "logps/rejected": -750.625, "loss": 0.5143, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.149322986602783, "rewards/margins": 2.762144088745117, "rewards/rejected": -4.911467552185059, "step": 2260 }, { "epoch": 0.5446257197696737, "grad_norm": 17.356455187998073, "learning_rate": 2.5471085384992404e-07, "logits/chosen": -1.5986369848251343, "logits/rejected": -1.5308504104614258, "logps/chosen": -437.1161193847656, "logps/rejected": -754.64453125, "loss": 0.4815, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.717659592628479, "rewards/margins": 3.181501626968384, "rewards/rejected": -4.899160861968994, "step": 2270 }, { "epoch": 0.5470249520153551, "grad_norm": 9.28953605758801, "learning_rate": 2.526172481217381e-07, "logits/chosen": -1.5438369512557983, "logits/rejected": -1.5630390644073486, "logps/chosen": -416.0315856933594, "logps/rejected": -573.1238403320312, "loss": 0.5108, "rewards/accuracies": 0.75, "rewards/chosen": -1.984635591506958, "rewards/margins": 1.3827749490737915, "rewards/rejected": -3.367410182952881, "step": 2280 }, { "epoch": 0.5494241842610365, "grad_norm": 8.596639384497214, "learning_rate": 2.5052345880399456e-07, "logits/chosen": -1.6020433902740479, "logits/rejected": -1.558334469795227, "logps/chosen": -431.74029541015625, "logps/rejected": -551.8709106445312, "loss": 0.4808, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8813968896865845, "rewards/margins": 1.2201679944992065, "rewards/rejected": -3.10156512260437, "step": 2290 }, { "epoch": 0.5518234165067178, "grad_norm": 11.753794107565938, "learning_rate": 2.4842963276769555e-07, "logits/chosen": -1.5615367889404297, "logits/rejected": -1.5330339670181274, "logps/chosen": -437.43084716796875, "logps/rejected": -621.0929565429688, "loss": 0.5155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.09666109085083, "rewards/margins": 1.3752198219299316, "rewards/rejected": -3.4718806743621826, "step": 2300 }, { "epoch": 0.5542226487523992, "grad_norm": 10.071023503443175, "learning_rate": 2.463359168864189e-07, "logits/chosen": -1.6465524435043335, "logits/rejected": -1.646805763244629, "logps/chosen": -486.73565673828125, "logps/rejected": -566.8058471679688, "loss": 0.5368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7878824472427368, "rewards/margins": 1.157956838607788, "rewards/rejected": -2.9458394050598145, "step": 2310 }, { "epoch": 0.5566218809980806, "grad_norm": 11.942512663088241, "learning_rate": 2.4424245802601555e-07, "logits/chosen": -1.6439406871795654, "logits/rejected": -1.6925067901611328, "logps/chosen": -371.7730407714844, "logps/rejected": -524.0623168945312, "loss": 0.4857, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3829593658447266, "rewards/margins": 0.9715530276298523, "rewards/rejected": -2.3545122146606445, "step": 2320 }, { "epoch": 0.559021113243762, "grad_norm": 12.373131204862757, "learning_rate": 2.421494030343072e-07, "logits/chosen": -1.5384362936019897, "logits/rejected": -1.480912446975708, "logps/chosen": -452.34100341796875, "logps/rejected": -508.7037048339844, "loss": 0.5744, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7134408950805664, "rewards/margins": 1.0949541330337524, "rewards/rejected": -2.808394432067871, "step": 2330 }, { "epoch": 0.5614203454894434, "grad_norm": 12.389004819032923, "learning_rate": 2.400568987307861e-07, "logits/chosen": -1.5702178478240967, "logits/rejected": -1.5472772121429443, "logps/chosen": -392.81146240234375, "logps/rejected": -411.69317626953125, "loss": 0.4819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4531047344207764, "rewards/margins": 0.3802509605884552, "rewards/rejected": -1.8333555459976196, "step": 2340 }, { "epoch": 0.5638195777351248, "grad_norm": 10.648864731231448, "learning_rate": 2.379650918963156e-07, "logits/chosen": -1.6462970972061157, "logits/rejected": -1.6218713521957397, "logps/chosen": -369.7726745605469, "logps/rejected": -520.507080078125, "loss": 0.5095, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6180322170257568, "rewards/margins": 1.4091498851776123, "rewards/rejected": -3.027182102203369, "step": 2350 }, { "epoch": 0.5662188099808061, "grad_norm": 21.383140181787045, "learning_rate": 2.3587412926283438e-07, "logits/chosen": -1.6451104879379272, "logits/rejected": -1.5843571424484253, "logps/chosen": -529.2950439453125, "logps/rejected": -677.9429321289062, "loss": 0.5167, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.010671615600586, "rewards/margins": 1.9736932516098022, "rewards/rejected": -3.9843647480010986, "step": 2360 }, { "epoch": 0.5686180422264875, "grad_norm": 11.060278026039455, "learning_rate": 2.337841575030642e-07, "logits/chosen": -1.6365985870361328, "logits/rejected": -1.6005491018295288, "logps/chosen": -461.49749755859375, "logps/rejected": -656.2427978515625, "loss": 0.4837, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6665138006210327, "rewards/margins": 1.8031669855117798, "rewards/rejected": -3.4696803092956543, "step": 2370 }, { "epoch": 0.5710172744721689, "grad_norm": 12.434301470105313, "learning_rate": 2.316953232202206e-07, "logits/chosen": -1.5425972938537598, "logits/rejected": -1.5300174951553345, "logps/chosen": -429.36444091796875, "logps/rejected": -472.57568359375, "loss": 0.4711, "rewards/accuracies": 0.75, "rewards/chosen": -1.7053802013397217, "rewards/margins": 1.1300846338272095, "rewards/rejected": -2.8354649543762207, "step": 2380 }, { "epoch": 0.5734165067178503, "grad_norm": 11.159233566131416, "learning_rate": 2.2960777293772958e-07, "logits/chosen": -1.4974268674850464, "logits/rejected": -1.412022590637207, "logps/chosen": -410.587890625, "logps/rejected": -676.4757690429688, "loss": 0.4907, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8208236694335938, "rewards/margins": 2.8905091285705566, "rewards/rejected": -4.71133279800415, "step": 2390 }, { "epoch": 0.5758157389635317, "grad_norm": 8.941122859434415, "learning_rate": 2.2752165308894974e-07, "logits/chosen": -1.5879501104354858, "logits/rejected": -1.538260579109192, "logps/chosen": -387.39886474609375, "logps/rejected": -541.6805419921875, "loss": 0.4716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7686420679092407, "rewards/margins": 1.6267540454864502, "rewards/rejected": -3.3953964710235596, "step": 2400 }, { "epoch": 0.5782149712092131, "grad_norm": 12.044861445889701, "learning_rate": 2.254371100069005e-07, "logits/chosen": -1.5681109428405762, "logits/rejected": -1.5912139415740967, "logps/chosen": -399.2314453125, "logps/rejected": -558.3780517578125, "loss": 0.4731, "rewards/accuracies": 0.875, "rewards/chosen": -1.4787284135818481, "rewards/margins": 1.385317325592041, "rewards/rejected": -2.8640456199645996, "step": 2410 }, { "epoch": 0.5806142034548945, "grad_norm": 14.345741676788231, "learning_rate": 2.2335428991399725e-07, "logits/chosen": -1.4331409931182861, "logits/rejected": -1.3503481149673462, "logps/chosen": -473.1424255371094, "logps/rejected": -837.6438598632812, "loss": 0.4977, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.4872748851776123, "rewards/margins": 3.6537222862243652, "rewards/rejected": -6.140997409820557, "step": 2420 }, { "epoch": 0.5830134357005758, "grad_norm": 13.4432658994951, "learning_rate": 2.2127333891179458e-07, "logits/chosen": -1.574540138244629, "logits/rejected": -1.505368709564209, "logps/chosen": -423.1871032714844, "logps/rejected": -710.91845703125, "loss": 0.5206, "rewards/accuracies": 0.75, "rewards/chosen": -1.972616195678711, "rewards/margins": 2.7154417037963867, "rewards/rejected": -4.6880574226379395, "step": 2430 }, { "epoch": 0.5854126679462572, "grad_norm": 11.856242883233412, "learning_rate": 2.1919440297073782e-07, "logits/chosen": -1.5872819423675537, "logits/rejected": -1.4878933429718018, "logps/chosen": -429.8773498535156, "logps/rejected": -675.3303833007812, "loss": 0.5143, "rewards/accuracies": 0.75, "rewards/chosen": -1.9643665552139282, "rewards/margins": 2.4596877098083496, "rewards/rejected": -4.424054145812988, "step": 2440 }, { "epoch": 0.5878119001919386, "grad_norm": 12.96744179023995, "learning_rate": 2.1711762791992368e-07, "logits/chosen": -1.5757300853729248, "logits/rejected": -1.498997449874878, "logps/chosen": -521.7322998046875, "logps/rejected": -615.30712890625, "loss": 0.5374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.17085337638855, "rewards/margins": 1.284227967262268, "rewards/rejected": -3.4550812244415283, "step": 2450 }, { "epoch": 0.5902111324376199, "grad_norm": 10.542916107819408, "learning_rate": 2.1504315943687114e-07, "logits/chosen": -1.6688730716705322, "logits/rejected": -1.6244138479232788, "logps/chosen": -413.05352783203125, "logps/rejected": -669.3853149414062, "loss": 0.5043, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.667035698890686, "rewards/margins": 2.1492269039154053, "rewards/rejected": -3.8162624835968018, "step": 2460 }, { "epoch": 0.5926103646833013, "grad_norm": 13.515470462935513, "learning_rate": 2.1297114303730248e-07, "logits/chosen": -1.5288382768630981, "logits/rejected": -1.4609134197235107, "logps/chosen": -422.8729553222656, "logps/rejected": -724.0157470703125, "loss": 0.5295, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.968060851097107, "rewards/margins": 2.4587647914886475, "rewards/rejected": -4.426825523376465, "step": 2470 }, { "epoch": 0.5950095969289827, "grad_norm": 14.338489620286177, "learning_rate": 2.1090172406493616e-07, "logits/chosen": -1.5649030208587646, "logits/rejected": -1.5623984336853027, "logps/chosen": -399.79290771484375, "logps/rejected": -583.4332275390625, "loss": 0.4501, "rewards/accuracies": 0.875, "rewards/chosen": -1.5557973384857178, "rewards/margins": 1.7626543045043945, "rewards/rejected": -3.3184516429901123, "step": 2480 }, { "epoch": 0.5974088291746641, "grad_norm": 17.510238929607876, "learning_rate": 2.0883504768129146e-07, "logits/chosen": -1.6265462636947632, "logits/rejected": -1.598730444908142, "logps/chosen": -497.2835998535156, "logps/rejected": -648.0611572265625, "loss": 0.5081, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.131727695465088, "rewards/margins": 1.5764387845993042, "rewards/rejected": -3.7081668376922607, "step": 2490 }, { "epoch": 0.5998080614203455, "grad_norm": 11.33500534140664, "learning_rate": 2.0677125885550571e-07, "logits/chosen": -1.4689067602157593, "logits/rejected": -1.453774094581604, "logps/chosen": -439.34075927734375, "logps/rejected": -537.5209350585938, "loss": 0.4906, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8917917013168335, "rewards/margins": 1.3580577373504639, "rewards/rejected": -3.249849319458008, "step": 2500 }, { "epoch": 0.6022072936660269, "grad_norm": 13.216366762387246, "learning_rate": 2.0471050235416587e-07, "logits/chosen": -1.578136920928955, "logits/rejected": -1.5401719808578491, "logps/chosen": -500.56072998046875, "logps/rejected": -603.341796875, "loss": 0.4649, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2012569904327393, "rewards/margins": 1.514261245727539, "rewards/rejected": -3.71551775932312, "step": 2510 }, { "epoch": 0.6046065259117083, "grad_norm": 13.494163188740151, "learning_rate": 2.026529227311532e-07, "logits/chosen": -1.5589288473129272, "logits/rejected": -1.4520585536956787, "logps/chosen": -436.919189453125, "logps/rejected": -646.716796875, "loss": 0.5339, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0600430965423584, "rewards/margins": 2.0313327312469482, "rewards/rejected": -4.091375827789307, "step": 2520 }, { "epoch": 0.6070057581573897, "grad_norm": 13.27727238694429, "learning_rate": 2.005986643175036e-07, "logits/chosen": -1.5554237365722656, "logits/rejected": -1.5433388948440552, "logps/chosen": -461.245849609375, "logps/rejected": -686.7439575195312, "loss": 0.4396, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.842258095741272, "rewards/margins": 2.4114012718200684, "rewards/rejected": -4.253658771514893, "step": 2530 }, { "epoch": 0.6094049904030711, "grad_norm": 13.0139085843487, "learning_rate": 1.9854787121128328e-07, "logits/chosen": -1.51650071144104, "logits/rejected": -1.4218528270721436, "logps/chosen": -408.762939453125, "logps/rejected": -523.1329956054688, "loss": 0.5186, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.722747802734375, "rewards/margins": 1.6003834009170532, "rewards/rejected": -3.3231310844421387, "step": 2540 }, { "epoch": 0.6118042226487524, "grad_norm": 10.279174762722011, "learning_rate": 1.9650068726748106e-07, "logits/chosen": -1.4751393795013428, "logits/rejected": -1.4030897617340088, "logps/chosen": -480.7040100097656, "logps/rejected": -683.616943359375, "loss": 0.5435, "rewards/accuracies": 0.75, "rewards/chosen": -1.9335578680038452, "rewards/margins": 2.1118619441986084, "rewards/rejected": -4.045419692993164, "step": 2550 }, { "epoch": 0.6142034548944337, "grad_norm": 10.58914230166321, "learning_rate": 1.9445725608791718e-07, "logits/chosen": -1.4881677627563477, "logits/rejected": -1.339155673980713, "logps/chosen": -489.5857849121094, "logps/rejected": -993.9429931640625, "loss": 0.4928, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1533727645874023, "rewards/margins": 4.9657697677612305, "rewards/rejected": -7.119143009185791, "step": 2560 }, { "epoch": 0.6166026871401151, "grad_norm": 10.044783509073032, "learning_rate": 1.924177210111705e-07, "logits/chosen": -1.5943437814712524, "logits/rejected": -1.4688080549240112, "logps/chosen": -434.68804931640625, "logps/rejected": -752.3925170898438, "loss": 0.4961, "rewards/accuracies": 0.75, "rewards/chosen": -1.8741295337677002, "rewards/margins": 3.1029107570648193, "rewards/rejected": -4.977039813995361, "step": 2570 }, { "epoch": 0.6190019193857965, "grad_norm": 10.803603559596482, "learning_rate": 1.9038222510252364e-07, "logits/chosen": -1.63632071018219, "logits/rejected": -1.5772790908813477, "logps/chosen": -425.29705810546875, "logps/rejected": -547.4097900390625, "loss": 0.4988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6209042072296143, "rewards/margins": 1.3503049612045288, "rewards/rejected": -2.9712090492248535, "step": 2580 }, { "epoch": 0.6214011516314779, "grad_norm": 13.398493396695924, "learning_rate": 1.883509111439277e-07, "logits/chosen": -1.5346853733062744, "logits/rejected": -1.4362837076187134, "logps/chosen": -431.0760192871094, "logps/rejected": -812.218017578125, "loss": 0.5172, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9112809896469116, "rewards/margins": 3.1443681716918945, "rewards/rejected": -5.055649757385254, "step": 2590 }, { "epoch": 0.6238003838771593, "grad_norm": 8.974066504769196, "learning_rate": 1.8632392162398665e-07, "logits/chosen": -1.6221952438354492, "logits/rejected": -1.5492918491363525, "logps/chosen": -503.0997009277344, "logps/rejected": -713.653564453125, "loss": 0.4848, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.9071382284164429, "rewards/margins": 2.196667432785034, "rewards/rejected": -4.1038055419921875, "step": 2600 }, { "epoch": 0.6261996161228407, "grad_norm": 10.465480641447618, "learning_rate": 1.84301398727962e-07, "logits/chosen": -1.463478922843933, "logits/rejected": -1.373296856880188, "logps/chosen": -358.58990478515625, "logps/rejected": -702.1085205078125, "loss": 0.5041, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6315553188323975, "rewards/margins": 3.0978779792785645, "rewards/rejected": -4.729433536529541, "step": 2610 }, { "epoch": 0.6285988483685221, "grad_norm": 16.447249304360692, "learning_rate": 1.8228348432779966e-07, "logits/chosen": -1.601949691772461, "logits/rejected": -1.5340659618377686, "logps/chosen": -446.01934814453125, "logps/rejected": -602.1727294921875, "loss": 0.5133, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0065500736236572, "rewards/margins": 1.6265771389007568, "rewards/rejected": -3.633127212524414, "step": 2620 }, { "epoch": 0.6309980806142035, "grad_norm": 9.975684368561806, "learning_rate": 1.8027031997217773e-07, "logits/chosen": -1.496711015701294, "logits/rejected": -1.3451837301254272, "logps/chosen": -453.6783752441406, "logps/rejected": -909.4461059570312, "loss": 0.4633, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1523375511169434, "rewards/margins": 4.429135799407959, "rewards/rejected": -6.581473350524902, "step": 2630 }, { "epoch": 0.6333973128598849, "grad_norm": 9.551241278071267, "learning_rate": 1.7826204687657758e-07, "logits/chosen": -1.5323810577392578, "logits/rejected": -1.4816257953643799, "logps/chosen": -476.60723876953125, "logps/rejected": -552.1718139648438, "loss": 0.4779, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8675925731658936, "rewards/margins": 1.1629068851470947, "rewards/rejected": -3.0304996967315674, "step": 2640 }, { "epoch": 0.6357965451055663, "grad_norm": 11.460448418061953, "learning_rate": 1.762588059133781e-07, "logits/chosen": -1.5307328701019287, "logits/rejected": -1.4167407751083374, "logps/chosen": -492.13885498046875, "logps/rejected": -644.9505004882812, "loss": 0.4905, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7892935276031494, "rewards/margins": 1.8879598379135132, "rewards/rejected": -3.6772537231445312, "step": 2650 }, { "epoch": 0.6381957773512476, "grad_norm": 18.293188421454047, "learning_rate": 1.7426073760197406e-07, "logits/chosen": -1.6222972869873047, "logits/rejected": -1.5244884490966797, "logps/chosen": -476.624267578125, "logps/rejected": -844.6385498046875, "loss": 0.5186, "rewards/accuracies": 0.875, "rewards/chosen": -2.1062722206115723, "rewards/margins": 3.3689582347869873, "rewards/rejected": -5.4752302169799805, "step": 2660 }, { "epoch": 0.6405950095969289, "grad_norm": 9.733158102478546, "learning_rate": 1.7226798209891935e-07, "logits/chosen": -1.5991920232772827, "logits/rejected": -1.5383590459823608, "logps/chosen": -447.33721923828125, "logps/rejected": -570.9419555664062, "loss": 0.4602, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.72260320186615, "rewards/margins": 1.8290891647338867, "rewards/rejected": -3.551692247390747, "step": 2670 }, { "epoch": 0.6429942418426103, "grad_norm": 13.946180043271863, "learning_rate": 1.7028067918809535e-07, "logits/chosen": -1.5528197288513184, "logits/rejected": -1.3879868984222412, "logps/chosen": -410.92718505859375, "logps/rejected": -823.4285888671875, "loss": 0.4851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.739851713180542, "rewards/margins": 3.7594234943389893, "rewards/rejected": -5.499274730682373, "step": 2680 }, { "epoch": 0.6453934740882917, "grad_norm": 10.290377075371278, "learning_rate": 1.6829896827090584e-07, "logits/chosen": -1.6512863636016846, "logits/rejected": -1.6360222101211548, "logps/chosen": -465.19073486328125, "logps/rejected": -523.7138061523438, "loss": 0.554, "rewards/accuracies": 0.625, "rewards/chosen": -1.8953874111175537, "rewards/margins": 0.9017803072929382, "rewards/rejected": -2.797168254852295, "step": 2690 }, { "epoch": 0.6477927063339731, "grad_norm": 8.491674840989798, "learning_rate": 1.6632298835649844e-07, "logits/chosen": -1.6026594638824463, "logits/rejected": -1.5701754093170166, "logps/chosen": -467.7550354003906, "logps/rejected": -753.0574951171875, "loss": 0.4577, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8194019794464111, "rewards/margins": 2.601040840148926, "rewards/rejected": -4.420442581176758, "step": 2700 }, { "epoch": 0.6501919385796545, "grad_norm": 31.306228191814924, "learning_rate": 1.6435287805201364e-07, "logits/chosen": -1.5099729299545288, "logits/rejected": -1.4581714868545532, "logps/chosen": -468.90753173828125, "logps/rejected": -623.4779663085938, "loss": 0.515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9226620197296143, "rewards/margins": 1.5821655988693237, "rewards/rejected": -3.5048279762268066, "step": 2710 }, { "epoch": 0.6525911708253359, "grad_norm": 11.64208369065543, "learning_rate": 1.6238877555286207e-07, "logits/chosen": -1.6213748455047607, "logits/rejected": -1.5881431102752686, "logps/chosen": -452.04132080078125, "logps/rejected": -652.6304931640625, "loss": 0.463, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6790266036987305, "rewards/margins": 1.9346497058868408, "rewards/rejected": -3.6136765480041504, "step": 2720 }, { "epoch": 0.6549904030710173, "grad_norm": 13.434697867289191, "learning_rate": 1.60430818633031e-07, "logits/chosen": -1.717829704284668, "logits/rejected": -1.6711931228637695, "logps/chosen": -415.7367248535156, "logps/rejected": -573.2308959960938, "loss": 0.4395, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4689064025878906, "rewards/margins": 1.6664180755615234, "rewards/rejected": -3.135324239730835, "step": 2730 }, { "epoch": 0.6573896353166987, "grad_norm": 9.195466035459773, "learning_rate": 1.5847914463541939e-07, "logits/chosen": -1.4839345216751099, "logits/rejected": -1.4070708751678467, "logps/chosen": -408.98046875, "logps/rejected": -607.1571044921875, "loss": 0.4778, "rewards/accuracies": 0.75, "rewards/chosen": -1.960526704788208, "rewards/margins": 1.7776432037353516, "rewards/rejected": -3.7381699085235596, "step": 2740 }, { "epoch": 0.6597888675623801, "grad_norm": 8.168397030598346, "learning_rate": 1.5653389046220427e-07, "logits/chosen": -1.5618484020233154, "logits/rejected": -1.5674700736999512, "logps/chosen": -410.63739013671875, "logps/rejected": -545.4283447265625, "loss": 0.5017, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5986849069595337, "rewards/margins": 1.2204779386520386, "rewards/rejected": -2.8191628456115723, "step": 2750 }, { "epoch": 0.6621880998080614, "grad_norm": 15.795620020388752, "learning_rate": 1.545951925652375e-07, "logits/chosen": -1.574741244316101, "logits/rejected": -1.5150604248046875, "logps/chosen": -502.17840576171875, "logps/rejected": -597.1779174804688, "loss": 0.4903, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.739625334739685, "rewards/margins": 1.5817149877548218, "rewards/rejected": -3.3213400840759277, "step": 2760 }, { "epoch": 0.6645873320537428, "grad_norm": 12.564082860552478, "learning_rate": 1.5266318693647423e-07, "logits/chosen": -1.604776382446289, "logits/rejected": -1.5733304023742676, "logps/chosen": -460.91168212890625, "logps/rejected": -548.9085083007812, "loss": 0.4572, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.717616081237793, "rewards/margins": 0.9698230624198914, "rewards/rejected": -2.68743896484375, "step": 2770 }, { "epoch": 0.6669865642994242, "grad_norm": 12.201358595051783, "learning_rate": 1.5073800909843353e-07, "logits/chosen": -1.6035501956939697, "logits/rejected": -1.4797896146774292, "logps/chosen": -501.05548095703125, "logps/rejected": -667.8190307617188, "loss": 0.4558, "rewards/accuracies": 0.75, "rewards/chosen": -2.150601863861084, "rewards/margins": 2.3036932945251465, "rewards/rejected": -4.4542951583862305, "step": 2780 }, { "epoch": 0.6693857965451055, "grad_norm": 10.795967175298328, "learning_rate": 1.488197940946922e-07, "logits/chosen": -1.5843151807785034, "logits/rejected": -1.5326802730560303, "logps/chosen": -501.5411682128906, "logps/rejected": -611.32568359375, "loss": 0.4764, "rewards/accuracies": 0.75, "rewards/chosen": -2.1024060249328613, "rewards/margins": 1.6799418926239014, "rewards/rejected": -3.7823474407196045, "step": 2790 }, { "epoch": 0.6717850287907869, "grad_norm": 28.011002170203856, "learning_rate": 1.4690867648041167e-07, "logits/chosen": -1.5918710231781006, "logits/rejected": -1.4779281616210938, "logps/chosen": -473.7042541503906, "logps/rejected": -662.2276000976562, "loss": 0.5129, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0022711753845215, "rewards/margins": 2.1550307273864746, "rewards/rejected": -4.157301902770996, "step": 2800 }, { "epoch": 0.6741842610364683, "grad_norm": 12.528624583800916, "learning_rate": 1.4500479031289987e-07, "logits/chosen": -1.6873347759246826, "logits/rejected": -1.596543312072754, "logps/chosen": -460.88909912109375, "logps/rejected": -586.3327026367188, "loss": 0.5198, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7791045904159546, "rewards/margins": 1.3285664319992065, "rewards/rejected": -3.1076712608337402, "step": 2810 }, { "epoch": 0.6765834932821497, "grad_norm": 11.065013602236313, "learning_rate": 1.4310826914220747e-07, "logits/chosen": -1.7246005535125732, "logits/rejected": -1.7012087106704712, "logps/chosen": -493.4828186035156, "logps/rejected": -576.1470336914062, "loss": 0.5056, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8482357263565063, "rewards/margins": 0.951060950756073, "rewards/rejected": -2.7992968559265137, "step": 2820 }, { "epoch": 0.6789827255278311, "grad_norm": 11.565962251192635, "learning_rate": 1.412192460017597e-07, "logits/chosen": -1.5370794534683228, "logits/rejected": -1.459987759590149, "logps/chosen": -463.569580078125, "logps/rejected": -711.2032470703125, "loss": 0.5085, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.087894916534424, "rewards/margins": 2.450357675552368, "rewards/rejected": -4.538252353668213, "step": 2830 }, { "epoch": 0.6813819577735125, "grad_norm": 9.118437836656124, "learning_rate": 1.3933785339902504e-07, "logits/chosen": -1.5175681114196777, "logits/rejected": -1.5240638256072998, "logps/chosen": -382.7988586425781, "logps/rejected": -606.9676513671875, "loss": 0.5099, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6014810800552368, "rewards/margins": 1.8436975479125977, "rewards/rejected": -3.445178508758545, "step": 2840 }, { "epoch": 0.6837811900191939, "grad_norm": 8.445711443707477, "learning_rate": 1.374642233062197e-07, "logits/chosen": -1.6221126317977905, "logits/rejected": -1.5964024066925049, "logps/chosen": -470.7940368652344, "logps/rejected": -608.9495239257812, "loss": 0.5176, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7363065481185913, "rewards/margins": 1.7359542846679688, "rewards/rejected": -3.4722609519958496, "step": 2850 }, { "epoch": 0.6861804222648752, "grad_norm": 8.500425735666235, "learning_rate": 1.355984871510511e-07, "logits/chosen": -1.5512058734893799, "logits/rejected": -1.5746601819992065, "logps/chosen": -498.8207092285156, "logps/rejected": -657.9988403320312, "loss": 0.4596, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.9174222946166992, "rewards/margins": 1.5131709575653076, "rewards/rejected": -3.430593490600586, "step": 2860 }, { "epoch": 0.6885796545105566, "grad_norm": 12.852665923928024, "learning_rate": 1.3374077580749783e-07, "logits/chosen": -1.6059240102767944, "logits/rejected": -1.5994579792022705, "logps/chosen": -344.4139404296875, "logps/rejected": -512.6463623046875, "loss": 0.4934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4501264095306396, "rewards/margins": 1.4970743656158447, "rewards/rejected": -2.9472010135650635, "step": 2870 }, { "epoch": 0.690978886756238, "grad_norm": 17.194251207513553, "learning_rate": 1.3189121958663024e-07, "logits/chosen": -1.643771767616272, "logits/rejected": -1.5831727981567383, "logps/chosen": -502.8651428222656, "logps/rejected": -554.9192504882812, "loss": 0.493, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0862865447998047, "rewards/margins": 0.8170648813247681, "rewards/rejected": -2.903351306915283, "step": 2880 }, { "epoch": 0.6933781190019194, "grad_norm": 12.906100640591236, "learning_rate": 1.3004994822746895e-07, "logits/chosen": -1.7391399145126343, "logits/rejected": -1.6716740131378174, "logps/chosen": -431.04180908203125, "logps/rejected": -561.5888671875, "loss": 0.5147, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6912224292755127, "rewards/margins": 1.2724682092666626, "rewards/rejected": -2.9636902809143066, "step": 2890 }, { "epoch": 0.6957773512476008, "grad_norm": 9.565619443461362, "learning_rate": 1.2821709088788434e-07, "logits/chosen": -1.5313981771469116, "logits/rejected": -1.5281095504760742, "logps/chosen": -398.6699523925781, "logps/rejected": -540.1404418945312, "loss": 0.5184, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8083369731903076, "rewards/margins": 1.3905736207962036, "rewards/rejected": -3.198910713195801, "step": 2900 }, { "epoch": 0.6981765834932822, "grad_norm": 15.033651183019835, "learning_rate": 1.2639277613553736e-07, "logits/chosen": -1.469612717628479, "logits/rejected": -1.414353609085083, "logps/chosen": -363.5370178222656, "logps/rejected": -500.765380859375, "loss": 0.486, "rewards/accuracies": 0.75, "rewards/chosen": -1.468360185623169, "rewards/margins": 1.3389683961868286, "rewards/rejected": -2.807328701019287, "step": 2910 }, { "epoch": 0.7005758157389635, "grad_norm": 9.435304971614766, "learning_rate": 1.2457713193885975e-07, "logits/chosen": -1.461320161819458, "logits/rejected": -1.4299747943878174, "logps/chosen": -376.7994079589844, "logps/rejected": -629.6224975585938, "loss": 0.4765, "rewards/accuracies": 0.75, "rewards/chosen": -1.896950364112854, "rewards/margins": 2.1872236728668213, "rewards/rejected": -4.084174156188965, "step": 2920 }, { "epoch": 0.7029750479846449, "grad_norm": 19.51031494795868, "learning_rate": 1.2277028565807838e-07, "logits/chosen": -1.6188557147979736, "logits/rejected": -1.5686419010162354, "logps/chosen": -430.37677001953125, "logps/rejected": -559.7628173828125, "loss": 0.4852, "rewards/accuracies": 0.75, "rewards/chosen": -1.6697213649749756, "rewards/margins": 1.4008890390396118, "rewards/rejected": -3.070610523223877, "step": 2930 }, { "epoch": 0.7053742802303263, "grad_norm": 9.121829865634552, "learning_rate": 1.209723640362815e-07, "logits/chosen": -1.6373344659805298, "logits/rejected": -1.5528385639190674, "logps/chosen": -470.95147705078125, "logps/rejected": -721.1168212890625, "loss": 0.5449, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9253082275390625, "rewards/margins": 2.59212327003479, "rewards/rejected": -4.517431259155273, "step": 2940 }, { "epoch": 0.7077735124760077, "grad_norm": 10.746753751922695, "learning_rate": 1.191834931905277e-07, "logits/chosen": -1.6567723751068115, "logits/rejected": -1.6191829442977905, "logps/chosen": -511.308349609375, "logps/rejected": -676.2968139648438, "loss": 0.5049, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9652636051177979, "rewards/margins": 1.6685765981674194, "rewards/rejected": -3.6338400840759277, "step": 2950 }, { "epoch": 0.710172744721689, "grad_norm": 9.111246533138369, "learning_rate": 1.1740379860299988e-07, "logits/chosen": -1.605023741722107, "logits/rejected": -1.6224443912506104, "logps/chosen": -493.97021484375, "logps/rejected": -663.3172607421875, "loss": 0.517, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9065659046173096, "rewards/margins": 1.4981725215911865, "rewards/rejected": -3.404738664627075, "step": 2960 }, { "epoch": 0.7125719769673704, "grad_norm": 19.195509902532088, "learning_rate": 1.1563340511220254e-07, "logits/chosen": -1.5975620746612549, "logits/rejected": -1.5121543407440186, "logps/chosen": -490.51123046875, "logps/rejected": -638.4617309570312, "loss": 0.5129, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8694756031036377, "rewards/margins": 1.672284722328186, "rewards/rejected": -3.541760206222534, "step": 2970 }, { "epoch": 0.7149712092130518, "grad_norm": 7.617751594360228, "learning_rate": 1.1387243690420556e-07, "logits/chosen": -1.6257362365722656, "logits/rejected": -1.556630253791809, "logps/chosen": -514.2935180664062, "logps/rejected": -719.3594360351562, "loss": 0.5041, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7584432363510132, "rewards/margins": 2.219550371170044, "rewards/rejected": -3.9779934883117676, "step": 2980 }, { "epoch": 0.7173704414587332, "grad_norm": 13.715441326355377, "learning_rate": 1.1212101750393235e-07, "logits/chosen": -1.5823287963867188, "logits/rejected": -1.4760550260543823, "logps/chosen": -454.1546936035156, "logps/rejected": -674.0308837890625, "loss": 0.4824, "rewards/accuracies": 0.875, "rewards/chosen": -1.85116446018219, "rewards/margins": 2.426201581954956, "rewards/rejected": -4.277366638183594, "step": 2990 }, { "epoch": 0.7197696737044146, "grad_norm": 11.870152015817894, "learning_rate": 1.1037926976649562e-07, "logits/chosen": -1.6472456455230713, "logits/rejected": -1.5713512897491455, "logps/chosen": -454.6564025878906, "logps/rejected": -658.142578125, "loss": 0.5097, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7928310632705688, "rewards/margins": 1.8045374155044556, "rewards/rejected": -3.5973682403564453, "step": 3000 }, { "epoch": 0.722168905950096, "grad_norm": 12.133762665442388, "learning_rate": 1.0864731586857936e-07, "logits/chosen": -1.5800144672393799, "logits/rejected": -1.4943416118621826, "logps/chosen": -462.8450622558594, "logps/rejected": -622.0472412109375, "loss": 0.4795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.633138656616211, "rewards/margins": 1.8899364471435547, "rewards/rejected": -3.5230751037597656, "step": 3010 }, { "epoch": 0.7245681381957774, "grad_norm": 15.184681556970155, "learning_rate": 1.0692527729986839e-07, "logits/chosen": -1.6487934589385986, "logits/rejected": -1.5611571073532104, "logps/chosen": -443.3125, "logps/rejected": -629.9989624023438, "loss": 0.4486, "rewards/accuracies": 0.75, "rewards/chosen": -1.7226934432983398, "rewards/margins": 2.09248948097229, "rewards/rejected": -3.81518292427063, "step": 3020 }, { "epoch": 0.7269673704414588, "grad_norm": 8.881606654301349, "learning_rate": 1.0521327485452692e-07, "logits/chosen": -1.50355064868927, "logits/rejected": -1.4316844940185547, "logps/chosen": -454.41375732421875, "logps/rejected": -705.1302490234375, "loss": 0.4685, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0102181434631348, "rewards/margins": 2.672572612762451, "rewards/rejected": -4.682791233062744, "step": 3030 }, { "epoch": 0.7293666026871402, "grad_norm": 12.06878430856091, "learning_rate": 1.0351142862272468e-07, "logits/chosen": -1.5602006912231445, "logits/rejected": -1.397483229637146, "logps/chosen": -426.56011962890625, "logps/rejected": -804.3447265625, "loss": 0.488, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9955031871795654, "rewards/margins": 3.9034297466278076, "rewards/rejected": -5.898933410644531, "step": 3040 }, { "epoch": 0.7317658349328215, "grad_norm": 12.298414208256554, "learning_rate": 1.0181985798221343e-07, "logits/chosen": -1.5111182928085327, "logits/rejected": -1.4830925464630127, "logps/chosen": -453.55706787109375, "logps/rejected": -682.9276123046875, "loss": 0.4944, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8569743633270264, "rewards/margins": 2.276150703430176, "rewards/rejected": -4.133125305175781, "step": 3050 }, { "epoch": 0.7341650671785028, "grad_norm": 11.302209741727303, "learning_rate": 1.0013868158995329e-07, "logits/chosen": -1.479041337966919, "logits/rejected": -1.4589459896087646, "logps/chosen": -488.8414001464844, "logps/rejected": -664.2630615234375, "loss": 0.4896, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.093553066253662, "rewards/margins": 1.9455658197402954, "rewards/rejected": -4.039118766784668, "step": 3060 }, { "epoch": 0.7365642994241842, "grad_norm": 9.425382101973028, "learning_rate": 9.84680173737887e-08, "logits/chosen": -1.6392253637313843, "logits/rejected": -1.6131207942962646, "logps/chosen": -453.7015686035156, "logps/rejected": -568.0528564453125, "loss": 0.505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7246614694595337, "rewards/margins": 1.5568969249725342, "rewards/rejected": -3.2815582752227783, "step": 3070 }, { "epoch": 0.7389635316698656, "grad_norm": 10.214651228125971, "learning_rate": 9.680798252417713e-08, "logits/chosen": -1.5981972217559814, "logits/rejected": -1.5456653833389282, "logps/chosen": -404.54217529296875, "logps/rejected": -615.52490234375, "loss": 0.4716, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8212268352508545, "rewards/margins": 1.7736434936523438, "rewards/rejected": -3.5948708057403564, "step": 3080 }, { "epoch": 0.741362763915547, "grad_norm": 9.96347940855632, "learning_rate": 9.515869348596808e-08, "logits/chosen": -1.7033554315567017, "logits/rejected": -1.689077615737915, "logps/chosen": -479.61602783203125, "logps/rejected": -600.86083984375, "loss": 0.4722, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7726186513900757, "rewards/margins": 1.4355189800262451, "rewards/rejected": -3.2081375122070312, "step": 3090 }, { "epoch": 0.7437619961612284, "grad_norm": 9.182568453922713, "learning_rate": 9.352026595023493e-08, "logits/chosen": -1.7283750772476196, "logits/rejected": -1.651894211769104, "logps/chosen": -472.06890869140625, "logps/rejected": -567.2432250976562, "loss": 0.4948, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7672761678695679, "rewards/margins": 1.1923803091049194, "rewards/rejected": -2.9596564769744873, "step": 3100 }, { "epoch": 0.7461612284069098, "grad_norm": 13.775785773649671, "learning_rate": 9.189281484616004e-08, "logits/chosen": -1.5656628608703613, "logits/rejected": -1.5381602048873901, "logps/chosen": -405.3626403808594, "logps/rejected": -634.8448486328125, "loss": 0.5248, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9110819101333618, "rewards/margins": 1.8859049081802368, "rewards/rejected": -3.7969868183135986, "step": 3110 }, { "epoch": 0.7485604606525912, "grad_norm": 18.73925450329653, "learning_rate": 9.027645433297249e-08, "logits/chosen": -1.5921382904052734, "logits/rejected": -1.3741363286972046, "logps/chosen": -594.7786865234375, "logps/rejected": -781.3391723632812, "loss": 0.5287, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.647385597229004, "rewards/margins": 2.289605140686035, "rewards/rejected": -4.936990737915039, "step": 3120 }, { "epoch": 0.7509596928982726, "grad_norm": 24.661577248495657, "learning_rate": 8.867129779194066e-08, "logits/chosen": -1.5586223602294922, "logits/rejected": -1.4272655248641968, "logps/chosen": -389.38580322265625, "logps/rejected": -706.0331420898438, "loss": 0.5109, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.627242088317871, "rewards/margins": 3.178426742553711, "rewards/rejected": -4.805668830871582, "step": 3130 }, { "epoch": 0.753358925143954, "grad_norm": 13.393978248198778, "learning_rate": 8.707745781841866e-08, "logits/chosen": -1.5590431690216064, "logits/rejected": -1.4452258348464966, "logps/chosen": -470.12890625, "logps/rejected": -711.6036376953125, "loss": 0.5168, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2957184314727783, "rewards/margins": 2.5038769245147705, "rewards/rejected": -4.799595355987549, "step": 3140 }, { "epoch": 0.7557581573896354, "grad_norm": 15.326596868212043, "learning_rate": 8.549504621394831e-08, "logits/chosen": -1.6833692789077759, "logits/rejected": -1.5196136236190796, "logps/chosen": -434.9501953125, "logps/rejected": -797.6722412109375, "loss": 0.4405, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8721044063568115, "rewards/margins": 3.692800998687744, "rewards/rejected": -5.564905166625977, "step": 3150 }, { "epoch": 0.7581573896353166, "grad_norm": 15.888863470601404, "learning_rate": 8.392417397841703e-08, "logits/chosen": -1.576700210571289, "logits/rejected": -1.5513032674789429, "logps/chosen": -457.85125732421875, "logps/rejected": -624.4636840820312, "loss": 0.4774, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7959423065185547, "rewards/margins": 1.5467565059661865, "rewards/rejected": -3.3426990509033203, "step": 3160 }, { "epoch": 0.760556621880998, "grad_norm": 80.57455805524923, "learning_rate": 8.236495130227083e-08, "logits/chosen": -1.5804119110107422, "logits/rejected": -1.4564083814620972, "logps/chosen": -529.6447143554688, "logps/rejected": -735.8713989257812, "loss": 0.5082, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.244884967803955, "rewards/margins": 2.5058138370513916, "rewards/rejected": -4.750698566436768, "step": 3170 }, { "epoch": 0.7629558541266794, "grad_norm": 11.057339252820107, "learning_rate": 8.081748755878612e-08, "logits/chosen": -1.6222680807113647, "logits/rejected": -1.5655839443206787, "logps/chosen": -490.6512756347656, "logps/rejected": -579.689208984375, "loss": 0.494, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9746372699737549, "rewards/margins": 1.4921201467514038, "rewards/rejected": -3.466757297515869, "step": 3180 }, { "epoch": 0.7653550863723608, "grad_norm": 10.618176488071374, "learning_rate": 7.928189129639632e-08, "logits/chosen": -1.5830059051513672, "logits/rejected": -1.5720051527023315, "logps/chosen": -406.87188720703125, "logps/rejected": -586.8067016601562, "loss": 0.4807, "rewards/accuracies": 0.875, "rewards/chosen": -1.686078429222107, "rewards/margins": 1.659711241722107, "rewards/rejected": -3.345789670944214, "step": 3190 }, { "epoch": 0.7677543186180422, "grad_norm": 13.680452637120762, "learning_rate": 7.775827023107834e-08, "logits/chosen": -1.6163785457611084, "logits/rejected": -1.5762126445770264, "logps/chosen": -446.047607421875, "logps/rejected": -623.6538696289062, "loss": 0.5153, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9947631359100342, "rewards/margins": 1.5673038959503174, "rewards/rejected": -3.5620665550231934, "step": 3200 }, { "epoch": 0.7701535508637236, "grad_norm": 14.433630119675083, "learning_rate": 7.624673123879682e-08, "logits/chosen": -1.7464625835418701, "logits/rejected": -1.6586641073226929, "logps/chosen": -430.14776611328125, "logps/rejected": -545.7342529296875, "loss": 0.5204, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7657415866851807, "rewards/margins": 1.3537575006484985, "rewards/rejected": -3.1194987297058105, "step": 3210 }, { "epoch": 0.772552783109405, "grad_norm": 12.66091896209343, "learning_rate": 7.474738034800663e-08, "logits/chosen": -1.5930930376052856, "logits/rejected": -1.4509809017181396, "logps/chosen": -376.0271301269531, "logps/rejected": -682.3421630859375, "loss": 0.5118, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4719572067260742, "rewards/margins": 3.3040995597839355, "rewards/rejected": -4.776057243347168, "step": 3220 }, { "epoch": 0.7749520153550864, "grad_norm": 10.980304828194955, "learning_rate": 7.326032273221606e-08, "logits/chosen": -1.6833438873291016, "logits/rejected": -1.622230887413025, "logps/chosen": -491.75860595703125, "logps/rejected": -625.6595458984375, "loss": 0.4872, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9277355670928955, "rewards/margins": 1.5761892795562744, "rewards/rejected": -3.503924608230591, "step": 3230 }, { "epoch": 0.7773512476007678, "grad_norm": 10.749977033774096, "learning_rate": 7.178566270260872e-08, "logits/chosen": -1.5899261236190796, "logits/rejected": -1.5829485654830933, "logps/chosen": -477.94805908203125, "logps/rejected": -673.7672729492188, "loss": 0.5126, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0385262966156006, "rewards/margins": 1.790924072265625, "rewards/rejected": -3.8294501304626465, "step": 3240 }, { "epoch": 0.7797504798464492, "grad_norm": 8.484431263168627, "learning_rate": 7.032350370072709e-08, "logits/chosen": -1.6738132238388062, "logits/rejected": -1.6105749607086182, "logps/chosen": -458.83331298828125, "logps/rejected": -622.7803955078125, "loss": 0.4562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.730473518371582, "rewards/margins": 1.7545102834701538, "rewards/rejected": -3.4849839210510254, "step": 3250 }, { "epoch": 0.7821497120921305, "grad_norm": 9.387892269267574, "learning_rate": 6.887394829121596e-08, "logits/chosen": -1.5848913192749023, "logits/rejected": -1.4476993083953857, "logps/chosen": -471.938720703125, "logps/rejected": -831.1727294921875, "loss": 0.4836, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8803211450576782, "rewards/margins": 3.8900272846221924, "rewards/rejected": -5.77034854888916, "step": 3260 }, { "epoch": 0.7845489443378119, "grad_norm": 18.641825226968827, "learning_rate": 6.743709815462833e-08, "logits/chosen": -1.6654167175292969, "logits/rejected": -1.5044206380844116, "logps/chosen": -483.08917236328125, "logps/rejected": -693.2271728515625, "loss": 0.4752, "rewards/accuracies": 0.75, "rewards/chosen": -2.000235080718994, "rewards/margins": 2.46742582321167, "rewards/rejected": -4.467661380767822, "step": 3270 }, { "epoch": 0.7869481765834933, "grad_norm": 12.349301851226299, "learning_rate": 6.601305408029287e-08, "logits/chosen": -1.5291095972061157, "logits/rejected": -1.419311285018921, "logps/chosen": -436.09454345703125, "logps/rejected": -668.6248779296875, "loss": 0.4694, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8284947872161865, "rewards/margins": 2.354825496673584, "rewards/rejected": -4.18332052230835, "step": 3280 }, { "epoch": 0.7893474088291746, "grad_norm": 9.060603124592086, "learning_rate": 6.460191595924366e-08, "logits/chosen": -1.6744062900543213, "logits/rejected": -1.6133877038955688, "logps/chosen": -455.89495849609375, "logps/rejected": -635.6976318359375, "loss": 0.4573, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.951505422592163, "rewards/margins": 1.7754148244857788, "rewards/rejected": -3.7269206047058105, "step": 3290 }, { "epoch": 0.791746641074856, "grad_norm": 12.585511956386624, "learning_rate": 6.320378277721342e-08, "logits/chosen": -1.598135232925415, "logits/rejected": -1.6028814315795898, "logps/chosen": -483.92535400390625, "logps/rejected": -569.0828247070312, "loss": 0.4735, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1339850425720215, "rewards/margins": 1.0029327869415283, "rewards/rejected": -3.136918067932129, "step": 3300 }, { "epoch": 0.7941458733205374, "grad_norm": 13.389579263658327, "learning_rate": 6.181875260769032e-08, "logits/chosen": -1.640591025352478, "logits/rejected": -1.5519497394561768, "logps/chosen": -507.2039489746094, "logps/rejected": -642.580322265625, "loss": 0.4724, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0453391075134277, "rewards/margins": 2.058948040008545, "rewards/rejected": -4.104287147521973, "step": 3310 }, { "epoch": 0.7965451055662188, "grad_norm": 13.291226613775438, "learning_rate": 6.044692260503797e-08, "logits/chosen": -1.569854736328125, "logits/rejected": -1.5137242078781128, "logps/chosen": -525.5565185546875, "logps/rejected": -748.2379150390625, "loss": 0.443, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.161888360977173, "rewards/margins": 2.5372633934020996, "rewards/rejected": -4.699151515960693, "step": 3320 }, { "epoch": 0.7989443378119002, "grad_norm": 11.742394321174341, "learning_rate": 5.9088388997680984e-08, "logits/chosen": -1.680760383605957, "logits/rejected": -1.5625580549240112, "logps/chosen": -540.314453125, "logps/rejected": -675.5418090820312, "loss": 0.4639, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9422404766082764, "rewards/margins": 2.1821341514587402, "rewards/rejected": -4.1243743896484375, "step": 3330 }, { "epoch": 0.8013435700575816, "grad_norm": 10.388328753668427, "learning_rate": 5.774324708135439e-08, "logits/chosen": -1.631049394607544, "logits/rejected": -1.5399057865142822, "logps/chosen": -395.74127197265625, "logps/rejected": -528.3821411132812, "loss": 0.4906, "rewards/accuracies": 0.75, "rewards/chosen": -1.6879892349243164, "rewards/margins": 1.5684130191802979, "rewards/rejected": -3.2564022541046143, "step": 3340 }, { "epoch": 0.803742802303263, "grad_norm": 8.566813415550278, "learning_rate": 5.641159121241953e-08, "logits/chosen": -1.5318087339401245, "logits/rejected": -1.3939533233642578, "logps/chosen": -455.18475341796875, "logps/rejected": -784.6776123046875, "loss": 0.4742, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1146061420440674, "rewards/margins": 3.002748966217041, "rewards/rejected": -5.1173553466796875, "step": 3350 }, { "epoch": 0.8061420345489443, "grad_norm": 24.581271633624063, "learning_rate": 5.5093514801245106e-08, "logits/chosen": -1.572483777999878, "logits/rejected": -1.5058854818344116, "logps/chosen": -462.61798095703125, "logps/rejected": -692.5750732421875, "loss": 0.5167, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.017944812774658, "rewards/margins": 2.093128204345703, "rewards/rejected": -4.111073017120361, "step": 3360 }, { "epoch": 0.8085412667946257, "grad_norm": 11.15477109986071, "learning_rate": 5.378911030565453e-08, "logits/chosen": -1.6517555713653564, "logits/rejected": -1.6141020059585571, "logps/chosen": -524.3145751953125, "logps/rejected": -706.8885498046875, "loss": 0.4856, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.254424571990967, "rewards/margins": 1.5656322240829468, "rewards/rejected": -3.820056438446045, "step": 3370 }, { "epoch": 0.8109404990403071, "grad_norm": 14.053000697829326, "learning_rate": 5.249846922444101e-08, "logits/chosen": -1.565073013305664, "logits/rejected": -1.3497178554534912, "logps/chosen": -454.67254638671875, "logps/rejected": -954.7060546875, "loss": 0.4615, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2211482524871826, "rewards/margins": 5.220716953277588, "rewards/rejected": -7.441864967346191, "step": 3380 }, { "epoch": 0.8133397312859885, "grad_norm": 14.873913908271929, "learning_rate": 5.122168209094865e-08, "logits/chosen": -1.5818357467651367, "logits/rejected": -1.6091349124908447, "logps/chosen": -396.847412109375, "logps/rejected": -471.558837890625, "loss": 0.4921, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7213027477264404, "rewards/margins": 0.7357383966445923, "rewards/rejected": -2.457041025161743, "step": 3390 }, { "epoch": 0.8157389635316699, "grad_norm": 7.662127553049077, "learning_rate": 4.995883846672222e-08, "logits/chosen": -1.7527958154678345, "logits/rejected": -1.6081327199935913, "logps/chosen": -597.8883666992188, "logps/rejected": -634.3714599609375, "loss": 0.4928, "rewards/accuracies": 0.75, "rewards/chosen": -2.129807472229004, "rewards/margins": 1.1348294019699097, "rewards/rejected": -3.2646374702453613, "step": 3400 }, { "epoch": 0.8181381957773513, "grad_norm": 9.295843025722531, "learning_rate": 4.871002693522486e-08, "logits/chosen": -1.6534563302993774, "logits/rejected": -1.5655500888824463, "logps/chosen": -467.03948974609375, "logps/rejected": -591.6249389648438, "loss": 0.4943, "rewards/accuracies": 0.75, "rewards/chosen": -1.7798774242401123, "rewards/margins": 1.6510403156280518, "rewards/rejected": -3.4309182167053223, "step": 3410 }, { "epoch": 0.8205374280230326, "grad_norm": 11.329859485907628, "learning_rate": 4.7475335095623956e-08, "logits/chosen": -1.6167590618133545, "logits/rejected": -1.5307334661483765, "logps/chosen": -470.58270263671875, "logps/rejected": -668.56640625, "loss": 0.4725, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9665504693984985, "rewards/margins": 2.131230592727661, "rewards/rejected": -4.097781181335449, "step": 3420 }, { "epoch": 0.822936660268714, "grad_norm": 20.11235991117846, "learning_rate": 4.6254849556646714e-08, "logits/chosen": -1.670371413230896, "logits/rejected": -1.4920861721038818, "logps/chosen": -548.06787109375, "logps/rejected": -802.3192138671875, "loss": 0.5013, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2579779624938965, "rewards/margins": 3.0247139930725098, "rewards/rejected": -5.282691955566406, "step": 3430 }, { "epoch": 0.8253358925143954, "grad_norm": 11.445798775826706, "learning_rate": 4.504865593050483e-08, "logits/chosen": -1.6630195379257202, "logits/rejected": -1.5998250246047974, "logps/chosen": -498.11968994140625, "logps/rejected": -625.1238403320312, "loss": 0.5091, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1429293155670166, "rewards/margins": 1.2725276947021484, "rewards/rejected": -3.415456771850586, "step": 3440 }, { "epoch": 0.8277351247600768, "grad_norm": 11.346669302814137, "learning_rate": 4.385683882688895e-08, "logits/chosen": -1.723249077796936, "logits/rejected": -1.665723443031311, "logps/chosen": -476.0176696777344, "logps/rejected": -490.48779296875, "loss": 0.5342, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8500604629516602, "rewards/margins": 0.6172209978103638, "rewards/rejected": -2.4672813415527344, "step": 3450 }, { "epoch": 0.8301343570057581, "grad_norm": 15.969215547104008, "learning_rate": 4.2679481847033985e-08, "logits/chosen": -1.588076114654541, "logits/rejected": -1.5232843160629272, "logps/chosen": -461.88848876953125, "logps/rejected": -622.911865234375, "loss": 0.5174, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8116493225097656, "rewards/margins": 1.6616909503936768, "rewards/rejected": -3.4733402729034424, "step": 3460 }, { "epoch": 0.8325335892514395, "grad_norm": 11.072153331210435, "learning_rate": 4.151666757785435e-08, "logits/chosen": -1.5928449630737305, "logits/rejected": -1.5169405937194824, "logps/chosen": -427.0491638183594, "logps/rejected": -672.35107421875, "loss": 0.4681, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7070322036743164, "rewards/margins": 2.5327906608581543, "rewards/rejected": -4.239823341369629, "step": 3470 }, { "epoch": 0.8349328214971209, "grad_norm": 9.49570882884042, "learning_rate": 4.036847758615136e-08, "logits/chosen": -1.6075379848480225, "logits/rejected": -1.5888822078704834, "logps/chosen": -491.94390869140625, "logps/rejected": -652.8501586914062, "loss": 0.5274, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3509058952331543, "rewards/margins": 1.588134765625, "rewards/rejected": -3.939040422439575, "step": 3480 }, { "epoch": 0.8373320537428023, "grad_norm": 11.923530456024624, "learning_rate": 3.923499241289113e-08, "logits/chosen": -1.6823714971542358, "logits/rejected": -1.593367338180542, "logps/chosen": -547.1072998046875, "logps/rejected": -672.8248901367188, "loss": 0.5441, "rewards/accuracies": 0.75, "rewards/chosen": -2.304713249206543, "rewards/margins": 1.8532909154891968, "rewards/rejected": -4.158003807067871, "step": 3490 }, { "epoch": 0.8397312859884837, "grad_norm": 10.522005050954938, "learning_rate": 3.811629156755541e-08, "logits/chosen": -1.6868102550506592, "logits/rejected": -1.6846917867660522, "logps/chosen": -485.7550354003906, "logps/rejected": -589.5763549804688, "loss": 0.4966, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7846243381500244, "rewards/margins": 1.1891510486602783, "rewards/rejected": -2.9737753868103027, "step": 3500 }, { "epoch": 0.8421305182341651, "grad_norm": 9.525223827769382, "learning_rate": 3.701245352256391e-08, "logits/chosen": -1.6697797775268555, "logits/rejected": -1.6646827459335327, "logps/chosen": -471.4710998535156, "logps/rejected": -512.9531860351562, "loss": 0.4984, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.646978735923767, "rewards/margins": 0.6669288873672485, "rewards/rejected": -2.3139073848724365, "step": 3510 }, { "epoch": 0.8445297504798465, "grad_norm": 12.160402315077537, "learning_rate": 3.592355570776984e-08, "logits/chosen": -1.6833436489105225, "logits/rejected": -1.6594688892364502, "logps/chosen": -362.41400146484375, "logps/rejected": -500.51666259765625, "loss": 0.472, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2686312198638916, "rewards/margins": 1.350454568862915, "rewards/rejected": -2.6190860271453857, "step": 3520 }, { "epoch": 0.8469289827255279, "grad_norm": 8.805306061647622, "learning_rate": 3.484967450502904e-08, "logits/chosen": -1.5704456567764282, "logits/rejected": -1.5578100681304932, "logps/chosen": -376.8490905761719, "logps/rejected": -579.5418701171875, "loss": 0.4634, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.635333776473999, "rewards/margins": 1.6225509643554688, "rewards/rejected": -3.257884979248047, "step": 3530 }, { "epoch": 0.8493282149712092, "grad_norm": 12.572003529621947, "learning_rate": 3.3790885242841296e-08, "logits/chosen": -1.6139347553253174, "logits/rejected": -1.5461299419403076, "logps/chosen": -473.8414611816406, "logps/rejected": -705.4661865234375, "loss": 0.4651, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.090013027191162, "rewards/margins": 2.4070258140563965, "rewards/rejected": -4.4970383644104, "step": 3540 }, { "epoch": 0.8517274472168906, "grad_norm": 14.913426265894294, "learning_rate": 3.274726219106677e-08, "logits/chosen": -1.657772421836853, "logits/rejected": -1.6069705486297607, "logps/chosen": -506.9501037597656, "logps/rejected": -699.5079345703125, "loss": 0.4897, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0737128257751465, "rewards/margins": 2.0126349925994873, "rewards/rejected": -4.086348056793213, "step": 3550 }, { "epoch": 0.8541266794625719, "grad_norm": 13.051269160073966, "learning_rate": 3.171887855571642e-08, "logits/chosen": -1.714739203453064, "logits/rejected": -1.664184808731079, "logps/chosen": -409.457763671875, "logps/rejected": -502.5277404785156, "loss": 0.484, "rewards/accuracies": 0.75, "rewards/chosen": -1.7403171062469482, "rewards/margins": 0.9889610409736633, "rewards/rejected": -2.729278087615967, "step": 3560 }, { "epoch": 0.8565259117082533, "grad_norm": 9.061162365072834, "learning_rate": 3.070580647381643e-08, "logits/chosen": -1.6095638275146484, "logits/rejected": -1.5605108737945557, "logps/chosen": -442.99029541015625, "logps/rejected": -652.7391357421875, "loss": 0.5072, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9627918004989624, "rewards/margins": 2.100388288497925, "rewards/rejected": -4.063180446624756, "step": 3570 }, { "epoch": 0.8589251439539347, "grad_norm": 13.034931697815875, "learning_rate": 2.9708117008348576e-08, "logits/chosen": -1.6348292827606201, "logits/rejected": -1.5857656002044678, "logps/chosen": -506.25128173828125, "logps/rejected": -560.7322387695312, "loss": 0.4512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7951558828353882, "rewards/margins": 1.0923255681991577, "rewards/rejected": -2.887481212615967, "step": 3580 }, { "epoch": 0.8613243761996161, "grad_norm": 16.47510836794499, "learning_rate": 2.8725880143264992e-08, "logits/chosen": -1.652269959449768, "logits/rejected": -1.6364988088607788, "logps/chosen": -473.0938415527344, "logps/rejected": -605.9546508789062, "loss": 0.5383, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1653454303741455, "rewards/margins": 0.9054538011550903, "rewards/rejected": -3.070798873901367, "step": 3590 }, { "epoch": 0.8637236084452975, "grad_norm": 16.558002870622833, "learning_rate": 2.775916477857948e-08, "logits/chosen": -1.5733797550201416, "logits/rejected": -1.5531814098358154, "logps/chosen": -409.64007568359375, "logps/rejected": -545.078857421875, "loss": 0.4997, "rewards/accuracies": 0.75, "rewards/chosen": -1.9042761325836182, "rewards/margins": 1.3297529220581055, "rewards/rejected": -3.2340290546417236, "step": 3600 }, { "epoch": 0.8661228406909789, "grad_norm": 17.718409075113666, "learning_rate": 2.680803872553408e-08, "logits/chosen": -1.6162738800048828, "logits/rejected": -1.422507643699646, "logps/chosen": -433.74639892578125, "logps/rejected": -798.6219482421875, "loss": 0.4981, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6543105840682983, "rewards/margins": 3.9157538414001465, "rewards/rejected": -5.570064067840576, "step": 3610 }, { "epoch": 0.8685220729366603, "grad_norm": 19.479387945921662, "learning_rate": 2.5872568701842706e-08, "logits/chosen": -1.5644371509552002, "logits/rejected": -1.5578614473342896, "logps/chosen": -391.2760314941406, "logps/rejected": -588.4986572265625, "loss": 0.4988, "rewards/accuracies": 0.75, "rewards/chosen": -1.6809015274047852, "rewards/margins": 1.7512578964233398, "rewards/rejected": -3.432159423828125, "step": 3620 }, { "epoch": 0.8709213051823417, "grad_norm": 15.579421392343054, "learning_rate": 2.495282032701096e-08, "logits/chosen": -1.626412034034729, "logits/rejected": -1.508430004119873, "logps/chosen": -362.73089599609375, "logps/rejected": -497.5302734375, "loss": 0.4601, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5209667682647705, "rewards/margins": 1.7109506130218506, "rewards/rejected": -3.231917142868042, "step": 3630 }, { "epoch": 0.8733205374280231, "grad_norm": 12.975221010220178, "learning_rate": 2.4048858117733133e-08, "logits/chosen": -1.6329269409179688, "logits/rejected": -1.472723126411438, "logps/chosen": -473.1459045410156, "logps/rejected": -683.5482788085938, "loss": 0.4439, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.913275957107544, "rewards/margins": 2.6152966022491455, "rewards/rejected": -4.528572082519531, "step": 3640 }, { "epoch": 0.8757197696737045, "grad_norm": 14.423256562483388, "learning_rate": 2.3160745483366938e-08, "logits/chosen": -1.6391162872314453, "logits/rejected": -1.671979546546936, "logps/chosen": -443.46258544921875, "logps/rejected": -672.6021118164062, "loss": 0.476, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9398761987686157, "rewards/margins": 2.000032901763916, "rewards/rejected": -3.9399094581604004, "step": 3650 }, { "epoch": 0.8781190019193857, "grad_norm": 9.425763927010763, "learning_rate": 2.2288544721485197e-08, "logits/chosen": -1.6494081020355225, "logits/rejected": -1.5929086208343506, "logps/chosen": -396.45013427734375, "logps/rejected": -661.6722412109375, "loss": 0.4608, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.6658204793930054, "rewards/margins": 2.458636999130249, "rewards/rejected": -4.124457359313965, "step": 3660 }, { "epoch": 0.8805182341650671, "grad_norm": 21.417485675065244, "learning_rate": 2.1432317013506117e-08, "logits/chosen": -1.740401268005371, "logits/rejected": -1.6570911407470703, "logps/chosen": -494.31964111328125, "logps/rejected": -612.0262451171875, "loss": 0.5057, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1772966384887695, "rewards/margins": 1.6181838512420654, "rewards/rejected": -3.795480728149414, "step": 3670 }, { "epoch": 0.8829174664107485, "grad_norm": 16.00309629191891, "learning_rate": 2.0592122420401704e-08, "logits/chosen": -1.5891798734664917, "logits/rejected": -1.5248258113861084, "logps/chosen": -467.4602966308594, "logps/rejected": -596.62060546875, "loss": 0.526, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2219550609588623, "rewards/margins": 1.2901359796524048, "rewards/rejected": -3.5120906829833984, "step": 3680 }, { "epoch": 0.8853166986564299, "grad_norm": 13.429038672487438, "learning_rate": 1.976801987848459e-08, "logits/chosen": -1.6623175144195557, "logits/rejected": -1.6009843349456787, "logps/chosen": -486.54345703125, "logps/rejected": -719.8893432617188, "loss": 0.5037, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.023263931274414, "rewards/margins": 2.204184055328369, "rewards/rejected": -4.227447986602783, "step": 3690 }, { "epoch": 0.8877159309021113, "grad_norm": 26.48164665804949, "learning_rate": 1.8960067195273987e-08, "logits/chosen": -1.6708219051361084, "logits/rejected": -1.5749927759170532, "logps/chosen": -400.6322937011719, "logps/rejected": -630.9937133789062, "loss": 0.5018, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7153675556182861, "rewards/margins": 2.3315227031707764, "rewards/rejected": -4.0468902587890625, "step": 3700 }, { "epoch": 0.8901151631477927, "grad_norm": 11.411704364971511, "learning_rate": 1.816832104544072e-08, "logits/chosen": -1.564396619796753, "logits/rejected": -1.4874539375305176, "logps/chosen": -481.55242919921875, "logps/rejected": -635.627685546875, "loss": 0.4544, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.952723503112793, "rewards/margins": 1.7845569849014282, "rewards/rejected": -3.7372806072235107, "step": 3710 }, { "epoch": 0.8925143953934741, "grad_norm": 15.030315396145902, "learning_rate": 1.7392836966831553e-08, "logits/chosen": -1.5914599895477295, "logits/rejected": -1.4839370250701904, "logps/chosen": -488.05621337890625, "logps/rejected": -683.0465698242188, "loss": 0.465, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.0653412342071533, "rewards/margins": 2.3046443462371826, "rewards/rejected": -4.369986057281494, "step": 3720 }, { "epoch": 0.8949136276391555, "grad_norm": 11.782758949527297, "learning_rate": 1.663366935657373e-08, "logits/chosen": -1.6213829517364502, "logits/rejected": -1.4937806129455566, "logps/chosen": -420.52911376953125, "logps/rejected": -647.6318969726562, "loss": 0.514, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7656971216201782, "rewards/margins": 2.258690357208252, "rewards/rejected": -4.024387836456299, "step": 3730 }, { "epoch": 0.8973128598848369, "grad_norm": 19.85256790289952, "learning_rate": 1.5890871467258898e-08, "logits/chosen": -1.6818307638168335, "logits/rejected": -1.6265443563461304, "logps/chosen": -564.2450561523438, "logps/rejected": -681.9746704101562, "loss": 0.5019, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.256525754928589, "rewards/margins": 1.55268394947052, "rewards/rejected": -3.8092098236083984, "step": 3740 }, { "epoch": 0.8997120921305183, "grad_norm": 12.586313589978424, "learning_rate": 1.5164495403207967e-08, "logits/chosen": -1.700171709060669, "logits/rejected": -1.6361474990844727, "logps/chosen": -475.1417541503906, "logps/rejected": -746.2467041015625, "loss": 0.4623, "rewards/accuracies": 0.75, "rewards/chosen": -2.0661838054656982, "rewards/margins": 2.443232297897339, "rewards/rejected": -4.509416103363037, "step": 3750 }, { "epoch": 0.9021113243761996, "grad_norm": 12.787222537511584, "learning_rate": 1.4454592116815962e-08, "logits/chosen": -1.593766450881958, "logits/rejected": -1.5601489543914795, "logps/chosen": -449.85858154296875, "logps/rejected": -639.9388427734375, "loss": 0.445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.833129644393921, "rewards/margins": 1.7945473194122314, "rewards/rejected": -3.6276767253875732, "step": 3760 }, { "epoch": 0.904510556621881, "grad_norm": 10.429284153734484, "learning_rate": 1.3761211404977934e-08, "logits/chosen": -1.5752068758010864, "logits/rejected": -1.422378659248352, "logps/chosen": -471.98876953125, "logps/rejected": -817.8197021484375, "loss": 0.442, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2775814533233643, "rewards/margins": 3.5992565155029297, "rewards/rejected": -5.876837730407715, "step": 3770 }, { "epoch": 0.9069097888675623, "grad_norm": 12.741709586667612, "learning_rate": 1.3084401905596177e-08, "logits/chosen": -1.7176425457000732, "logits/rejected": -1.6414988040924072, "logps/chosen": -503.9371643066406, "logps/rejected": -640.4576416015625, "loss": 0.4707, "rewards/accuracies": 0.75, "rewards/chosen": -1.8939365148544312, "rewards/margins": 1.9764692783355713, "rewards/rejected": -3.870405912399292, "step": 3780 }, { "epoch": 0.9093090211132437, "grad_norm": 11.675903630631732, "learning_rate": 1.2424211094168053e-08, "logits/chosen": -1.6118358373641968, "logits/rejected": -1.5552772283554077, "logps/chosen": -522.3504638671875, "logps/rejected": -701.8868408203125, "loss": 0.5099, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.871604323387146, "rewards/margins": 1.9561458826065063, "rewards/rejected": -3.8277504444122314, "step": 3790 }, { "epoch": 0.9117082533589251, "grad_norm": 11.816100782837214, "learning_rate": 1.1780685280456143e-08, "logits/chosen": -1.6721569299697876, "logits/rejected": -1.6179077625274658, "logps/chosen": -539.4227905273438, "logps/rejected": -888.7571411132812, "loss": 0.4959, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.298069477081299, "rewards/margins": 3.5056235790252686, "rewards/rejected": -5.803693771362305, "step": 3800 }, { "epoch": 0.9141074856046065, "grad_norm": 11.727624985469124, "learning_rate": 1.1153869605239564e-08, "logits/chosen": -1.6193329095840454, "logits/rejected": -1.5656651258468628, "logps/chosen": -435.56158447265625, "logps/rejected": -478.83770751953125, "loss": 0.4958, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5856086015701294, "rewards/margins": 0.8730261921882629, "rewards/rejected": -2.458634614944458, "step": 3810 }, { "epoch": 0.9165067178502879, "grad_norm": 16.85961078114398, "learning_rate": 1.0543808037147606e-08, "logits/chosen": -1.6750271320343018, "logits/rejected": -1.6107738018035889, "logps/chosen": -503.64471435546875, "logps/rejected": -785.1126708984375, "loss": 0.4807, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2831358909606934, "rewards/margins": 2.812958240509033, "rewards/rejected": -5.096093654632568, "step": 3820 }, { "epoch": 0.9189059500959693, "grad_norm": 12.461972445162298, "learning_rate": 9.95054336957557e-09, "logits/chosen": -1.734662652015686, "logits/rejected": -1.739912748336792, "logps/chosen": -434.0250549316406, "logps/rejected": -562.7544555664062, "loss": 0.4761, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7552568912506104, "rewards/margins": 1.2200746536254883, "rewards/rejected": -2.9753317832946777, "step": 3830 }, { "epoch": 0.9213051823416507, "grad_norm": 12.522523399418205, "learning_rate": 9.37411721768286e-09, "logits/chosen": -1.5948150157928467, "logits/rejected": -1.5764580965042114, "logps/chosen": -513.3563232421875, "logps/rejected": -767.2200927734375, "loss": 0.474, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.226046085357666, "rewards/margins": 2.2817482948303223, "rewards/rejected": -4.507794380187988, "step": 3840 }, { "epoch": 0.9237044145873321, "grad_norm": 10.9488041002692, "learning_rate": 8.81457001547392e-09, "logits/chosen": -1.6462711095809937, "logits/rejected": -1.6491082906723022, "logps/chosen": -459.8772888183594, "logps/rejected": -560.5117797851562, "loss": 0.4729, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.011331558227539, "rewards/margins": 0.9149211645126343, "rewards/rejected": -2.9262523651123047, "step": 3850 }, { "epoch": 0.9261036468330134, "grad_norm": 9.649161518168588, "learning_rate": 8.271941012961942e-09, "logits/chosen": -1.5380823612213135, "logits/rejected": -1.412889838218689, "logps/chosen": -441.87335205078125, "logps/rejected": -816.4763793945312, "loss": 0.4613, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1320416927337646, "rewards/margins": 3.2392337322235107, "rewards/rejected": -5.371275424957275, "step": 3860 }, { "epoch": 0.9285028790786948, "grad_norm": 11.267633686817604, "learning_rate": 7.746268273415568e-09, "logits/chosen": -1.648723840713501, "logits/rejected": -1.6860382556915283, "logps/chosen": -460.56036376953125, "logps/rejected": -567.8131103515625, "loss": 0.4806, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.851995825767517, "rewards/margins": 0.5843394994735718, "rewards/rejected": -2.436335325241089, "step": 3870 }, { "epoch": 0.9309021113243762, "grad_norm": 11.04068659513372, "learning_rate": 7.237588670689076e-09, "logits/chosen": -1.7109369039535522, "logits/rejected": -1.5930227041244507, "logps/chosen": -468.35186767578125, "logps/rejected": -685.10986328125, "loss": 0.4614, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9170551300048828, "rewards/margins": 2.5844249725341797, "rewards/rejected": -4.5014801025390625, "step": 3880 }, { "epoch": 0.9333013435700576, "grad_norm": 14.964168805394552, "learning_rate": 6.745937886635606e-09, "logits/chosen": -1.6948446035385132, "logits/rejected": -1.5991394519805908, "logps/chosen": -517.13623046875, "logps/rejected": -857.3069458007812, "loss": 0.4527, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1508491039276123, "rewards/margins": 3.447594404220581, "rewards/rejected": -5.598443508148193, "step": 3890 }, { "epoch": 0.935700575815739, "grad_norm": 14.883238902792032, "learning_rate": 6.271350408604409e-09, "logits/chosen": -1.6298729181289673, "logits/rejected": -1.5697710514068604, "logps/chosen": -407.47216796875, "logps/rejected": -590.4325561523438, "loss": 0.4973, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.779510259628296, "rewards/margins": 1.6561663150787354, "rewards/rejected": -3.4356765747070312, "step": 3900 }, { "epoch": 0.9380998080614203, "grad_norm": 11.184941495697121, "learning_rate": 5.813859527021487e-09, "logits/chosen": -1.5353864431381226, "logits/rejected": -1.5042493343353271, "logps/chosen": -483.96435546875, "logps/rejected": -689.84130859375, "loss": 0.4324, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.045846462249756, "rewards/margins": 2.3413102626800537, "rewards/rejected": -4.387156963348389, "step": 3910 }, { "epoch": 0.9404990403071017, "grad_norm": 14.299734951667778, "learning_rate": 5.373497333054616e-09, "logits/chosen": -1.6646445989608765, "logits/rejected": -1.649578332901001, "logps/chosen": -482.82843017578125, "logps/rejected": -551.9417724609375, "loss": 0.489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0101306438446045, "rewards/margins": 0.7671645879745483, "rewards/rejected": -2.777294874191284, "step": 3920 }, { "epoch": 0.9428982725527831, "grad_norm": 10.602051511979361, "learning_rate": 4.950294716362213e-09, "logits/chosen": -1.6949056386947632, "logits/rejected": -1.670143723487854, "logps/chosen": -504.10821533203125, "logps/rejected": -612.98876953125, "loss": 0.4873, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.004544734954834, "rewards/margins": 1.1068131923675537, "rewards/rejected": -3.111358165740967, "step": 3930 }, { "epoch": 0.9452975047984645, "grad_norm": 10.528232336553208, "learning_rate": 4.544281362926422e-09, "logits/chosen": -1.7100117206573486, "logits/rejected": -1.6325490474700928, "logps/chosen": -508.181640625, "logps/rejected": -681.7391967773438, "loss": 0.5311, "rewards/accuracies": 0.75, "rewards/chosen": -1.9281724691390991, "rewards/margins": 1.9070625305175781, "rewards/rejected": -3.835235595703125, "step": 3940 }, { "epoch": 0.9476967370441459, "grad_norm": 11.26583083013792, "learning_rate": 4.15548575297095e-09, "logits/chosen": -1.668421745300293, "logits/rejected": -1.5268778800964355, "logps/chosen": -452.11077880859375, "logps/rejected": -725.6536254882812, "loss": 0.4537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9140384197235107, "rewards/margins": 2.8626558780670166, "rewards/rejected": -4.776694297790527, "step": 3950 }, { "epoch": 0.9500959692898272, "grad_norm": 9.866263268327208, "learning_rate": 3.7839351589631366e-09, "logits/chosen": -1.6612355709075928, "logits/rejected": -1.623468041419983, "logps/chosen": -477.5087890625, "logps/rejected": -704.5999755859375, "loss": 0.5035, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.534862518310547, "rewards/margins": 1.8013521432876587, "rewards/rejected": -4.336214065551758, "step": 3960 }, { "epoch": 0.9524952015355086, "grad_norm": 11.105179609469939, "learning_rate": 3.4296556437010405e-09, "logits/chosen": -1.6372588872909546, "logits/rejected": -1.5610383749008179, "logps/chosen": -448.084228515625, "logps/rejected": -620.5638427734375, "loss": 0.4961, "rewards/accuracies": 0.75, "rewards/chosen": -2.273430585861206, "rewards/margins": 1.8218492269515991, "rewards/rejected": -4.095280170440674, "step": 3970 }, { "epoch": 0.95489443378119, "grad_norm": 15.432439336051313, "learning_rate": 3.092672058485124e-09, "logits/chosen": -1.6299177408218384, "logits/rejected": -1.5185959339141846, "logps/chosen": -480.63623046875, "logps/rejected": -749.26171875, "loss": 0.5051, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3645920753479004, "rewards/margins": 2.5585074424743652, "rewards/rejected": -4.923099517822266, "step": 3980 }, { "epoch": 0.9572936660268714, "grad_norm": 13.295737344758477, "learning_rate": 2.7730080413750356e-09, "logits/chosen": -1.6487003564834595, "logits/rejected": -1.616842269897461, "logps/chosen": -471.357421875, "logps/rejected": -603.6805419921875, "loss": 0.4891, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8420718908309937, "rewards/margins": 1.3825440406799316, "rewards/rejected": -3.2246158123016357, "step": 3990 }, { "epoch": 0.9596928982725528, "grad_norm": 11.019669520177573, "learning_rate": 2.4706860155316033e-09, "logits/chosen": -1.641543984413147, "logits/rejected": -1.5944894552230835, "logps/chosen": -602.7939453125, "logps/rejected": -756.920166015625, "loss": 0.4894, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.417752504348755, "rewards/margins": 1.6268641948699951, "rewards/rejected": -4.04461669921875, "step": 4000 }, { "epoch": 0.9596928982725528, "eval_logits/chosen": -1.5951924324035645, "eval_logits/rejected": -1.5356674194335938, "eval_logps/chosen": -472.6584167480469, "eval_logps/rejected": -691.1746215820312, "eval_loss": 0.4801708161830902, "eval_rewards/accuracies": 0.7928571701049805, "eval_rewards/chosen": -2.003549337387085, "eval_rewards/margins": 2.17158842086792, "eval_rewards/rejected": -4.175137996673584, "eval_runtime": 45.9357, "eval_samples_per_second": 97.114, "eval_steps_per_second": 1.524, "step": 4000 }, { "epoch": 0.9620921305182342, "grad_norm": 11.233710733457718, "learning_rate": 2.185727187643843e-09, "logits/chosen": -1.6332000494003296, "logits/rejected": -1.5429205894470215, "logps/chosen": -419.66815185546875, "logps/rejected": -734.4024658203125, "loss": 0.4842, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.8838298320770264, "rewards/margins": 3.1948580741882324, "rewards/rejected": -5.078688144683838, "step": 4010 }, { "epoch": 0.9644913627639156, "grad_norm": 15.328579684902264, "learning_rate": 1.9181515464413434e-09, "logits/chosen": -1.670598030090332, "logits/rejected": -1.6298954486846924, "logps/chosen": -588.2853393554688, "logps/rejected": -813.2299194335938, "loss": 0.4958, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.263362407684326, "rewards/margins": 2.3236560821533203, "rewards/rejected": -4.5870184898376465, "step": 4020 }, { "epoch": 0.966890595009597, "grad_norm": 13.812423991968044, "learning_rate": 1.6679778612923302e-09, "logits/chosen": -1.752912163734436, "logits/rejected": -1.669989824295044, "logps/chosen": -523.4584350585938, "logps/rejected": -659.4483642578125, "loss": 0.4615, "rewards/accuracies": 0.75, "rewards/chosen": -2.229353427886963, "rewards/margins": 1.3204247951507568, "rewards/rejected": -3.549778699874878, "step": 4030 }, { "epoch": 0.9692898272552783, "grad_norm": 19.733776466762233, "learning_rate": 1.43522368088686e-09, "logits/chosen": -1.619080901145935, "logits/rejected": -1.4254459142684937, "logps/chosen": -500.8919982910156, "logps/rejected": -871.8958740234375, "loss": 0.5482, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.26896595954895, "rewards/margins": 3.7620201110839844, "rewards/rejected": -6.030986309051514, "step": 4040 }, { "epoch": 0.9716890595009597, "grad_norm": 13.902504420645041, "learning_rate": 1.2199053320059993e-09, "logits/chosen": -1.6571025848388672, "logits/rejected": -1.6325149536132812, "logps/chosen": -482.8365783691406, "logps/rejected": -686.3687744140625, "loss": 0.4742, "rewards/accuracies": 0.875, "rewards/chosen": -1.980038046836853, "rewards/margins": 1.896106481552124, "rewards/rejected": -3.8761448860168457, "step": 4050 }, { "epoch": 0.974088291746641, "grad_norm": 10.386888230304015, "learning_rate": 1.0220379183764338e-09, "logits/chosen": -1.6559547185897827, "logits/rejected": -1.545379638671875, "logps/chosen": -393.8249206542969, "logps/rejected": -653.940185546875, "loss": 0.4873, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7048404216766357, "rewards/margins": 2.559105396270752, "rewards/rejected": -4.263945579528809, "step": 4060 }, { "epoch": 0.9764875239923224, "grad_norm": 13.972760200119874, "learning_rate": 8.416353196111503e-10, "logits/chosen": -1.4547879695892334, "logits/rejected": -1.3284496068954468, "logps/chosen": -491.6461486816406, "logps/rejected": -743.1737670898438, "loss": 0.5082, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.368063449859619, "rewards/margins": 2.6352427005767822, "rewards/rejected": -5.0033063888549805, "step": 4070 }, { "epoch": 0.9788867562380038, "grad_norm": 11.880689150494451, "learning_rate": 6.787101902356873e-10, "logits/chosen": -1.5535688400268555, "logits/rejected": -1.5120995044708252, "logps/chosen": -489.68719482421875, "logps/rejected": -716.9655151367188, "loss": 0.4804, "rewards/accuracies": 0.75, "rewards/chosen": -2.1152749061584473, "rewards/margins": 2.0652334690093994, "rewards/rejected": -4.180508613586426, "step": 4080 }, { "epoch": 0.9812859884836852, "grad_norm": 18.79541252367771, "learning_rate": 5.332739588005953e-10, "logits/chosen": -1.630657434463501, "logits/rejected": -1.5946576595306396, "logps/chosen": -391.0518493652344, "logps/rejected": -655.5036010742188, "loss": 0.4848, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7571265697479248, "rewards/margins": 2.442540407180786, "rewards/rejected": -4.199666976928711, "step": 4090 }, { "epoch": 0.9836852207293666, "grad_norm": 18.62380663413524, "learning_rate": 4.053368270797164e-10, "logits/chosen": -1.5710262060165405, "logits/rejected": -1.495965838432312, "logps/chosen": -468.63946533203125, "logps/rejected": -698.932373046875, "loss": 0.4696, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.210425853729248, "rewards/margins": 2.375892162322998, "rewards/rejected": -4.586318016052246, "step": 4100 }, { "epoch": 0.986084452975048, "grad_norm": 10.138418324575126, "learning_rate": 2.949077693545354e-10, "logits/chosen": -1.5962473154067993, "logits/rejected": -1.61128830909729, "logps/chosen": -506.21630859375, "logps/rejected": -674.5911865234375, "loss": 0.5628, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2012012004852295, "rewards/margins": 1.3675811290740967, "rewards/rejected": -3.568782091140747, "step": 4110 }, { "epoch": 0.9884836852207294, "grad_norm": 11.981782992200987, "learning_rate": 2.0199453178471047e-10, "logits/chosen": -1.6453485488891602, "logits/rejected": -1.6295568943023682, "logps/chosen": -540.22607421875, "logps/rejected": -574.7904663085938, "loss": 0.4926, "rewards/accuracies": 0.75, "rewards/chosen": -2.1138644218444824, "rewards/margins": 0.7879716157913208, "rewards/rejected": -2.9018359184265137, "step": 4120 }, { "epoch": 0.9908829174664108, "grad_norm": 15.974370111697047, "learning_rate": 1.266036318647301e-10, "logits/chosen": -1.632550835609436, "logits/rejected": -1.550283670425415, "logps/chosen": -528.643310546875, "logps/rejected": -716.7777709960938, "loss": 0.49, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.1183180809020996, "rewards/margins": 2.1329312324523926, "rewards/rejected": -4.25124979019165, "step": 4130 }, { "epoch": 0.9932821497120922, "grad_norm": 15.469000968932196, "learning_rate": 6.874035796672339e-11, "logits/chosen": -1.6695518493652344, "logits/rejected": -1.5455235242843628, "logps/chosen": -510.795654296875, "logps/rejected": -794.1751708984375, "loss": 0.4936, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.969916582107544, "rewards/margins": 3.434035539627075, "rewards/rejected": -5.403952121734619, "step": 4140 }, { "epoch": 0.9956813819577736, "grad_norm": 13.801035998962295, "learning_rate": 2.8408768969423458e-11, "logits/chosen": -1.7181600332260132, "logits/rejected": -1.696080207824707, "logps/chosen": -485.26495361328125, "logps/rejected": -656.0448608398438, "loss": 0.4756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.976780652999878, "rewards/margins": 1.6538575887680054, "rewards/rejected": -3.6306381225585938, "step": 4150 }, { "epoch": 0.9980806142034548, "grad_norm": 11.770076455666835, "learning_rate": 5.611693973617271e-12, "logits/chosen": -1.54105544090271, "logits/rejected": -1.4928066730499268, "logps/chosen": -425.23748779296875, "logps/rejected": -606.849853515625, "loss": 0.5225, "rewards/accuracies": 0.75, "rewards/chosen": -1.8429193496704102, "rewards/margins": 1.6649690866470337, "rewards/rejected": -3.5078887939453125, "step": 4160 }, { "epoch": 1.0, "step": 4168, "total_flos": 0.0, "train_loss": 0.5327433187535995, "train_runtime": 4228.4167, "train_samples_per_second": 31.541, "train_steps_per_second": 0.986 } ], "logging_steps": 10, "max_steps": 4168, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }