{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 2000, "global_step": 5733, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005232862375719519, "grad_norm": 89.4226840742472, "learning_rate": 8.710801393728223e-10, "logits/chosen": -3.4411821365356445, "logits/rejected": -3.41083025932312, "logps/chosen": -501.4610595703125, "logps/rejected": -596.95849609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0052328623757195184, "grad_norm": 85.25418359860987, "learning_rate": 8.710801393728223e-09, "logits/chosen": -3.0671284198760986, "logits/rejected": -3.074446439743042, "logps/chosen": -335.681884765625, "logps/rejected": -280.1573486328125, "loss": 0.6916, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": -0.0020227970089763403, "rewards/margins": -0.0008920154068619013, "rewards/rejected": -0.00113078230060637, "step": 10 }, { "epoch": 0.010465724751439037, "grad_norm": 81.81916485978738, "learning_rate": 1.7421602787456446e-08, "logits/chosen": -3.0613160133361816, "logits/rejected": -3.0616378784179688, "logps/chosen": -226.2029266357422, "logps/rejected": -215.268310546875, "loss": 0.6941, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.0005527756875380874, "rewards/margins": -0.004833672661334276, "rewards/rejected": 0.00428089639171958, "step": 20 }, { "epoch": 0.015698587127158554, "grad_norm": 80.43917140017227, "learning_rate": 2.6132404181184667e-08, "logits/chosen": -2.9582347869873047, "logits/rejected": -2.9460692405700684, "logps/chosen": -301.1156921386719, "logps/rejected": -276.5864562988281, "loss": 0.696, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.005320440977811813, "rewards/margins": -0.010556185618042946, "rewards/rejected": 0.005235743708908558, "step": 30 }, { "epoch": 0.020931449502878074, "grad_norm": 82.2617022545516, "learning_rate": 3.484320557491289e-08, "logits/chosen": -3.1444554328918457, "logits/rejected": -3.0621819496154785, "logps/chosen": -316.7799072265625, "logps/rejected": -308.12225341796875, "loss": 0.6921, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0006299316883087158, "rewards/margins": -0.0017468839650973678, "rewards/rejected": 0.0023768178652971983, "step": 40 }, { "epoch": 0.026164311878597593, "grad_norm": 88.75625096118156, "learning_rate": 4.355400696864111e-08, "logits/chosen": -3.1654610633850098, "logits/rejected": -3.069864273071289, "logps/chosen": -297.98040771484375, "logps/rejected": -271.53985595703125, "loss": 0.6916, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0006378920515999198, "rewards/margins": -0.0062376875430345535, "rewards/rejected": 0.005599795840680599, "step": 50 }, { "epoch": 0.03139717425431711, "grad_norm": 87.15103878265461, "learning_rate": 5.2264808362369334e-08, "logits/chosen": -3.0595157146453857, "logits/rejected": -3.0343680381774902, "logps/chosen": -270.9585266113281, "logps/rejected": -266.93341064453125, "loss": 0.6937, "rewards/accuracies": 0.5, "rewards/chosen": -0.0004598981759045273, "rewards/margins": 0.007152448408305645, "rewards/rejected": -0.007612347602844238, "step": 60 }, { "epoch": 0.03663003663003663, "grad_norm": 91.63896190919405, "learning_rate": 6.097560975609756e-08, "logits/chosen": -3.1611061096191406, "logits/rejected": -3.1091177463531494, "logps/chosen": -331.9501953125, "logps/rejected": -266.3929443359375, "loss": 0.6923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0032606697641313076, "rewards/margins": 0.007664867676794529, "rewards/rejected": -0.004404196981340647, "step": 70 }, { "epoch": 0.04186289900575615, "grad_norm": 87.0032758925652, "learning_rate": 6.968641114982578e-08, "logits/chosen": -3.1067955493927, "logits/rejected": -3.0868916511535645, "logps/chosen": -309.2652587890625, "logps/rejected": -304.4024658203125, "loss": 0.6925, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.005191360600292683, "rewards/margins": -0.012618144042789936, "rewards/rejected": 0.007426784373819828, "step": 80 }, { "epoch": 0.04709576138147567, "grad_norm": 76.85073060086604, "learning_rate": 7.8397212543554e-08, "logits/chosen": -3.1143388748168945, "logits/rejected": -3.089191436767578, "logps/chosen": -245.3241729736328, "logps/rejected": -213.27847290039062, "loss": 0.6943, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0026218655984848738, "rewards/margins": -0.01214824989438057, "rewards/rejected": 0.009526384063065052, "step": 90 }, { "epoch": 0.052328623757195186, "grad_norm": 75.21297729566463, "learning_rate": 8.710801393728223e-08, "logits/chosen": -3.1238529682159424, "logits/rejected": -3.032531261444092, "logps/chosen": -222.70230102539062, "logps/rejected": -198.76942443847656, "loss": 0.6912, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.009502505883574486, "rewards/margins": 0.005166308954358101, "rewards/rejected": 0.00433619599789381, "step": 100 }, { "epoch": 0.0575614861329147, "grad_norm": 81.00825257521805, "learning_rate": 9.581881533101045e-08, "logits/chosen": -3.024508237838745, "logits/rejected": -2.944667100906372, "logps/chosen": -261.4869689941406, "logps/rejected": -196.82199096679688, "loss": 0.6884, "rewards/accuracies": 0.5, "rewards/chosen": 0.003925251308828592, "rewards/margins": 0.0104664396494627, "rewards/rejected": -0.00654118787497282, "step": 110 }, { "epoch": 0.06279434850863422, "grad_norm": 80.13359491535431, "learning_rate": 1.0452961672473867e-07, "logits/chosen": -3.1580023765563965, "logits/rejected": -3.1674582958221436, "logps/chosen": -369.6051025390625, "logps/rejected": -341.3282775878906, "loss": 0.69, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.014734138734638691, "rewards/margins": 0.004716642666608095, "rewards/rejected": 0.010017497465014458, "step": 120 }, { "epoch": 0.06802721088435375, "grad_norm": 78.08306009389258, "learning_rate": 1.132404181184669e-07, "logits/chosen": -3.0129623413085938, "logits/rejected": -3.02490496635437, "logps/chosen": -228.000732421875, "logps/rejected": -237.35671997070312, "loss": 0.6871, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0005366698605939746, "rewards/margins": 0.005095045547932386, "rewards/rejected": -0.0056317150592803955, "step": 130 }, { "epoch": 0.07326007326007326, "grad_norm": 81.65286685717442, "learning_rate": 1.219512195121951e-07, "logits/chosen": -2.976260185241699, "logits/rejected": -3.0033462047576904, "logps/chosen": -299.4603271484375, "logps/rejected": -309.2967834472656, "loss": 0.6814, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.014314761385321617, "rewards/margins": 0.03897615149617195, "rewards/rejected": -0.024661390110850334, "step": 140 }, { "epoch": 0.07849293563579278, "grad_norm": 87.63009082461357, "learning_rate": 1.3066202090592334e-07, "logits/chosen": -3.11221981048584, "logits/rejected": -3.030838966369629, "logps/chosen": -263.18414306640625, "logps/rejected": -247.90829467773438, "loss": 0.679, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00454263761639595, "rewards/margins": 0.012525680474936962, "rewards/rejected": -0.007983044721186161, "step": 150 }, { "epoch": 0.0837257980115123, "grad_norm": 83.43471866283302, "learning_rate": 1.3937282229965157e-07, "logits/chosen": -3.123751401901245, "logits/rejected": -3.020327091217041, "logps/chosen": -263.2129821777344, "logps/rejected": -234.9867401123047, "loss": 0.6787, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.018486974760890007, "rewards/margins": 0.03244725987315178, "rewards/rejected": -0.013960284180939198, "step": 160 }, { "epoch": 0.08895866038723181, "grad_norm": 91.86378356958795, "learning_rate": 1.480836236933798e-07, "logits/chosen": -3.0995914936065674, "logits/rejected": -3.1066174507141113, "logps/chosen": -232.8445587158203, "logps/rejected": -264.4934387207031, "loss": 0.6779, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.027189353480935097, "rewards/margins": 0.023355795070528984, "rewards/rejected": 0.003833554685115814, "step": 170 }, { "epoch": 0.09419152276295134, "grad_norm": 71.30546934223756, "learning_rate": 1.56794425087108e-07, "logits/chosen": -3.0807604789733887, "logits/rejected": -2.986154317855835, "logps/chosen": -292.7318115234375, "logps/rejected": -265.77398681640625, "loss": 0.6769, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.01179732196033001, "rewards/margins": 0.025500575080513954, "rewards/rejected": -0.037297897040843964, "step": 180 }, { "epoch": 0.09942438513867086, "grad_norm": 93.44784108972972, "learning_rate": 1.6550522648083622e-07, "logits/chosen": -3.105851650238037, "logits/rejected": -3.0132524967193604, "logps/chosen": -304.61431884765625, "logps/rejected": -257.0717468261719, "loss": 0.6628, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03316402807831764, "rewards/margins": 0.07553298771381378, "rewards/rejected": -0.042368970811367035, "step": 190 }, { "epoch": 0.10465724751439037, "grad_norm": 76.9350271560798, "learning_rate": 1.7421602787456445e-07, "logits/chosen": -3.0060136318206787, "logits/rejected": -2.927907943725586, "logps/chosen": -272.8405456542969, "logps/rejected": -294.12884521484375, "loss": 0.6579, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.027592217549681664, "rewards/margins": 0.045709364116191864, "rewards/rejected": -0.018117140978574753, "step": 200 }, { "epoch": 0.10989010989010989, "grad_norm": 79.74730199073873, "learning_rate": 1.8292682926829268e-07, "logits/chosen": -3.067862033843994, "logits/rejected": -3.0073494911193848, "logps/chosen": -213.41064453125, "logps/rejected": -196.45144653320312, "loss": 0.6739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.040506668388843536, "rewards/margins": 0.042282260954380035, "rewards/rejected": -0.0017756022280082107, "step": 210 }, { "epoch": 0.1151229722658294, "grad_norm": 75.14415509426098, "learning_rate": 1.916376306620209e-07, "logits/chosen": -3.0948104858398438, "logits/rejected": -3.0583202838897705, "logps/chosen": -280.3541564941406, "logps/rejected": -244.63613891601562, "loss": 0.6642, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1134084090590477, "rewards/margins": 0.13926784694194794, "rewards/rejected": -0.02585943415760994, "step": 220 }, { "epoch": 0.12035583464154893, "grad_norm": 72.03951802017211, "learning_rate": 2.003484320557491e-07, "logits/chosen": -3.0906310081481934, "logits/rejected": -3.0797536373138428, "logps/chosen": -245.0637664794922, "logps/rejected": -323.5065002441406, "loss": 0.6347, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.05503677576780319, "rewards/margins": 0.11071760952472687, "rewards/rejected": -0.05568081885576248, "step": 230 }, { "epoch": 0.12558869701726844, "grad_norm": 79.16527483688573, "learning_rate": 2.0905923344947734e-07, "logits/chosen": -3.070136547088623, "logits/rejected": -2.9956440925598145, "logps/chosen": -232.2843780517578, "logps/rejected": -218.2404022216797, "loss": 0.643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.028701787814497948, "rewards/margins": 0.06086137890815735, "rewards/rejected": -0.08956316113471985, "step": 240 }, { "epoch": 0.13082155939298795, "grad_norm": 81.59284182543736, "learning_rate": 2.1777003484320556e-07, "logits/chosen": -2.998007297515869, "logits/rejected": -2.99882435798645, "logps/chosen": -290.11260986328125, "logps/rejected": -250.94894409179688, "loss": 0.6278, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.07421593368053436, "rewards/margins": 0.2201288938522339, "rewards/rejected": -0.14591297507286072, "step": 250 }, { "epoch": 0.1360544217687075, "grad_norm": 93.95787403415741, "learning_rate": 2.264808362369338e-07, "logits/chosen": -3.1079845428466797, "logits/rejected": -3.042844533920288, "logps/chosen": -289.906982421875, "logps/rejected": -270.2817077636719, "loss": 0.6468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14222319424152374, "rewards/margins": 0.25852373242378235, "rewards/rejected": -0.1163005456328392, "step": 260 }, { "epoch": 0.141287284144427, "grad_norm": 63.76087493785573, "learning_rate": 2.3519163763066202e-07, "logits/chosen": -3.0660629272460938, "logits/rejected": -3.0115866661071777, "logps/chosen": -273.26483154296875, "logps/rejected": -275.1770935058594, "loss": 0.6189, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.045087672770023346, "rewards/margins": 0.09527760744094849, "rewards/rejected": -0.14036527276039124, "step": 270 }, { "epoch": 0.14652014652014653, "grad_norm": 83.7095309974128, "learning_rate": 2.439024390243902e-07, "logits/chosen": -3.030277729034424, "logits/rejected": -3.0503933429718018, "logps/chosen": -305.9091796875, "logps/rejected": -288.1725769042969, "loss": 0.6549, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09435807168483734, "rewards/margins": 0.21566851437091827, "rewards/rejected": -0.12131045013666153, "step": 280 }, { "epoch": 0.15175300889586604, "grad_norm": 74.48999640503538, "learning_rate": 2.526132404181184e-07, "logits/chosen": -3.1189064979553223, "logits/rejected": -3.014122724533081, "logps/chosen": -322.6457214355469, "logps/rejected": -238.81808471679688, "loss": 0.6438, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.06634673476219177, "rewards/margins": 0.2049088180065155, "rewards/rejected": -0.13856211304664612, "step": 290 }, { "epoch": 0.15698587127158556, "grad_norm": 79.58925715876667, "learning_rate": 2.613240418118467e-07, "logits/chosen": -3.0929694175720215, "logits/rejected": -3.0530850887298584, "logps/chosen": -314.437255859375, "logps/rejected": -255.65133666992188, "loss": 0.6311, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05479121208190918, "rewards/margins": 0.1390453279018402, "rewards/rejected": -0.1938365250825882, "step": 300 }, { "epoch": 0.16221873364730507, "grad_norm": 67.41900708173915, "learning_rate": 2.700348432055749e-07, "logits/chosen": -3.1495442390441895, "logits/rejected": -3.0486819744110107, "logps/chosen": -319.1748046875, "logps/rejected": -297.1847229003906, "loss": 0.635, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.18144665658473969, "rewards/margins": 0.32447656989097595, "rewards/rejected": -0.14302992820739746, "step": 310 }, { "epoch": 0.1674515960230246, "grad_norm": 82.82523786930675, "learning_rate": 2.7874564459930313e-07, "logits/chosen": -3.1440889835357666, "logits/rejected": -3.006028413772583, "logps/chosen": -332.3525085449219, "logps/rejected": -224.18563842773438, "loss": 0.6321, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0683496817946434, "rewards/margins": 0.18532302975654602, "rewards/rejected": -0.25367271900177, "step": 320 }, { "epoch": 0.1726844583987441, "grad_norm": 83.1474754180395, "learning_rate": 2.874564459930314e-07, "logits/chosen": -3.002368450164795, "logits/rejected": -2.8743152618408203, "logps/chosen": -252.5076141357422, "logps/rejected": -197.36868286132812, "loss": 0.6345, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01674083061516285, "rewards/margins": 0.19254377484321594, "rewards/rejected": -0.20928461849689484, "step": 330 }, { "epoch": 0.17791732077446362, "grad_norm": 87.0952478038699, "learning_rate": 2.961672473867596e-07, "logits/chosen": -3.096506118774414, "logits/rejected": -3.098784923553467, "logps/chosen": -278.7817077636719, "logps/rejected": -301.18505859375, "loss": 0.6035, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05996248871088028, "rewards/margins": 0.24388769268989563, "rewards/rejected": -0.18392518162727356, "step": 340 }, { "epoch": 0.18315018315018314, "grad_norm": 78.08090201069898, "learning_rate": 3.048780487804878e-07, "logits/chosen": -3.0234322547912598, "logits/rejected": -3.0858261585235596, "logps/chosen": -185.2747039794922, "logps/rejected": -221.8296661376953, "loss": 0.5997, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02042878046631813, "rewards/margins": 0.3977828621864319, "rewards/rejected": -0.37735408544540405, "step": 350 }, { "epoch": 0.18838304552590268, "grad_norm": 70.43016798731578, "learning_rate": 3.13588850174216e-07, "logits/chosen": -3.0982182025909424, "logits/rejected": -3.0375752449035645, "logps/chosen": -264.559326171875, "logps/rejected": -272.41278076171875, "loss": 0.6105, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.012140035629272461, "rewards/margins": 0.26558828353881836, "rewards/rejected": -0.2777283787727356, "step": 360 }, { "epoch": 0.1936159079016222, "grad_norm": 80.18156332926674, "learning_rate": 3.2229965156794425e-07, "logits/chosen": -3.053605556488037, "logits/rejected": -3.013339042663574, "logps/chosen": -294.3258361816406, "logps/rejected": -267.54864501953125, "loss": 0.6023, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10063071548938751, "rewards/margins": 0.3967227041721344, "rewards/rejected": -0.4973534047603607, "step": 370 }, { "epoch": 0.1988487702773417, "grad_norm": 75.39302733975107, "learning_rate": 3.3101045296167245e-07, "logits/chosen": -2.9372477531433105, "logits/rejected": -3.007261276245117, "logps/chosen": -202.58828735351562, "logps/rejected": -328.4515075683594, "loss": 0.5998, "rewards/accuracies": 0.75, "rewards/chosen": -0.016352981328964233, "rewards/margins": 0.3992491066455841, "rewards/rejected": -0.41560202836990356, "step": 380 }, { "epoch": 0.20408163265306123, "grad_norm": 72.02519767481071, "learning_rate": 3.3972125435540065e-07, "logits/chosen": -3.080362558364868, "logits/rejected": -2.9892656803131104, "logps/chosen": -385.20989990234375, "logps/rejected": -267.0478515625, "loss": 0.6067, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14570730924606323, "rewards/margins": 0.25116539001464844, "rewards/rejected": -0.10545806586742401, "step": 390 }, { "epoch": 0.20931449502878074, "grad_norm": 79.37874614458458, "learning_rate": 3.484320557491289e-07, "logits/chosen": -3.1332805156707764, "logits/rejected": -3.0437369346618652, "logps/chosen": -288.7727966308594, "logps/rejected": -246.7441864013672, "loss": 0.6037, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.00869719497859478, "rewards/margins": 0.3724183738231659, "rewards/rejected": -0.36372119188308716, "step": 400 }, { "epoch": 0.21454735740450026, "grad_norm": 72.83460265773226, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -3.041630983352661, "logits/rejected": -3.0279765129089355, "logps/chosen": -279.9696960449219, "logps/rejected": -312.3309631347656, "loss": 0.6299, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.055697523057460785, "rewards/margins": 0.2970433831214905, "rewards/rejected": -0.35274091362953186, "step": 410 }, { "epoch": 0.21978021978021978, "grad_norm": 74.87889644580528, "learning_rate": 3.6585365853658536e-07, "logits/chosen": -3.113696813583374, "logits/rejected": -3.1003146171569824, "logps/chosen": -279.19390869140625, "logps/rejected": -292.80511474609375, "loss": 0.5989, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.16003060340881348, "rewards/margins": 0.12529155611991882, "rewards/rejected": -0.2853221297264099, "step": 420 }, { "epoch": 0.2250130821559393, "grad_norm": 79.12587371102444, "learning_rate": 3.7456445993031356e-07, "logits/chosen": -3.1775436401367188, "logits/rejected": -3.071092128753662, "logps/chosen": -309.3301696777344, "logps/rejected": -248.36001586914062, "loss": 0.5946, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.08730575442314148, "rewards/margins": 0.6287146806716919, "rewards/rejected": -0.541408896446228, "step": 430 }, { "epoch": 0.2302459445316588, "grad_norm": 91.41654315313465, "learning_rate": 3.832752613240418e-07, "logits/chosen": -3.082730770111084, "logits/rejected": -3.057797908782959, "logps/chosen": -272.3341979980469, "logps/rejected": -246.8263397216797, "loss": 0.6023, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15264281630516052, "rewards/margins": 0.19146516919136047, "rewards/rejected": -0.3441079556941986, "step": 440 }, { "epoch": 0.23547880690737832, "grad_norm": 80.91566422782499, "learning_rate": 3.9198606271777e-07, "logits/chosen": -2.9516100883483887, "logits/rejected": -2.978323459625244, "logps/chosen": -234.4990692138672, "logps/rejected": -242.3379364013672, "loss": 0.5688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.22045119106769562, "rewards/margins": 0.26988354325294495, "rewards/rejected": -0.490334689617157, "step": 450 }, { "epoch": 0.24071166928309787, "grad_norm": 78.17595693435727, "learning_rate": 4.006968641114982e-07, "logits/chosen": -3.0676305294036865, "logits/rejected": -2.976959705352783, "logps/chosen": -286.11016845703125, "logps/rejected": -262.4464416503906, "loss": 0.5956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.05319100618362427, "rewards/margins": 0.4763426184654236, "rewards/rejected": -0.42315158247947693, "step": 460 }, { "epoch": 0.24594453165881738, "grad_norm": 86.19293300757354, "learning_rate": 4.0940766550522647e-07, "logits/chosen": -3.0641300678253174, "logits/rejected": -3.0458385944366455, "logps/chosen": -286.1731872558594, "logps/rejected": -285.10546875, "loss": 0.5674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2245810478925705, "rewards/margins": 0.3058464229106903, "rewards/rejected": -0.5304274559020996, "step": 470 }, { "epoch": 0.25117739403453687, "grad_norm": 85.24058124341495, "learning_rate": 4.1811846689895467e-07, "logits/chosen": -3.0900397300720215, "logits/rejected": -3.1040122509002686, "logps/chosen": -240.080322265625, "logps/rejected": -233.39083862304688, "loss": 0.5875, "rewards/accuracies": 0.75, "rewards/chosen": 0.03031245805323124, "rewards/margins": 0.5176481008529663, "rewards/rejected": -0.4873356819152832, "step": 480 }, { "epoch": 0.2564102564102564, "grad_norm": 70.23394462425922, "learning_rate": 4.268292682926829e-07, "logits/chosen": -3.0790655612945557, "logits/rejected": -3.0690650939941406, "logps/chosen": -290.19561767578125, "logps/rejected": -256.7394104003906, "loss": 0.5729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00747694680467248, "rewards/margins": 0.42387086153030396, "rewards/rejected": -0.4313477873802185, "step": 490 }, { "epoch": 0.2616431187859759, "grad_norm": 81.60434559123149, "learning_rate": 4.3554006968641113e-07, "logits/chosen": -3.0398590564727783, "logits/rejected": -3.05519700050354, "logps/chosen": -257.57269287109375, "logps/rejected": -267.43505859375, "loss": 0.565, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.025076359510421753, "rewards/margins": 0.5047939419746399, "rewards/rejected": -0.5298702120780945, "step": 500 }, { "epoch": 0.2668759811616955, "grad_norm": 83.7367439802792, "learning_rate": 4.442508710801394e-07, "logits/chosen": -3.1394972801208496, "logits/rejected": -3.0476748943328857, "logps/chosen": -285.98052978515625, "logps/rejected": -252.0680694580078, "loss": 0.5774, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.060531772673130035, "rewards/margins": 0.37502020597457886, "rewards/rejected": -0.4355519711971283, "step": 510 }, { "epoch": 0.272108843537415, "grad_norm": 81.1824411857227, "learning_rate": 4.529616724738676e-07, "logits/chosen": -3.100027084350586, "logits/rejected": -3.0924158096313477, "logps/chosen": -266.29864501953125, "logps/rejected": -287.73284912109375, "loss": 0.57, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2206277847290039, "rewards/margins": 0.34208929538726807, "rewards/rejected": -0.5627170205116272, "step": 520 }, { "epoch": 0.2773417059131345, "grad_norm": 68.13102083840998, "learning_rate": 4.616724738675958e-07, "logits/chosen": -3.046821355819702, "logits/rejected": -3.050039529800415, "logps/chosen": -337.9527282714844, "logps/rejected": -250.5797576904297, "loss": 0.5661, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.015325836837291718, "rewards/margins": 0.5823065042495728, "rewards/rejected": -0.5976323485374451, "step": 530 }, { "epoch": 0.282574568288854, "grad_norm": 88.8628594184477, "learning_rate": 4.7038327526132404e-07, "logits/chosen": -3.0438284873962402, "logits/rejected": -3.0154149532318115, "logps/chosen": -296.7005615234375, "logps/rejected": -279.30047607421875, "loss": 0.5442, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2998342514038086, "rewards/margins": 0.44469594955444336, "rewards/rejected": -0.7445300817489624, "step": 540 }, { "epoch": 0.28780743066457354, "grad_norm": 80.2435495742718, "learning_rate": 4.790940766550523e-07, "logits/chosen": -3.146636486053467, "logits/rejected": -3.0604732036590576, "logps/chosen": -333.3040466308594, "logps/rejected": -281.03143310546875, "loss": 0.6168, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1889125257730484, "rewards/margins": 0.4751681387424469, "rewards/rejected": -0.6640806794166565, "step": 550 }, { "epoch": 0.29304029304029305, "grad_norm": 67.18153866656756, "learning_rate": 4.878048780487804e-07, "logits/chosen": -3.1387746334075928, "logits/rejected": -3.103151798248291, "logps/chosen": -286.77410888671875, "logps/rejected": -273.26513671875, "loss": 0.5388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.003807747270911932, "rewards/margins": 0.7320934534072876, "rewards/rejected": -0.7282857298851013, "step": 560 }, { "epoch": 0.29827315541601257, "grad_norm": 80.6750665584691, "learning_rate": 4.965156794425087e-07, "logits/chosen": -3.183868885040283, "logits/rejected": -3.0901682376861572, "logps/chosen": -282.6393127441406, "logps/rejected": -260.1653747558594, "loss": 0.5752, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.14368876814842224, "rewards/margins": 0.5626433491706848, "rewards/rejected": -0.7063321471214294, "step": 570 }, { "epoch": 0.3035060177917321, "grad_norm": 67.14688234258463, "learning_rate": 4.999983312905696e-07, "logits/chosen": -3.1465394496917725, "logits/rejected": -3.0805275440216064, "logps/chosen": -316.14111328125, "logps/rejected": -221.3306884765625, "loss": 0.5641, "rewards/accuracies": 0.75, "rewards/chosen": -0.2013058215379715, "rewards/margins": 0.510747492313385, "rewards/rejected": -0.7120533585548401, "step": 580 }, { "epoch": 0.3087388801674516, "grad_norm": 77.0347088476438, "learning_rate": 4.999881337025014e-07, "logits/chosen": -2.9915051460266113, "logits/rejected": -2.9998035430908203, "logps/chosen": -219.2443084716797, "logps/rejected": -225.1728515625, "loss": 0.5681, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.29302966594696045, "rewards/margins": 0.32400691509246826, "rewards/rejected": -0.6170366406440735, "step": 590 }, { "epoch": 0.3139717425431711, "grad_norm": 71.19146710177215, "learning_rate": 4.999686659648518e-07, "logits/chosen": -3.059614658355713, "logits/rejected": -3.056039333343506, "logps/chosen": -283.4884338378906, "logps/rejected": -274.4944763183594, "loss": 0.6121, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11409635841846466, "rewards/margins": 0.3892931342124939, "rewards/rejected": -0.5033894777297974, "step": 600 }, { "epoch": 0.31920460491889063, "grad_norm": 72.48350281661597, "learning_rate": 4.999399287995302e-07, "logits/chosen": -3.1106886863708496, "logits/rejected": -3.031641721725464, "logps/chosen": -204.0187530517578, "logps/rejected": -223.49301147460938, "loss": 0.5386, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.11018689721822739, "rewards/margins": 0.5322835445404053, "rewards/rejected": -0.6424704790115356, "step": 610 }, { "epoch": 0.32443746729461015, "grad_norm": 75.91605029090233, "learning_rate": 4.999019232721791e-07, "logits/chosen": -3.138636350631714, "logits/rejected": -2.989506721496582, "logps/chosen": -357.5049133300781, "logps/rejected": -218.5236053466797, "loss": 0.5657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.006834998726844788, "rewards/margins": 0.828178882598877, "rewards/rejected": -0.8350139856338501, "step": 620 }, { "epoch": 0.32967032967032966, "grad_norm": 65.03245289553065, "learning_rate": 4.998546507921325e-07, "logits/chosen": -3.031207799911499, "logits/rejected": -3.0436060428619385, "logps/chosen": -226.6817169189453, "logps/rejected": -272.5300598144531, "loss": 0.6027, "rewards/accuracies": 0.75, "rewards/chosen": -0.31320852041244507, "rewards/margins": 0.566818356513977, "rewards/rejected": -0.8800268173217773, "step": 630 }, { "epoch": 0.3349031920460492, "grad_norm": 76.01689765086894, "learning_rate": 4.997981131123656e-07, "logits/chosen": -3.1130642890930176, "logits/rejected": -3.045344829559326, "logps/chosen": -285.4091491699219, "logps/rejected": -298.74029541015625, "loss": 0.5486, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.07961050420999527, "rewards/margins": 0.9183964729309082, "rewards/rejected": -0.8387860059738159, "step": 640 }, { "epoch": 0.3401360544217687, "grad_norm": 69.61175053946748, "learning_rate": 4.997323123294291e-07, "logits/chosen": -3.0745339393615723, "logits/rejected": -3.0372300148010254, "logps/chosen": -266.2978210449219, "logps/rejected": -249.9000244140625, "loss": 0.5732, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.16728679835796356, "rewards/margins": 0.6416473388671875, "rewards/rejected": -0.808934211730957, "step": 650 }, { "epoch": 0.3453689167974882, "grad_norm": 71.6004514039293, "learning_rate": 4.99657250883371e-07, "logits/chosen": -3.0590624809265137, "logits/rejected": -3.024073600769043, "logps/chosen": -238.9779815673828, "logps/rejected": -230.164794921875, "loss": 0.5552, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3022328019142151, "rewards/margins": 0.4109364449977875, "rewards/rejected": -0.7131692171096802, "step": 660 }, { "epoch": 0.35060177917320773, "grad_norm": 87.15159403450654, "learning_rate": 4.995729315576468e-07, "logits/chosen": -2.9912431240081787, "logits/rejected": -2.973342180252075, "logps/chosen": -268.73626708984375, "logps/rejected": -240.23580932617188, "loss": 0.5695, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.43058228492736816, "rewards/margins": 0.4866998791694641, "rewards/rejected": -0.9172821044921875, "step": 670 }, { "epoch": 0.35583464154892724, "grad_norm": 68.23445396098836, "learning_rate": 4.99479357479016e-07, "logits/chosen": -2.926633834838867, "logits/rejected": -2.8903262615203857, "logps/chosen": -246.643798828125, "logps/rejected": -225.0759735107422, "loss": 0.5635, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5457836389541626, "rewards/margins": 0.46917659044265747, "rewards/rejected": -1.0149601697921753, "step": 680 }, { "epoch": 0.36106750392464676, "grad_norm": 66.69123159299576, "learning_rate": 4.993765321174261e-07, "logits/chosen": -3.0723910331726074, "logits/rejected": -2.9905731678009033, "logps/chosen": -243.8313751220703, "logps/rejected": -234.4405517578125, "loss": 0.5151, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1179482564330101, "rewards/margins": 0.7535763382911682, "rewards/rejected": -0.8715246319770813, "step": 690 }, { "epoch": 0.3663003663003663, "grad_norm": 73.94753173397896, "learning_rate": 4.992644592858842e-07, "logits/chosen": -2.992800712585449, "logits/rejected": -2.975149393081665, "logps/chosen": -257.9360656738281, "logps/rejected": -243.1983184814453, "loss": 0.5643, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3617693781852722, "rewards/margins": 0.5574807524681091, "rewards/rejected": -0.9192501902580261, "step": 700 }, { "epoch": 0.3715332286760858, "grad_norm": 78.41697781365677, "learning_rate": 4.991431431403148e-07, "logits/chosen": -3.043755292892456, "logits/rejected": -2.9603829383850098, "logps/chosen": -332.0948181152344, "logps/rejected": -284.07000732421875, "loss": 0.5063, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16277877986431122, "rewards/margins": 1.051615834236145, "rewards/rejected": -1.2143945693969727, "step": 710 }, { "epoch": 0.37676609105180536, "grad_norm": 74.60093457707843, "learning_rate": 4.99012588179407e-07, "logits/chosen": -2.9376773834228516, "logits/rejected": -2.9894323348999023, "logps/chosen": -217.18783569335938, "logps/rejected": -233.0116424560547, "loss": 0.5325, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3099881112575531, "rewards/margins": 0.6255155205726624, "rewards/rejected": -0.9355036020278931, "step": 720 }, { "epoch": 0.3819989534275249, "grad_norm": 80.76623841908768, "learning_rate": 4.988727992444467e-07, "logits/chosen": -3.0136430263519287, "logits/rejected": -3.018346071243286, "logps/chosen": -265.22015380859375, "logps/rejected": -302.9878845214844, "loss": 0.5719, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4464147090911865, "rewards/margins": 0.9138778448104858, "rewards/rejected": -1.360292673110962, "step": 730 }, { "epoch": 0.3872318158032444, "grad_norm": 77.17660503966651, "learning_rate": 4.98723781519137e-07, "logits/chosen": -2.9706077575683594, "logits/rejected": -3.003307342529297, "logps/chosen": -240.26412963867188, "logps/rejected": -227.24441528320312, "loss": 0.5155, "rewards/accuracies": 0.625, "rewards/chosen": -0.281870037317276, "rewards/margins": 0.6201066374778748, "rewards/rejected": -0.9019767642021179, "step": 740 }, { "epoch": 0.3924646781789639, "grad_norm": 79.48533509215376, "learning_rate": 4.98565540529407e-07, "logits/chosen": -2.9490303993225098, "logits/rejected": -2.902139663696289, "logps/chosen": -281.72222900390625, "logps/rejected": -301.08087158203125, "loss": 0.5157, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.19064797461032867, "rewards/margins": 0.4730730950832367, "rewards/rejected": -0.6637210845947266, "step": 750 }, { "epoch": 0.3976975405546834, "grad_norm": 79.26027669615571, "learning_rate": 4.983980821432054e-07, "logits/chosen": -2.982775926589966, "logits/rejected": -2.94804048538208, "logps/chosen": -231.5618133544922, "logps/rejected": -201.54385375976562, "loss": 0.568, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3758910596370697, "rewards/margins": 0.62889164686203, "rewards/rejected": -1.004782795906067, "step": 760 }, { "epoch": 0.40293040293040294, "grad_norm": 66.19890022872369, "learning_rate": 4.982214125702845e-07, "logits/chosen": -2.9724113941192627, "logits/rejected": -2.9496588706970215, "logps/chosen": -245.07223510742188, "logps/rejected": -286.54730224609375, "loss": 0.5962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5642284750938416, "rewards/margins": 0.646499514579773, "rewards/rejected": -1.2107279300689697, "step": 770 }, { "epoch": 0.40816326530612246, "grad_norm": 88.5236036999365, "learning_rate": 4.980355383619684e-07, "logits/chosen": -2.9540507793426514, "logits/rejected": -2.9377801418304443, "logps/chosen": -241.74972534179688, "logps/rejected": -213.3856964111328, "loss": 0.5188, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.55348801612854, "rewards/margins": 0.786832332611084, "rewards/rejected": -1.3403204679489136, "step": 780 }, { "epoch": 0.413396127681842, "grad_norm": 78.46836433701533, "learning_rate": 4.978404664109113e-07, "logits/chosen": -2.9502434730529785, "logits/rejected": -2.961839437484741, "logps/chosen": -228.2624053955078, "logps/rejected": -292.4291076660156, "loss": 0.5492, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7488000392913818, "rewards/margins": 0.3036850392818451, "rewards/rejected": -1.0524849891662598, "step": 790 }, { "epoch": 0.4186289900575615, "grad_norm": 90.50441582711495, "learning_rate": 4.97636203950841e-07, "logits/chosen": -2.975677967071533, "logits/rejected": -2.9745824337005615, "logps/chosen": -294.22259521484375, "logps/rejected": -306.77655029296875, "loss": 0.5651, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3741792142391205, "rewards/margins": 0.8347808718681335, "rewards/rejected": -1.2089600563049316, "step": 800 }, { "epoch": 0.423861852433281, "grad_norm": 75.99537579686852, "learning_rate": 4.974227585562916e-07, "logits/chosen": -2.9471447467803955, "logits/rejected": -2.8836045265197754, "logps/chosen": -308.07562255859375, "logps/rejected": -272.5606994628906, "loss": 0.5569, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6879116296768188, "rewards/margins": 0.6535586714744568, "rewards/rejected": -1.3414703607559204, "step": 810 }, { "epoch": 0.4290947148090005, "grad_norm": 66.47496302733676, "learning_rate": 4.972001381423214e-07, "logits/chosen": -2.993971824645996, "logits/rejected": -2.9475131034851074, "logps/chosen": -288.21875, "logps/rejected": -241.2631072998047, "loss": 0.5632, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7147637605667114, "rewards/margins": 0.669032871723175, "rewards/rejected": -1.3837965726852417, "step": 820 }, { "epoch": 0.43432757718472004, "grad_norm": 76.49963981838697, "learning_rate": 4.969683509642206e-07, "logits/chosen": -3.0710947513580322, "logits/rejected": -3.0182104110717773, "logps/chosen": -240.58169555664062, "logps/rejected": -239.9686737060547, "loss": 0.6107, "rewards/accuracies": 0.75, "rewards/chosen": -0.6025031805038452, "rewards/margins": 0.5341706275939941, "rewards/rejected": -1.1366736888885498, "step": 830 }, { "epoch": 0.43956043956043955, "grad_norm": 66.16061472087422, "learning_rate": 4.967274056172044e-07, "logits/chosen": -3.091104507446289, "logits/rejected": -2.9127261638641357, "logps/chosen": -402.7655944824219, "logps/rejected": -294.42840576171875, "loss": 0.5316, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.46069592237472534, "rewards/margins": 0.8657759428024292, "rewards/rejected": -1.3264718055725098, "step": 840 }, { "epoch": 0.44479330193615907, "grad_norm": 89.68977708024659, "learning_rate": 4.964773110360944e-07, "logits/chosen": -2.9921536445617676, "logits/rejected": -2.882133960723877, "logps/chosen": -255.71420288085938, "logps/rejected": -225.920166015625, "loss": 0.5934, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6226187944412231, "rewards/margins": 0.5147127509117126, "rewards/rejected": -1.1373313665390015, "step": 850 }, { "epoch": 0.4500261643118786, "grad_norm": 84.89815221532746, "learning_rate": 4.962180764949876e-07, "logits/chosen": -3.0228590965270996, "logits/rejected": -3.014902114868164, "logps/chosen": -188.35711669921875, "logps/rejected": -265.97686767578125, "loss": 0.5285, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.24164557456970215, "rewards/margins": 1.1571402549743652, "rewards/rejected": -1.3987857103347778, "step": 860 }, { "epoch": 0.4552590266875981, "grad_norm": 58.02073925371317, "learning_rate": 4.959497116069122e-07, "logits/chosen": -2.760188341140747, "logits/rejected": -2.8085579872131348, "logps/chosen": -225.26992797851562, "logps/rejected": -235.1295928955078, "loss": 0.5562, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5024533867835999, "rewards/margins": 0.6750583052635193, "rewards/rejected": -1.1775115728378296, "step": 870 }, { "epoch": 0.4604918890633176, "grad_norm": 76.88865668896253, "learning_rate": 4.956722263234711e-07, "logits/chosen": -3.0085296630859375, "logits/rejected": -2.9894707202911377, "logps/chosen": -272.3066101074219, "logps/rejected": -254.1557159423828, "loss": 0.5248, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5375632047653198, "rewards/margins": 0.5188736915588379, "rewards/rejected": -1.0564368963241577, "step": 880 }, { "epoch": 0.46572475143903713, "grad_norm": 81.86660723876152, "learning_rate": 4.95385630934473e-07, "logits/chosen": -3.0461716651916504, "logits/rejected": -3.013216495513916, "logps/chosen": -309.3580627441406, "logps/rejected": -241.667724609375, "loss": 0.525, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2827950716018677, "rewards/margins": 0.7667288184165955, "rewards/rejected": -1.0495238304138184, "step": 890 }, { "epoch": 0.47095761381475665, "grad_norm": 106.67595666488997, "learning_rate": 4.950899360675511e-07, "logits/chosen": -2.9302525520324707, "logits/rejected": -2.9032442569732666, "logps/chosen": -244.11312866210938, "logps/rejected": -297.0611267089844, "loss": 0.5399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.47185808420181274, "rewards/margins": 1.127158284187317, "rewards/rejected": -1.5990164279937744, "step": 900 }, { "epoch": 0.47619047619047616, "grad_norm": 91.72582622227273, "learning_rate": 4.947851526877681e-07, "logits/chosen": -2.956503391265869, "logits/rejected": -2.9149558544158936, "logps/chosen": -177.20069885253906, "logps/rejected": -193.9868927001953, "loss": 0.5584, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3943067491054535, "rewards/margins": 1.0875154733657837, "rewards/rejected": -1.4818222522735596, "step": 910 }, { "epoch": 0.48142333856619574, "grad_norm": 73.93245165166891, "learning_rate": 4.944712920972108e-07, "logits/chosen": -3.071528434753418, "logits/rejected": -2.975660800933838, "logps/chosen": -305.67828369140625, "logps/rejected": -254.0524444580078, "loss": 0.5508, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5671824216842651, "rewards/margins": 0.6277979016304016, "rewards/rejected": -1.1949803829193115, "step": 920 }, { "epoch": 0.48665620094191525, "grad_norm": 83.95369230554853, "learning_rate": 4.9414836593457e-07, "logits/chosen": -2.970364809036255, "logits/rejected": -2.9623026847839355, "logps/chosen": -266.434814453125, "logps/rejected": -266.0400085449219, "loss": 0.5444, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.625787079334259, "rewards/margins": 0.5239383578300476, "rewards/rejected": -1.1497254371643066, "step": 930 }, { "epoch": 0.49188906331763477, "grad_norm": 65.82877685759762, "learning_rate": 4.938163861747094e-07, "logits/chosen": -3.0349297523498535, "logits/rejected": -2.973889112472534, "logps/chosen": -296.0238952636719, "logps/rejected": -249.9376678466797, "loss": 0.4844, "rewards/accuracies": 0.75, "rewards/chosen": -0.4766687750816345, "rewards/margins": 0.9457138180732727, "rewards/rejected": -1.4223825931549072, "step": 940 }, { "epoch": 0.4971219256933543, "grad_norm": 77.98118246156477, "learning_rate": 4.934753651282215e-07, "logits/chosen": -2.953591823577881, "logits/rejected": -2.8531081676483154, "logps/chosen": -289.6026306152344, "logps/rejected": -282.7614440917969, "loss": 0.501, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4342326521873474, "rewards/margins": 1.001392126083374, "rewards/rejected": -1.4356248378753662, "step": 950 }, { "epoch": 0.5023547880690737, "grad_norm": 78.10856006111759, "learning_rate": 4.93125315440971e-07, "logits/chosen": -3.0429673194885254, "logits/rejected": -2.9622342586517334, "logps/chosen": -283.93023681640625, "logps/rejected": -276.83837890625, "loss": 0.5518, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6738978028297424, "rewards/margins": 0.7615943551063538, "rewards/rejected": -1.4354921579360962, "step": 960 }, { "epoch": 0.5075876504447933, "grad_norm": 82.39503995101273, "learning_rate": 4.92766250093626e-07, "logits/chosen": -2.9338812828063965, "logits/rejected": -2.881221294403076, "logps/chosen": -293.97320556640625, "logps/rejected": -265.17303466796875, "loss": 0.5512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.623167872428894, "rewards/margins": 1.4571809768676758, "rewards/rejected": -2.0803489685058594, "step": 970 }, { "epoch": 0.5128205128205128, "grad_norm": 47.85766646668482, "learning_rate": 4.92398182401176e-07, "logits/chosen": -3.041491985321045, "logits/rejected": -2.8856053352355957, "logps/chosen": -309.5716247558594, "logps/rejected": -249.63027954101562, "loss": 0.5269, "rewards/accuracies": 0.875, "rewards/chosen": -0.227116197347641, "rewards/margins": 1.1038864850997925, "rewards/rejected": -1.3310027122497559, "step": 980 }, { "epoch": 0.5180533751962323, "grad_norm": 65.15527522614154, "learning_rate": 4.920211260124395e-07, "logits/chosen": -2.9373934268951416, "logits/rejected": -2.8702340126037598, "logps/chosen": -248.91824340820312, "logps/rejected": -231.93948364257812, "loss": 0.537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5078210830688477, "rewards/margins": 0.7554989457130432, "rewards/rejected": -1.2633198499679565, "step": 990 }, { "epoch": 0.5232862375719518, "grad_norm": 83.57048421420163, "learning_rate": 4.916350949095566e-07, "logits/chosen": -2.9579670429229736, "logits/rejected": -2.903033971786499, "logps/chosen": -237.2493896484375, "logps/rejected": -236.36135864257812, "loss": 0.5596, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7000842690467834, "rewards/margins": 0.8343229293823242, "rewards/rejected": -1.5344072580337524, "step": 1000 }, { "epoch": 0.5285190999476713, "grad_norm": 68.25358109838857, "learning_rate": 4.912401034074708e-07, "logits/chosen": -2.9460067749023438, "logits/rejected": -2.9346060752868652, "logps/chosen": -237.12600708007812, "logps/rejected": -263.20281982421875, "loss": 0.5363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5221272110939026, "rewards/margins": 0.8145877718925476, "rewards/rejected": -1.3367151021957397, "step": 1010 }, { "epoch": 0.533751962323391, "grad_norm": 66.437862451837, "learning_rate": 4.908361661533989e-07, "logits/chosen": -2.987467050552368, "logits/rejected": -2.964747190475464, "logps/chosen": -295.523193359375, "logps/rejected": -258.8580322265625, "loss": 0.5099, "rewards/accuracies": 0.75, "rewards/chosen": -0.42180687189102173, "rewards/margins": 0.9707552194595337, "rewards/rejected": -1.3925621509552002, "step": 1020 }, { "epoch": 0.5389848246991105, "grad_norm": 77.97526238104945, "learning_rate": 4.904232981262866e-07, "logits/chosen": -2.967543363571167, "logits/rejected": -2.9276556968688965, "logps/chosen": -263.80084228515625, "logps/rejected": -227.8192901611328, "loss": 0.5703, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9123917818069458, "rewards/margins": 0.39597734808921814, "rewards/rejected": -1.3083692789077759, "step": 1030 }, { "epoch": 0.54421768707483, "grad_norm": 69.27666447998746, "learning_rate": 4.900015146362544e-07, "logits/chosen": -2.9868767261505127, "logits/rejected": -3.0447587966918945, "logps/chosen": -255.80490112304688, "logps/rejected": -272.26617431640625, "loss": 0.5649, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7015228867530823, "rewards/margins": 0.6462124586105347, "rewards/rejected": -1.3477352857589722, "step": 1040 }, { "epoch": 0.5494505494505495, "grad_norm": 83.76904208085551, "learning_rate": 4.895708313240285e-07, "logits/chosen": -3.0519447326660156, "logits/rejected": -2.9641642570495605, "logps/chosen": -330.5013732910156, "logps/rejected": -301.7545471191406, "loss": 0.5405, "rewards/accuracies": 0.875, "rewards/chosen": -0.31569328904151917, "rewards/margins": 1.1219208240509033, "rewards/rejected": -1.4376142024993896, "step": 1050 }, { "epoch": 0.554683411826269, "grad_norm": 75.32188639245354, "learning_rate": 4.891312641603623e-07, "logits/chosen": -2.940150499343872, "logits/rejected": -2.952392101287842, "logps/chosen": -265.26153564453125, "logps/rejected": -276.7397766113281, "loss": 0.5376, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.22543549537658691, "rewards/margins": 1.093856692314148, "rewards/rejected": -1.3192921876907349, "step": 1060 }, { "epoch": 0.5599162742019885, "grad_norm": 62.73832035870637, "learning_rate": 4.886828294454426e-07, "logits/chosen": -2.9634833335876465, "logits/rejected": -2.9711625576019287, "logps/chosen": -327.64093017578125, "logps/rejected": -279.9158020019531, "loss": 0.5369, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.30215874314308167, "rewards/margins": 0.8274700045585632, "rewards/rejected": -1.1296287775039673, "step": 1070 }, { "epoch": 0.565149136577708, "grad_norm": 78.98339971057405, "learning_rate": 4.882255438082863e-07, "logits/chosen": -3.0260062217712402, "logits/rejected": -2.963953733444214, "logps/chosen": -235.6061248779297, "logps/rejected": -237.56729125976562, "loss": 0.565, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4653490483760834, "rewards/margins": 0.805554986000061, "rewards/rejected": -1.2709039449691772, "step": 1080 }, { "epoch": 0.5703819989534276, "grad_norm": 108.89811133405777, "learning_rate": 4.877594242061233e-07, "logits/chosen": -2.9574666023254395, "logits/rejected": -2.8617453575134277, "logps/chosen": -310.12091064453125, "logps/rejected": -199.87904357910156, "loss": 0.5725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7315424680709839, "rewards/margins": 0.4507519602775574, "rewards/rejected": -1.1822943687438965, "step": 1090 }, { "epoch": 0.5756148613291471, "grad_norm": 69.88916509396067, "learning_rate": 4.87284487923768e-07, "logits/chosen": -2.9507219791412354, "logits/rejected": -2.9084863662719727, "logps/chosen": -264.9752197265625, "logps/rejected": -285.38287353515625, "loss": 0.4945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6141699552536011, "rewards/margins": 0.8789398074150085, "rewards/rejected": -1.4931097030639648, "step": 1100 }, { "epoch": 0.5808477237048666, "grad_norm": 81.81242367664213, "learning_rate": 4.868007525729775e-07, "logits/chosen": -2.7796707153320312, "logits/rejected": -2.7759785652160645, "logps/chosen": -177.02542114257812, "logps/rejected": -210.91336059570312, "loss": 0.5755, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4962421953678131, "rewards/margins": 0.9752906560897827, "rewards/rejected": -1.471532940864563, "step": 1110 }, { "epoch": 0.5860805860805861, "grad_norm": 87.85096043300314, "learning_rate": 4.863082360917998e-07, "logits/chosen": -2.933162212371826, "logits/rejected": -2.895423412322998, "logps/chosen": -282.5852355957031, "logps/rejected": -262.8430480957031, "loss": 0.516, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5946338176727295, "rewards/margins": 0.7334874868392944, "rewards/rejected": -1.328121304512024, "step": 1120 }, { "epoch": 0.5913134484563056, "grad_norm": 83.68542865616627, "learning_rate": 4.858069567439072e-07, "logits/chosen": -2.8797500133514404, "logits/rejected": -2.846172332763672, "logps/chosen": -225.96920776367188, "logps/rejected": -276.5212097167969, "loss": 0.5502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8813258409500122, "rewards/margins": 0.7852452993392944, "rewards/rejected": -1.666571021080017, "step": 1130 }, { "epoch": 0.5965463108320251, "grad_norm": 81.7236018656563, "learning_rate": 4.852969331179206e-07, "logits/chosen": -3.077228546142578, "logits/rejected": -3.048466444015503, "logps/chosen": -261.5570373535156, "logps/rejected": -280.5361022949219, "loss": 0.5162, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4124987721443176, "rewards/margins": 0.8712862133979797, "rewards/rejected": -1.2837848663330078, "step": 1140 }, { "epoch": 0.6017791732077447, "grad_norm": 94.16951758457155, "learning_rate": 4.847781841267185e-07, "logits/chosen": -3.0417542457580566, "logits/rejected": -2.89493465423584, "logps/chosen": -273.8265075683594, "logps/rejected": -245.3599853515625, "loss": 0.5382, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5777983665466309, "rewards/margins": 0.7997044920921326, "rewards/rejected": -1.3775030374526978, "step": 1150 }, { "epoch": 0.6070120355834642, "grad_norm": 78.34554276806884, "learning_rate": 4.842507290067374e-07, "logits/chosen": -2.858222484588623, "logits/rejected": -2.8807895183563232, "logps/chosen": -223.3139190673828, "logps/rejected": -194.02120971679688, "loss": 0.5436, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.730209469795227, "rewards/margins": 0.4390241503715515, "rewards/rejected": -1.1692335605621338, "step": 1160 }, { "epoch": 0.6122448979591837, "grad_norm": 89.12239373605186, "learning_rate": 4.837145873172567e-07, "logits/chosen": -2.975914716720581, "logits/rejected": -2.9190592765808105, "logps/chosen": -273.77850341796875, "logps/rejected": -289.54278564453125, "loss": 0.5469, "rewards/accuracies": 0.875, "rewards/chosen": -0.31983527541160583, "rewards/margins": 1.4057228565216064, "rewards/rejected": -1.7255580425262451, "step": 1170 }, { "epoch": 0.6174777603349032, "grad_norm": 58.774920532061635, "learning_rate": 4.83169778939675e-07, "logits/chosen": -3.0118963718414307, "logits/rejected": -2.9709439277648926, "logps/chosen": -316.74859619140625, "logps/rejected": -277.0160217285156, "loss": 0.4971, "rewards/accuracies": 0.625, "rewards/chosen": -0.5097704529762268, "rewards/margins": 0.5486369132995605, "rewards/rejected": -1.0584074258804321, "step": 1180 }, { "epoch": 0.6227106227106227, "grad_norm": 80.1814153218146, "learning_rate": 4.826163240767716e-07, "logits/chosen": -3.020421028137207, "logits/rejected": -2.9691126346588135, "logps/chosen": -364.55377197265625, "logps/rejected": -281.9911193847656, "loss": 0.4906, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2343555986881256, "rewards/margins": 0.8031519055366516, "rewards/rejected": -1.0375072956085205, "step": 1190 }, { "epoch": 0.6279434850863422, "grad_norm": 63.6917820771556, "learning_rate": 4.820542432519584e-07, "logits/chosen": -2.7919085025787354, "logits/rejected": -2.7088217735290527, "logps/chosen": -305.8468322753906, "logps/rejected": -279.18450927734375, "loss": 0.4984, "rewards/accuracies": 0.75, "rewards/chosen": -0.6469639539718628, "rewards/margins": 0.7688127160072327, "rewards/rejected": -1.4157766103744507, "step": 1200 }, { "epoch": 0.6331763474620618, "grad_norm": 59.04746467065281, "learning_rate": 4.814835573085176e-07, "logits/chosen": -3.048043727874756, "logits/rejected": -3.00614070892334, "logps/chosen": -296.6534729003906, "logps/rejected": -269.0278625488281, "loss": 0.5679, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5373876094818115, "rewards/margins": 0.8512622117996216, "rewards/rejected": -1.3886497020721436, "step": 1210 }, { "epoch": 0.6384092098377813, "grad_norm": 87.35276439134873, "learning_rate": 4.809042874088304e-07, "logits/chosen": -3.023423671722412, "logits/rejected": -3.006147861480713, "logps/chosen": -322.83477783203125, "logps/rejected": -293.1351318359375, "loss": 0.5198, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5859644412994385, "rewards/margins": 1.0388462543487549, "rewards/rejected": -1.624810814857483, "step": 1220 }, { "epoch": 0.6436420722135008, "grad_norm": 73.00992431642005, "learning_rate": 4.803164550335905e-07, "logits/chosen": -2.944314479827881, "logits/rejected": -2.86702823638916, "logps/chosen": -345.80206298828125, "logps/rejected": -249.5872802734375, "loss": 0.5128, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6501429080963135, "rewards/margins": 1.2380462884902954, "rewards/rejected": -1.8881893157958984, "step": 1230 }, { "epoch": 0.6488749345892203, "grad_norm": 61.87334261081398, "learning_rate": 4.797200819810089e-07, "logits/chosen": -2.9559264183044434, "logits/rejected": -2.9394545555114746, "logps/chosen": -243.69448852539062, "logps/rejected": -221.04953002929688, "loss": 0.523, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7260545492172241, "rewards/margins": 0.5649474859237671, "rewards/rejected": -1.2910020351409912, "step": 1240 }, { "epoch": 0.6541077969649398, "grad_norm": 67.88566788161202, "learning_rate": 4.79115190366005e-07, "logits/chosen": -3.0303282737731934, "logits/rejected": -2.957993984222412, "logps/chosen": -274.4369201660156, "logps/rejected": -295.88482666015625, "loss": 0.5237, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5227054357528687, "rewards/margins": 1.0243370532989502, "rewards/rejected": -1.5470424890518188, "step": 1250 }, { "epoch": 0.6593406593406593, "grad_norm": 63.2997444228824, "learning_rate": 4.785018026193862e-07, "logits/chosen": -3.0001285076141357, "logits/rejected": -2.9680237770080566, "logps/chosen": -268.04327392578125, "logps/rejected": -187.63623046875, "loss": 0.5078, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6625667810440063, "rewards/margins": 0.8629196286201477, "rewards/rejected": -1.5254864692687988, "step": 1260 }, { "epoch": 0.6645735217163788, "grad_norm": 66.10417386556118, "learning_rate": 4.77879941487017e-07, "logits/chosen": -2.9719300270080566, "logits/rejected": -2.9214959144592285, "logps/chosen": -237.02554321289062, "logps/rejected": -223.27099609375, "loss": 0.5234, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6837262511253357, "rewards/margins": 1.0004751682281494, "rewards/rejected": -1.6842014789581299, "step": 1270 }, { "epoch": 0.6698063840920984, "grad_norm": 85.1695776739884, "learning_rate": 4.772496300289748e-07, "logits/chosen": -2.977997303009033, "logits/rejected": -2.8911023139953613, "logps/chosen": -249.96304321289062, "logps/rejected": -229.65478515625, "loss": 0.5194, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.897860050201416, "rewards/margins": 1.0484263896942139, "rewards/rejected": -1.9462862014770508, "step": 1280 }, { "epoch": 0.6750392464678179, "grad_norm": 73.8987603380274, "learning_rate": 4.766108916186949e-07, "logits/chosen": -2.930464744567871, "logits/rejected": -2.9195492267608643, "logps/chosen": -245.0294189453125, "logps/rejected": -290.33062744140625, "loss": 0.5474, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.45957422256469727, "rewards/margins": 1.2213318347930908, "rewards/rejected": -1.680906057357788, "step": 1290 }, { "epoch": 0.6802721088435374, "grad_norm": 73.54141662093646, "learning_rate": 4.759637499421042e-07, "logits/chosen": -2.946112871170044, "logits/rejected": -2.9685606956481934, "logps/chosen": -272.72637939453125, "logps/rejected": -292.61383056640625, "loss": 0.4939, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6853058338165283, "rewards/margins": 0.9047178030014038, "rewards/rejected": -1.5900235176086426, "step": 1300 }, { "epoch": 0.6855049712192569, "grad_norm": 75.0289432191627, "learning_rate": 4.7530822899674207e-07, "logits/chosen": -3.063873052597046, "logits/rejected": -3.042912006378174, "logps/chosen": -247.4326934814453, "logps/rejected": -220.4298858642578, "loss": 0.492, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.4876191020011902, "rewards/margins": 1.2090885639190674, "rewards/rejected": -1.6967074871063232, "step": 1310 }, { "epoch": 0.6907378335949764, "grad_norm": 86.19596030494768, "learning_rate": 4.7464435309087137e-07, "logits/chosen": -2.973191738128662, "logits/rejected": -2.9697415828704834, "logps/chosen": -292.7425231933594, "logps/rejected": -303.4023742675781, "loss": 0.4962, "rewards/accuracies": 0.75, "rewards/chosen": -0.8378297090530396, "rewards/margins": 0.7197774648666382, "rewards/rejected": -1.5576070547103882, "step": 1320 }, { "epoch": 0.6959706959706959, "grad_norm": 83.68593090258388, "learning_rate": 4.739721468425763e-07, "logits/chosen": -2.9693474769592285, "logits/rejected": -2.9703445434570312, "logps/chosen": -271.01312255859375, "logps/rejected": -314.56719970703125, "loss": 0.477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.42655324935913086, "rewards/margins": 1.39058256149292, "rewards/rejected": -1.8171360492706299, "step": 1330 }, { "epoch": 0.7012035583464155, "grad_norm": 59.17309959850415, "learning_rate": 4.7329163517885e-07, "logits/chosen": -2.954552173614502, "logits/rejected": -2.8793694972991943, "logps/chosen": -265.72027587890625, "logps/rejected": -221.7182159423828, "loss": 0.477, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5666095018386841, "rewards/margins": 1.0562241077423096, "rewards/rejected": -1.6228336095809937, "step": 1340 }, { "epoch": 0.706436420722135, "grad_norm": 75.99518546990654, "learning_rate": 4.7260284333466973e-07, "logits/chosen": -3.0229430198669434, "logits/rejected": -2.999329090118408, "logps/chosen": -290.48004150390625, "logps/rejected": -263.2909851074219, "loss": 0.5272, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6663414239883423, "rewards/margins": 0.8637315034866333, "rewards/rejected": -1.5300730466842651, "step": 1350 }, { "epoch": 0.7116692830978545, "grad_norm": 88.9137514259657, "learning_rate": 4.719057968520617e-07, "logits/chosen": -2.8987345695495605, "logits/rejected": -2.8744430541992188, "logps/chosen": -339.7561340332031, "logps/rejected": -313.1120300292969, "loss": 0.5714, "rewards/accuracies": 0.625, "rewards/chosen": -0.8896106481552124, "rewards/margins": 0.7412205338478088, "rewards/rejected": -1.6308311223983765, "step": 1360 }, { "epoch": 0.716902145473574, "grad_norm": 87.44356948378021, "learning_rate": 4.7120052157915345e-07, "logits/chosen": -3.061739444732666, "logits/rejected": -2.9253592491149902, "logps/chosen": -315.7259826660156, "logps/rejected": -224.159912109375, "loss": 0.4631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7005783319473267, "rewards/margins": 0.9900630116462708, "rewards/rejected": -1.690641164779663, "step": 1370 }, { "epoch": 0.7221350078492935, "grad_norm": 75.9304561136293, "learning_rate": 4.7048704366921537e-07, "logits/chosen": -3.0375235080718994, "logits/rejected": -2.975628614425659, "logps/chosen": -217.8585968017578, "logps/rejected": -257.65667724609375, "loss": 0.4917, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.514151930809021, "rewards/margins": 1.3070632219314575, "rewards/rejected": -1.8212149143218994, "step": 1380 }, { "epoch": 0.727367870225013, "grad_norm": 69.96551046016322, "learning_rate": 4.6976538957969114e-07, "logits/chosen": -2.9474074840545654, "logits/rejected": -2.8653557300567627, "logps/chosen": -263.21185302734375, "logps/rejected": -232.5277557373047, "loss": 0.523, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8041553497314453, "rewards/margins": 1.2025103569030762, "rewards/rejected": -2.0066657066345215, "step": 1390 }, { "epoch": 0.7326007326007326, "grad_norm": 72.5951585656569, "learning_rate": 4.690355860712163e-07, "logits/chosen": -2.943934679031372, "logits/rejected": -2.953230619430542, "logps/chosen": -239.3934326171875, "logps/rejected": -261.02105712890625, "loss": 0.5181, "rewards/accuracies": 0.75, "rewards/chosen": -0.8976553678512573, "rewards/margins": 0.7806161642074585, "rewards/rejected": -1.6782715320587158, "step": 1400 }, { "epoch": 0.7378335949764521, "grad_norm": 84.78132084141066, "learning_rate": 4.682976602066262e-07, "logits/chosen": -2.885561943054199, "logits/rejected": -2.8480865955352783, "logps/chosen": -256.455078125, "logps/rejected": -256.9288330078125, "loss": 0.5599, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9061183929443359, "rewards/margins": 1.2948997020721436, "rewards/rejected": -2.2010180950164795, "step": 1410 }, { "epoch": 0.7430664573521716, "grad_norm": 86.73766503851282, "learning_rate": 4.6755163934995224e-07, "logits/chosen": -2.988407850265503, "logits/rejected": -2.938638210296631, "logps/chosen": -312.2828674316406, "logps/rejected": -260.7275390625, "loss": 0.5478, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5379031300544739, "rewards/margins": 1.01072096824646, "rewards/rejected": -1.5486242771148682, "step": 1420 }, { "epoch": 0.7482993197278912, "grad_norm": 59.48136153540968, "learning_rate": 4.667975511654072e-07, "logits/chosen": -3.0078577995300293, "logits/rejected": -2.9246108531951904, "logps/chosen": -295.6681213378906, "logps/rejected": -260.7848205566406, "loss": 0.5034, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4367855191230774, "rewards/margins": 0.9930356740951538, "rewards/rejected": -1.4298213720321655, "step": 1430 }, { "epoch": 0.7535321821036107, "grad_norm": 87.9642055959618, "learning_rate": 4.660354236163595e-07, "logits/chosen": -3.0245180130004883, "logits/rejected": -2.956260919570923, "logps/chosen": -353.749267578125, "logps/rejected": -313.7087097167969, "loss": 0.533, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2802979350090027, "rewards/margins": 1.101088285446167, "rewards/rejected": -1.381386160850525, "step": 1440 }, { "epoch": 0.7587650444793302, "grad_norm": 61.27384722059847, "learning_rate": 4.6526528496429606e-07, "logits/chosen": -2.9957804679870605, "logits/rejected": -2.913984775543213, "logps/chosen": -296.39459228515625, "logps/rejected": -274.4482421875, "loss": 0.5095, "rewards/accuracies": 0.75, "rewards/chosen": -0.7744497060775757, "rewards/margins": 1.1928904056549072, "rewards/rejected": -1.967340111732483, "step": 1450 }, { "epoch": 0.7639979068550498, "grad_norm": 63.229061175148374, "learning_rate": 4.644871637677745e-07, "logits/chosen": -2.930814266204834, "logits/rejected": -2.9312026500701904, "logps/chosen": -213.7772979736328, "logps/rejected": -229.32858276367188, "loss": 0.5593, "rewards/accuracies": 0.625, "rewards/chosen": -0.6037440896034241, "rewards/margins": 0.6268049478530884, "rewards/rejected": -1.2305490970611572, "step": 1460 }, { "epoch": 0.7692307692307693, "grad_norm": 66.99497646382378, "learning_rate": 4.637010888813638e-07, "logits/chosen": -2.9983057975769043, "logits/rejected": -2.8795926570892334, "logps/chosen": -331.4161682128906, "logps/rejected": -248.960693359375, "loss": 0.4899, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5756648778915405, "rewards/margins": 1.0333304405212402, "rewards/rejected": -1.6089951992034912, "step": 1470 }, { "epoch": 0.7744636316064888, "grad_norm": 62.111269245971066, "learning_rate": 4.6290708945457493e-07, "logits/chosen": -2.924233913421631, "logits/rejected": -2.923056125640869, "logps/chosen": -248.94961547851562, "logps/rejected": -235.97274780273438, "loss": 0.5445, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8326644897460938, "rewards/margins": 0.763745129108429, "rewards/rejected": -1.5964096784591675, "step": 1480 }, { "epoch": 0.7796964939822083, "grad_norm": 78.32752930375904, "learning_rate": 4.6210519493077887e-07, "logits/chosen": -2.7333154678344727, "logits/rejected": -2.752122640609741, "logps/chosen": -286.59014892578125, "logps/rejected": -282.7780456542969, "loss": 0.5224, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0298734903335571, "rewards/margins": 0.9437181353569031, "rewards/rejected": -1.9735914468765259, "step": 1490 }, { "epoch": 0.7849293563579278, "grad_norm": 65.98583030437617, "learning_rate": 4.6129543504611607e-07, "logits/chosen": -2.932669162750244, "logits/rejected": -2.897218942642212, "logps/chosen": -213.91146850585938, "logps/rejected": -277.2677001953125, "loss": 0.4657, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0067284107208252, "rewards/margins": 1.3459272384643555, "rewards/rejected": -2.3526554107666016, "step": 1500 }, { "epoch": 0.7901622187336473, "grad_norm": 74.85456175547458, "learning_rate": 4.604778398283927e-07, "logits/chosen": -2.877446413040161, "logits/rejected": -2.879443645477295, "logps/chosen": -269.11602783203125, "logps/rejected": -310.3833923339844, "loss": 0.573, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3230334520339966, "rewards/margins": 1.0108563899993896, "rewards/rejected": -2.333889961242676, "step": 1510 }, { "epoch": 0.7953950811093669, "grad_norm": 62.65857661703482, "learning_rate": 4.596524395959678e-07, "logits/chosen": -2.9313912391662598, "logits/rejected": -2.8881571292877197, "logps/chosen": -225.9903564453125, "logps/rejected": -263.2411804199219, "loss": 0.5102, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.772546648979187, "rewards/margins": 1.27035653591156, "rewards/rejected": -2.042903423309326, "step": 1520 }, { "epoch": 0.8006279434850864, "grad_norm": 76.33112334895458, "learning_rate": 4.588192649566285e-07, "logits/chosen": -3.0400984287261963, "logits/rejected": -3.0055220127105713, "logps/chosen": -319.9878234863281, "logps/rejected": -383.8850402832031, "loss": 0.4818, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8490980863571167, "rewards/margins": 1.0591531991958618, "rewards/rejected": -1.908251166343689, "step": 1530 }, { "epoch": 0.8058608058608059, "grad_norm": 58.21946696217978, "learning_rate": 4.5797834680645553e-07, "logits/chosen": -2.9496045112609863, "logits/rejected": -2.963113784790039, "logps/chosen": -362.71173095703125, "logps/rejected": -323.79754638671875, "loss": 0.5616, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9068654775619507, "rewards/margins": 0.7508090138435364, "rewards/rejected": -1.6576745510101318, "step": 1540 }, { "epoch": 0.8110936682365254, "grad_norm": 65.20141163730143, "learning_rate": 4.5712971632867715e-07, "logits/chosen": -2.9228272438049316, "logits/rejected": -2.8310813903808594, "logps/chosen": -321.7740173339844, "logps/rejected": -235.10617065429688, "loss": 0.512, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4054592251777649, "rewards/margins": 1.2328050136566162, "rewards/rejected": -1.6382644176483154, "step": 1550 }, { "epoch": 0.8163265306122449, "grad_norm": 61.99208692313101, "learning_rate": 4.562734049925129e-07, "logits/chosen": -2.9270553588867188, "logits/rejected": -2.8541605472564697, "logps/chosen": -351.1989440917969, "logps/rejected": -299.54449462890625, "loss": 0.4947, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6397629380226135, "rewards/margins": 0.9832455515861511, "rewards/rejected": -1.623008370399475, "step": 1560 }, { "epoch": 0.8215593929879644, "grad_norm": 65.31679182780346, "learning_rate": 4.5540944455200663e-07, "logits/chosen": -2.937791109085083, "logits/rejected": -2.8882229328155518, "logps/chosen": -237.90646362304688, "logps/rejected": -258.17950439453125, "loss": 0.4937, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.6457556486129761, "rewards/margins": 1.0821702480316162, "rewards/rejected": -1.7279258966445923, "step": 1570 }, { "epoch": 0.826792255363684, "grad_norm": 75.342598138456, "learning_rate": 4.545378670448492e-07, "logits/chosen": -2.992056131362915, "logits/rejected": -2.8762831687927246, "logps/chosen": -287.8721923828125, "logps/rejected": -248.96517944335938, "loss": 0.5459, "rewards/accuracies": 0.75, "rewards/chosen": -0.8856776356697083, "rewards/margins": 0.9507344365119934, "rewards/rejected": -1.8364120721817017, "step": 1580 }, { "epoch": 0.8320251177394035, "grad_norm": 78.04202275781564, "learning_rate": 4.5365870479119014e-07, "logits/chosen": -2.835019588470459, "logits/rejected": -2.756282329559326, "logps/chosen": -239.52523803710938, "logps/rejected": -219.6143035888672, "loss": 0.5192, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5610097646713257, "rewards/margins": 1.3356037139892578, "rewards/rejected": -1.8966137170791626, "step": 1590 }, { "epoch": 0.837257980115123, "grad_norm": 79.23764068610349, "learning_rate": 4.5277199039243917e-07, "logits/chosen": -2.8543238639831543, "logits/rejected": -2.8768668174743652, "logps/chosen": -260.81085205078125, "logps/rejected": -289.78619384765625, "loss": 0.5036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1091424226760864, "rewards/margins": 0.7832223176956177, "rewards/rejected": -1.8923648595809937, "step": 1600 }, { "epoch": 0.8424908424908425, "grad_norm": 80.148644815323, "learning_rate": 4.5187775673005744e-07, "logits/chosen": -3.0078282356262207, "logits/rejected": -2.908257484436035, "logps/chosen": -368.0081481933594, "logps/rejected": -334.0032653808594, "loss": 0.5359, "rewards/accuracies": 0.75, "rewards/chosen": -0.6377593278884888, "rewards/margins": 1.1235153675079346, "rewards/rejected": -1.761275053024292, "step": 1610 }, { "epoch": 0.847723704866562, "grad_norm": 47.303920944920755, "learning_rate": 4.509760369643384e-07, "logits/chosen": -2.936072826385498, "logits/rejected": -2.8372721672058105, "logps/chosen": -285.2725524902344, "logps/rejected": -244.05838012695312, "loss": 0.5387, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.9257488250732422, "rewards/margins": 0.7606317400932312, "rewards/rejected": -1.6863806247711182, "step": 1620 }, { "epoch": 0.8529565672422815, "grad_norm": 76.3887584159238, "learning_rate": 4.5006686453317734e-07, "logits/chosen": -3.039933919906616, "logits/rejected": -3.059108257293701, "logps/chosen": -240.77737426757812, "logps/rejected": -255.63137817382812, "loss": 0.5135, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6387203335762024, "rewards/margins": 1.0904393196105957, "rewards/rejected": -1.7291595935821533, "step": 1630 }, { "epoch": 0.858189429618001, "grad_norm": 70.20382429855236, "learning_rate": 4.4915027315083243e-07, "logits/chosen": -2.955493450164795, "logits/rejected": -2.949704170227051, "logps/chosen": -309.3811950683594, "logps/rejected": -293.6394958496094, "loss": 0.5412, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5995006561279297, "rewards/margins": 0.9805235862731934, "rewards/rejected": -1.580024242401123, "step": 1640 }, { "epoch": 0.8634222919937206, "grad_norm": 73.3896694390141, "learning_rate": 4.482262968066737e-07, "logits/chosen": -2.9428184032440186, "logits/rejected": -2.8880410194396973, "logps/chosen": -281.72601318359375, "logps/rejected": -288.54901123046875, "loss": 0.4927, "rewards/accuracies": 0.625, "rewards/chosen": -0.8272639513015747, "rewards/margins": 0.5991584062576294, "rewards/rejected": -1.4264222383499146, "step": 1650 }, { "epoch": 0.8686551543694401, "grad_norm": 64.19740166460144, "learning_rate": 4.4729496976392324e-07, "logits/chosen": -2.9484705924987793, "logits/rejected": -2.9057557582855225, "logps/chosen": -214.6417236328125, "logps/rejected": -253.85922241210938, "loss": 0.5453, "rewards/accuracies": 0.75, "rewards/chosen": -0.7391083240509033, "rewards/margins": 1.0700409412384033, "rewards/rejected": -1.8091493844985962, "step": 1660 }, { "epoch": 0.8738880167451596, "grad_norm": 90.1838629771822, "learning_rate": 4.463563265583843e-07, "logits/chosen": -3.059415817260742, "logits/rejected": -2.970858573913574, "logps/chosen": -264.70928955078125, "logps/rejected": -267.13140869140625, "loss": 0.5109, "rewards/accuracies": 0.75, "rewards/chosen": -0.782135546207428, "rewards/margins": 1.1809965372085571, "rewards/rejected": -1.9631319046020508, "step": 1670 }, { "epoch": 0.8791208791208791, "grad_norm": 72.79694967318586, "learning_rate": 4.4541040199716063e-07, "logits/chosen": -2.9132349491119385, "logits/rejected": -2.9103472232818604, "logps/chosen": -256.96710205078125, "logps/rejected": -281.34881591796875, "loss": 0.4809, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.746195912361145, "rewards/margins": 1.1086575984954834, "rewards/rejected": -1.8548533916473389, "step": 1680 }, { "epoch": 0.8843537414965986, "grad_norm": 83.92588227947329, "learning_rate": 4.4445723115736587e-07, "logits/chosen": -2.8734400272369385, "logits/rejected": -2.831774950027466, "logps/chosen": -254.5052490234375, "logps/rejected": -241.2642059326172, "loss": 0.4861, "rewards/accuracies": 0.75, "rewards/chosen": -0.7076843976974487, "rewards/margins": 1.2480239868164062, "rewards/rejected": -1.9557082653045654, "step": 1690 }, { "epoch": 0.8895866038723181, "grad_norm": 68.92089463358818, "learning_rate": 4.434968493848228e-07, "logits/chosen": -2.939466953277588, "logits/rejected": -2.8940346240997314, "logps/chosen": -276.1540222167969, "logps/rejected": -269.23651123046875, "loss": 0.501, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8484053611755371, "rewards/margins": 1.0002186298370361, "rewards/rejected": -1.8486239910125732, "step": 1700 }, { "epoch": 0.8948194662480377, "grad_norm": 74.6183426707318, "learning_rate": 4.425292922927525e-07, "logits/chosen": -2.962808132171631, "logits/rejected": -2.8929078578948975, "logps/chosen": -329.1026306152344, "logps/rejected": -325.50762939453125, "loss": 0.505, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5588365793228149, "rewards/margins": 1.0237174034118652, "rewards/rejected": -1.5825541019439697, "step": 1710 }, { "epoch": 0.9000523286237572, "grad_norm": 60.76408150661586, "learning_rate": 4.41554595760454e-07, "logits/chosen": -3.0057425498962402, "logits/rejected": -2.878847599029541, "logps/chosen": -292.86590576171875, "logps/rejected": -256.8815002441406, "loss": 0.5138, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9612002372741699, "rewards/margins": 0.5400224924087524, "rewards/rejected": -1.5012223720550537, "step": 1720 }, { "epoch": 0.9052851909994767, "grad_norm": 70.2443288323978, "learning_rate": 4.4057279593197326e-07, "logits/chosen": -2.9672746658325195, "logits/rejected": -2.921595573425293, "logps/chosen": -229.3719940185547, "logps/rejected": -196.8295440673828, "loss": 0.4894, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5752571225166321, "rewards/margins": 1.0501275062561035, "rewards/rejected": -1.6253846883773804, "step": 1730 }, { "epoch": 0.9105180533751962, "grad_norm": 94.10734553114743, "learning_rate": 4.395839292147637e-07, "logits/chosen": -2.991774559020996, "logits/rejected": -2.8601903915405273, "logps/chosen": -253.67721557617188, "logps/rejected": -225.7364959716797, "loss": 0.5527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9368684887886047, "rewards/margins": 0.8404407501220703, "rewards/rejected": -1.7773091793060303, "step": 1740 }, { "epoch": 0.9157509157509157, "grad_norm": 79.05614285437254, "learning_rate": 4.3858803227833526e-07, "logits/chosen": -2.9919047355651855, "logits/rejected": -2.968234062194824, "logps/chosen": -327.2932434082031, "logps/rejected": -296.3719482421875, "loss": 0.5538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6158936619758606, "rewards/margins": 1.1644703149795532, "rewards/rejected": -1.7803640365600586, "step": 1750 }, { "epoch": 0.9209837781266352, "grad_norm": 82.53708968971335, "learning_rate": 4.375851420528951e-07, "logits/chosen": -3.005582809448242, "logits/rejected": -2.977659225463867, "logps/chosen": -228.62936401367188, "logps/rejected": -217.94387817382812, "loss": 0.4942, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7542533278465271, "rewards/margins": 0.6736462712287903, "rewards/rejected": -1.4278995990753174, "step": 1760 }, { "epoch": 0.9262166405023547, "grad_norm": 74.53028463723999, "learning_rate": 4.36575295727978e-07, "logits/chosen": -2.846735954284668, "logits/rejected": -2.7917206287384033, "logps/chosen": -294.39068603515625, "logps/rejected": -262.35687255859375, "loss": 0.5309, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.830483078956604, "rewards/margins": 1.0045565366744995, "rewards/rejected": -1.835039496421814, "step": 1770 }, { "epoch": 0.9314495028780743, "grad_norm": 96.74436764011531, "learning_rate": 4.355585307510675e-07, "logits/chosen": -2.8927388191223145, "logits/rejected": -2.8774728775024414, "logps/chosen": -243.9031219482422, "logps/rejected": -219.56820678710938, "loss": 0.543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8626550436019897, "rewards/margins": 0.6962482333183289, "rewards/rejected": -1.5589033365249634, "step": 1780 }, { "epoch": 0.9366823652537938, "grad_norm": 67.89499594122816, "learning_rate": 4.345348848262068e-07, "logits/chosen": -2.9423208236694336, "logits/rejected": -2.9689524173736572, "logps/chosen": -328.1993103027344, "logps/rejected": -333.61737060546875, "loss": 0.4952, "rewards/accuracies": 0.75, "rewards/chosen": -0.3705378472805023, "rewards/margins": 1.0374945402145386, "rewards/rejected": -1.4080322980880737, "step": 1790 }, { "epoch": 0.9419152276295133, "grad_norm": 90.12114112115178, "learning_rate": 4.33504395912601e-07, "logits/chosen": -2.802839517593384, "logits/rejected": -2.7483413219451904, "logps/chosen": -239.0595245361328, "logps/rejected": -282.29296875, "loss": 0.4933, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.033031940460205, "rewards/margins": 1.43924081325531, "rewards/rejected": -2.4722728729248047, "step": 1800 }, { "epoch": 0.9471480900052328, "grad_norm": 74.2369211953531, "learning_rate": 4.324671022232095e-07, "logits/chosen": -2.978280544281006, "logits/rejected": -2.9162352085113525, "logps/chosen": -262.71002197265625, "logps/rejected": -237.2100372314453, "loss": 0.529, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8442853093147278, "rewards/margins": 1.0907611846923828, "rewards/rejected": -1.9350464344024658, "step": 1810 }, { "epoch": 0.9523809523809523, "grad_norm": 65.58380637981043, "learning_rate": 4.314230422233286e-07, "logits/chosen": -2.886277675628662, "logits/rejected": -2.8241159915924072, "logps/chosen": -219.9335479736328, "logps/rejected": -196.04000854492188, "loss": 0.5326, "rewards/accuracies": 0.75, "rewards/chosen": -0.7239634394645691, "rewards/margins": 1.0665786266326904, "rewards/rejected": -1.7905420064926147, "step": 1820 }, { "epoch": 0.957613814756672, "grad_norm": 97.31347703765975, "learning_rate": 4.303722546291655e-07, "logits/chosen": -3.033245086669922, "logits/rejected": -2.995089530944824, "logps/chosen": -288.9032287597656, "logps/rejected": -251.2638397216797, "loss": 0.5652, "rewards/accuracies": 0.625, "rewards/chosen": -0.8516397476196289, "rewards/margins": 0.8074023127555847, "rewards/rejected": -1.6590421199798584, "step": 1830 }, { "epoch": 0.9628466771323915, "grad_norm": 82.82559537740073, "learning_rate": 4.2931477840640243e-07, "logits/chosen": -2.971721649169922, "logits/rejected": -2.8376855850219727, "logps/chosen": -332.58294677734375, "logps/rejected": -284.28387451171875, "loss": 0.5048, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8231459856033325, "rewards/margins": 1.1778504848480225, "rewards/rejected": -2.0009963512420654, "step": 1840 }, { "epoch": 0.968079539508111, "grad_norm": 80.52647552234426, "learning_rate": 4.282506527687517e-07, "logits/chosen": -2.886962652206421, "logits/rejected": -2.8580102920532227, "logps/chosen": -374.1407775878906, "logps/rejected": -310.34735107421875, "loss": 0.5194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.33071595430374146, "rewards/margins": 1.2728602886199951, "rewards/rejected": -1.6035760641098022, "step": 1850 }, { "epoch": 0.9733124018838305, "grad_norm": 64.28036259389684, "learning_rate": 4.271799171765016e-07, "logits/chosen": -2.9039340019226074, "logits/rejected": -2.779991626739502, "logps/chosen": -326.9325256347656, "logps/rejected": -248.93505859375, "loss": 0.5009, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.903876781463623, "rewards/margins": 0.9615135192871094, "rewards/rejected": -1.8653901815414429, "step": 1860 }, { "epoch": 0.97854526425955, "grad_norm": 113.20755555794447, "learning_rate": 4.2610261133505323e-07, "logits/chosen": -2.9402170181274414, "logits/rejected": -2.8585944175720215, "logps/chosen": -249.55673217773438, "logps/rejected": -245.0098114013672, "loss": 0.5251, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6474102735519409, "rewards/margins": 0.8552484512329102, "rewards/rejected": -1.5026586055755615, "step": 1870 }, { "epoch": 0.9837781266352695, "grad_norm": 72.99593226897008, "learning_rate": 4.250187751934479e-07, "logits/chosen": -2.986501455307007, "logits/rejected": -3.03943133354187, "logps/chosen": -253.8631591796875, "logps/rejected": -326.8650817871094, "loss": 0.482, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4290796220302582, "rewards/margins": 1.1297729015350342, "rewards/rejected": -1.5588525533676147, "step": 1880 }, { "epoch": 0.989010989010989, "grad_norm": 83.73527655023146, "learning_rate": 4.2392844894288605e-07, "logits/chosen": -2.9131972789764404, "logits/rejected": -2.8745169639587402, "logps/chosen": -392.22113037109375, "logps/rejected": -340.94036865234375, "loss": 0.5179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7516988515853882, "rewards/margins": 0.8088757395744324, "rewards/rejected": -1.5605746507644653, "step": 1890 }, { "epoch": 0.9942438513867086, "grad_norm": 72.98333786453112, "learning_rate": 4.2283167301523634e-07, "logits/chosen": -3.012432336807251, "logits/rejected": -2.904952049255371, "logps/chosen": -229.73739624023438, "logps/rejected": -234.2970733642578, "loss": 0.5293, "rewards/accuracies": 0.75, "rewards/chosen": -0.9742595553398132, "rewards/margins": 1.050612211227417, "rewards/rejected": -2.024871826171875, "step": 1900 }, { "epoch": 0.9994767137624281, "grad_norm": 103.09163962748354, "learning_rate": 4.217284880815369e-07, "logits/chosen": -2.9096579551696777, "logits/rejected": -2.92535662651062, "logps/chosen": -332.30694580078125, "logps/rejected": -341.6778564453125, "loss": 0.528, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6502354741096497, "rewards/margins": 1.4920222759246826, "rewards/rejected": -2.1422576904296875, "step": 1910 }, { "epoch": 1.0047095761381475, "grad_norm": 45.45151110053572, "learning_rate": 4.2061893505048694e-07, "logits/chosen": -2.9303088188171387, "logits/rejected": -2.910794734954834, "logps/chosen": -195.44097900390625, "logps/rejected": -268.0443420410156, "loss": 0.168, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.13074231147766113, "rewards/margins": 2.848576545715332, "rewards/rejected": -2.717834234237671, "step": 1920 }, { "epoch": 1.0099424385138671, "grad_norm": 17.745483065934703, "learning_rate": 4.1950305506692967e-07, "logits/chosen": -3.0389623641967773, "logits/rejected": -2.912374973297119, "logps/chosen": -310.67724609375, "logps/rejected": -288.9321594238281, "loss": 0.1027, "rewards/accuracies": 1.0, "rewards/chosen": 1.0739758014678955, "rewards/margins": 4.982864856719971, "rewards/rejected": -3.9088892936706543, "step": 1930 }, { "epoch": 1.0151753008895865, "grad_norm": 20.133920280884652, "learning_rate": 4.1838088951032656e-07, "logits/chosen": -2.8195080757141113, "logits/rejected": -2.793968677520752, "logps/chosen": -335.138427734375, "logps/rejected": -314.4246520996094, "loss": 0.1198, "rewards/accuracies": 1.0, "rewards/chosen": 0.8558192253112793, "rewards/margins": 5.03140926361084, "rewards/rejected": -4.1755900382995605, "step": 1940 }, { "epoch": 1.0204081632653061, "grad_norm": 15.728642010828425, "learning_rate": 4.172524799932231e-07, "logits/chosen": -2.9151108264923096, "logits/rejected": -2.876267910003662, "logps/chosen": -209.6746826171875, "logps/rejected": -280.47491455078125, "loss": 0.1023, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05758916214108467, "rewards/margins": 3.8056609630584717, "rewards/rejected": -3.7480721473693848, "step": 1950 }, { "epoch": 1.0256410256410255, "grad_norm": 19.77656145760191, "learning_rate": 4.161178683597054e-07, "logits/chosen": -3.020782947540283, "logits/rejected": -2.8830416202545166, "logps/chosen": -251.02490234375, "logps/rejected": -230.2695770263672, "loss": 0.0928, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.22050175070762634, "rewards/margins": 3.800748109817505, "rewards/rejected": -4.021250247955322, "step": 1960 }, { "epoch": 1.0308738880167452, "grad_norm": 45.037676964021735, "learning_rate": 4.1497709668384885e-07, "logits/chosen": -2.987654209136963, "logits/rejected": -2.9256956577301025, "logps/chosen": -335.3683166503906, "logps/rejected": -320.40216064453125, "loss": 0.0968, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6408755779266357, "rewards/margins": 5.110548973083496, "rewards/rejected": -4.469674110412598, "step": 1970 }, { "epoch": 1.0361067503924646, "grad_norm": 34.980472502713724, "learning_rate": 4.1383020726815745e-07, "logits/chosen": -2.988145351409912, "logits/rejected": -2.8947675228118896, "logps/chosen": -240.918701171875, "logps/rejected": -275.3710021972656, "loss": 0.1078, "rewards/accuracies": 0.875, "rewards/chosen": -0.5477129817008972, "rewards/margins": 3.915433406829834, "rewards/rejected": -4.463146686553955, "step": 1980 }, { "epoch": 1.0413396127681842, "grad_norm": 23.237301345593345, "learning_rate": 4.126772426419959e-07, "logits/chosen": -2.8984646797180176, "logits/rejected": -2.911050319671631, "logps/chosen": -254.62478637695312, "logps/rejected": -298.11767578125, "loss": 0.1464, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.25509607791900635, "rewards/margins": 3.721541166305542, "rewards/rejected": -3.976637363433838, "step": 1990 }, { "epoch": 1.0465724751439036, "grad_norm": 14.701845192629634, "learning_rate": 4.1151824556001145e-07, "logits/chosen": -2.9849514961242676, "logits/rejected": -2.929068088531494, "logps/chosen": -223.71517944335938, "logps/rejected": -289.09674072265625, "loss": 0.1188, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.07411936670541763, "rewards/margins": 4.153815269470215, "rewards/rejected": -4.227934837341309, "step": 2000 }, { "epoch": 1.0465724751439036, "eval_logits/chosen": -2.947195291519165, "eval_logits/rejected": -2.9056789875030518, "eval_logps/chosen": -272.0160827636719, "eval_logps/rejected": -291.4879455566406, "eval_loss": 0.5384762287139893, "eval_rewards/accuracies": 0.78515625, "eval_rewards/chosen": -1.313662052154541, "eval_rewards/margins": 1.5186362266540527, "eval_rewards/rejected": -2.8322982788085938, "eval_runtime": 96.9405, "eval_samples_per_second": 20.631, "eval_steps_per_second": 0.33, "step": 2000 }, { "epoch": 1.0518053375196232, "grad_norm": 5.4898175926196435, "learning_rate": 4.103532590005495e-07, "logits/chosen": -3.0342893600463867, "logits/rejected": -2.9569826126098633, "logps/chosen": -263.30303955078125, "logps/rejected": -244.82742309570312, "loss": 0.0843, "rewards/accuracies": 1.0, "rewards/chosen": 0.5069825053215027, "rewards/margins": 5.234126091003418, "rewards/rejected": -4.72714376449585, "step": 2010 }, { "epoch": 1.0570381998953426, "grad_norm": 11.25845376335133, "learning_rate": 4.091823261640592e-07, "logits/chosen": -2.9733669757843018, "logits/rejected": -2.8974101543426514, "logps/chosen": -239.6688232421875, "logps/rejected": -249.09158325195312, "loss": 0.1125, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7533796429634094, "rewards/margins": 5.6069254875183105, "rewards/rejected": -4.853545665740967, "step": 2020 }, { "epoch": 1.0622710622710623, "grad_norm": 8.19083172472987, "learning_rate": 4.080054904714917e-07, "logits/chosen": -2.9797985553741455, "logits/rejected": -2.9285874366760254, "logps/chosen": -233.4443359375, "logps/rejected": -268.8140869140625, "loss": 0.1038, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.46526265144348145, "rewards/margins": 4.018275260925293, "rewards/rejected": -4.4835381507873535, "step": 2030 }, { "epoch": 1.0675039246467817, "grad_norm": 9.248653125902187, "learning_rate": 4.0682279556268993e-07, "logits/chosen": -2.9472954273223877, "logits/rejected": -2.925903081893921, "logps/chosen": -313.2281188964844, "logps/rejected": -345.99652099609375, "loss": 0.0841, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2679436206817627, "rewards/margins": 5.440550327301025, "rewards/rejected": -5.172606945037842, "step": 2040 }, { "epoch": 1.0727367870225013, "grad_norm": 63.835597118607524, "learning_rate": 4.056342852947706e-07, "logits/chosen": -3.0942771434783936, "logits/rejected": -2.9428200721740723, "logps/chosen": -351.092041015625, "logps/rejected": -334.16302490234375, "loss": 0.0953, "rewards/accuracies": 1.0, "rewards/chosen": 0.018678603693842888, "rewards/margins": 5.577530384063721, "rewards/rejected": -5.55885124206543, "step": 2050 }, { "epoch": 1.077969649398221, "grad_norm": 16.869312857912206, "learning_rate": 4.044400037404973e-07, "logits/chosen": -2.992830514907837, "logits/rejected": -2.9382407665252686, "logps/chosen": -206.66207885742188, "logps/rejected": -231.7269287109375, "loss": 0.0982, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.14568844437599182, "rewards/margins": 4.139792442321777, "rewards/rejected": -4.2854814529418945, "step": 2060 }, { "epoch": 1.0832025117739403, "grad_norm": 18.80230401570007, "learning_rate": 4.032399951866468e-07, "logits/chosen": -2.901803493499756, "logits/rejected": -2.8125548362731934, "logps/chosen": -217.5782470703125, "logps/rejected": -230.0740509033203, "loss": 0.1058, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6830385327339172, "rewards/margins": 3.996809482574463, "rewards/rejected": -4.679847717285156, "step": 2070 }, { "epoch": 1.08843537414966, "grad_norm": 12.769407460238632, "learning_rate": 4.0203430413236637e-07, "logits/chosen": -3.0240349769592285, "logits/rejected": -2.9826395511627197, "logps/chosen": -281.54083251953125, "logps/rejected": -336.9918518066406, "loss": 0.1116, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.060561299324035645, "rewards/margins": 4.970131874084473, "rewards/rejected": -5.0306925773620605, "step": 2080 }, { "epoch": 1.0936682365253794, "grad_norm": 24.862971968501444, "learning_rate": 4.0082297528752407e-07, "logits/chosen": -2.9137930870056152, "logits/rejected": -2.8261656761169434, "logps/chosen": -186.62130737304688, "logps/rejected": -241.3701171875, "loss": 0.1014, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.18522152304649353, "rewards/margins": 4.848191738128662, "rewards/rejected": -5.033412933349609, "step": 2090 }, { "epoch": 1.098901098901099, "grad_norm": 33.207637255161075, "learning_rate": 3.9960605357105e-07, "logits/chosen": -2.9810373783111572, "logits/rejected": -2.9175047874450684, "logps/chosen": -265.79638671875, "logps/rejected": -289.8681640625, "loss": 0.1094, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.41211724281311035, "rewards/margins": 4.841708183288574, "rewards/rejected": -5.253826141357422, "step": 2100 }, { "epoch": 1.1041339612768184, "grad_norm": 12.889413654124535, "learning_rate": 3.983835841092716e-07, "logits/chosen": -2.988010883331299, "logits/rejected": -2.8389334678649902, "logps/chosen": -298.5379333496094, "logps/rejected": -243.05020141601562, "loss": 0.1132, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.21895161271095276, "rewards/margins": 4.597933292388916, "rewards/rejected": -4.816884517669678, "step": 2110 }, { "epoch": 1.109366823652538, "grad_norm": 44.34027791263567, "learning_rate": 3.971556122342398e-07, "logits/chosen": -2.9909846782684326, "logits/rejected": -2.9145379066467285, "logps/chosen": -256.9250793457031, "logps/rejected": -251.4674072265625, "loss": 0.1147, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.36800098419189453, "rewards/margins": 3.637465715408325, "rewards/rejected": -4.005466938018799, "step": 2120 }, { "epoch": 1.1145996860282574, "grad_norm": 37.40175870472002, "learning_rate": 3.9592218348204766e-07, "logits/chosen": -3.0402302742004395, "logits/rejected": -2.942626476287842, "logps/chosen": -273.3143615722656, "logps/rejected": -279.45196533203125, "loss": 0.0972, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.3278760015964508, "rewards/margins": 3.79376220703125, "rewards/rejected": -4.12163782119751, "step": 2130 }, { "epoch": 1.119832548403977, "grad_norm": 23.201646210003346, "learning_rate": 3.946833435911423e-07, "logits/chosen": -3.0641751289367676, "logits/rejected": -2.9351677894592285, "logps/chosen": -235.9039764404297, "logps/rejected": -250.8444366455078, "loss": 0.1242, "rewards/accuracies": 1.0, "rewards/chosen": -0.040074944496154785, "rewards/margins": 5.134432315826416, "rewards/rejected": -5.174508094787598, "step": 2140 }, { "epoch": 1.1250654107796965, "grad_norm": 12.524390221499234, "learning_rate": 3.9343913850062856e-07, "logits/chosen": -2.9544901847839355, "logits/rejected": -3.0499377250671387, "logps/chosen": -213.7620849609375, "logps/rejected": -315.26361083984375, "loss": 0.1188, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.7512770295143127, "rewards/margins": 4.217713356018066, "rewards/rejected": -4.968990325927734, "step": 2150 }, { "epoch": 1.130298273155416, "grad_norm": 15.47961194431284, "learning_rate": 3.921896143485657e-07, "logits/chosen": -2.9674363136291504, "logits/rejected": -2.914391040802002, "logps/chosen": -266.4778137207031, "logps/rejected": -287.54742431640625, "loss": 0.1359, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5546332597732544, "rewards/margins": 4.210048675537109, "rewards/rejected": -4.764682292938232, "step": 2160 }, { "epoch": 1.1355311355311355, "grad_norm": 36.433324301924145, "learning_rate": 3.9093481747025615e-07, "logits/chosen": -3.1089160442352295, "logits/rejected": -3.019096851348877, "logps/chosen": -295.01739501953125, "logps/rejected": -297.8597106933594, "loss": 0.0959, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.24217386543750763, "rewards/margins": 4.738219738006592, "rewards/rejected": -4.980393886566162, "step": 2170 }, { "epoch": 1.1407639979068551, "grad_norm": 13.893960640459499, "learning_rate": 3.896747943965275e-07, "logits/chosen": -3.0672404766082764, "logits/rejected": -2.9155139923095703, "logps/chosen": -244.2855987548828, "logps/rejected": -269.7584228515625, "loss": 0.0932, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7278806567192078, "rewards/margins": 4.4255900382995605, "rewards/rejected": -5.153470993041992, "step": 2180 }, { "epoch": 1.1459968602825745, "grad_norm": 7.336646579634609, "learning_rate": 3.8840959185200717e-07, "logits/chosen": -2.9617323875427246, "logits/rejected": -3.0029149055480957, "logps/chosen": -257.20550537109375, "logps/rejected": -280.87457275390625, "loss": 0.0885, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.21207909286022186, "rewards/margins": 4.368792533874512, "rewards/rejected": -4.580871105194092, "step": 2190 }, { "epoch": 1.1512297226582942, "grad_norm": 34.27947221758922, "learning_rate": 3.871392567533893e-07, "logits/chosen": -3.0441932678222656, "logits/rejected": -2.945539951324463, "logps/chosen": -306.1650390625, "logps/rejected": -307.3845520019531, "loss": 0.0869, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3887832462787628, "rewards/margins": 4.506525993347168, "rewards/rejected": -4.895309925079346, "step": 2200 }, { "epoch": 1.1564625850340136, "grad_norm": 27.95632068581484, "learning_rate": 3.858638362076953e-07, "logits/chosen": -2.955258846282959, "logits/rejected": -2.8569209575653076, "logps/chosen": -266.22637939453125, "logps/rejected": -279.43621826171875, "loss": 0.0934, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.10717006772756577, "rewards/margins": 4.3686370849609375, "rewards/rejected": -4.475807189941406, "step": 2210 }, { "epoch": 1.1616954474097332, "grad_norm": 16.658669512680717, "learning_rate": 3.845833775105272e-07, "logits/chosen": -2.9852311611175537, "logits/rejected": -2.973335027694702, "logps/chosen": -241.55813598632812, "logps/rejected": -304.9690856933594, "loss": 0.0666, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1673494279384613, "rewards/margins": 5.362492084503174, "rewards/rejected": -5.195141792297363, "step": 2220 }, { "epoch": 1.1669283097854526, "grad_norm": 40.28487869574936, "learning_rate": 3.832979281443133e-07, "logits/chosen": -3.048990488052368, "logits/rejected": -3.0380141735076904, "logps/chosen": -248.06668090820312, "logps/rejected": -279.40386962890625, "loss": 0.1005, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.15961989760398865, "rewards/margins": 4.502907752990723, "rewards/rejected": -4.343287467956543, "step": 2230 }, { "epoch": 1.1721611721611722, "grad_norm": 18.387745709398864, "learning_rate": 3.8200753577654765e-07, "logits/chosen": -3.0344555377960205, "logits/rejected": -2.9449851512908936, "logps/chosen": -235.535888671875, "logps/rejected": -283.6649475097656, "loss": 0.1201, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4938398003578186, "rewards/margins": 4.636073589324951, "rewards/rejected": -5.129913330078125, "step": 2240 }, { "epoch": 1.1773940345368916, "grad_norm": 24.597628806920685, "learning_rate": 3.8071224825802273e-07, "logits/chosen": -3.0635933876037598, "logits/rejected": -3.0628323554992676, "logps/chosen": -291.4552307128906, "logps/rejected": -367.97503662109375, "loss": 0.0876, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2545695900917053, "rewards/margins": 4.914561748504639, "rewards/rejected": -5.169131278991699, "step": 2250 }, { "epoch": 1.1826268969126112, "grad_norm": 19.389935919599534, "learning_rate": 3.7941211362105453e-07, "logits/chosen": -3.0812182426452637, "logits/rejected": -2.9905319213867188, "logps/chosen": -300.0808410644531, "logps/rejected": -358.9996643066406, "loss": 0.09, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.12481093406677246, "rewards/margins": 5.00175666809082, "rewards/rejected": -4.876945495605469, "step": 2260 }, { "epoch": 1.1878597592883307, "grad_norm": 25.06592048237873, "learning_rate": 3.781071800777017e-07, "logits/chosen": -2.910217761993408, "logits/rejected": -2.8946588039398193, "logps/chosen": -292.1333312988281, "logps/rejected": -329.64154052734375, "loss": 0.0932, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.06385570019483566, "rewards/margins": 5.579528331756592, "rewards/rejected": -5.643383979797363, "step": 2270 }, { "epoch": 1.1930926216640503, "grad_norm": 36.764874316766445, "learning_rate": 3.767974960179776e-07, "logits/chosen": -3.024369478225708, "logits/rejected": -2.9892513751983643, "logps/chosen": -242.84286499023438, "logps/rejected": -282.8974304199219, "loss": 0.0779, "rewards/accuracies": 1.0, "rewards/chosen": -0.6096557974815369, "rewards/margins": 4.790216445922852, "rewards/rejected": -5.399872779846191, "step": 2280 }, { "epoch": 1.1983254840397697, "grad_norm": 27.90447287398062, "learning_rate": 3.7548311000805605e-07, "logits/chosen": -2.9292654991149902, "logits/rejected": -2.9386374950408936, "logps/chosen": -255.7118377685547, "logps/rejected": -350.54693603515625, "loss": 0.1002, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.3197168707847595, "rewards/margins": 5.395267486572266, "rewards/rejected": -5.714983940124512, "step": 2290 }, { "epoch": 1.2035583464154893, "grad_norm": 12.347204911318942, "learning_rate": 3.7416407078847015e-07, "logits/chosen": -3.0850939750671387, "logits/rejected": -3.0658371448516846, "logps/chosen": -286.9936218261719, "logps/rejected": -343.52215576171875, "loss": 0.0961, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.49446773529052734, "rewards/margins": 4.931674480438232, "rewards/rejected": -5.426142692565918, "step": 2300 }, { "epoch": 1.2087912087912087, "grad_norm": 22.59463524668268, "learning_rate": 3.7284042727230506e-07, "logits/chosen": -3.064293384552002, "logits/rejected": -2.9377732276916504, "logps/chosen": -214.724853515625, "logps/rejected": -267.16845703125, "loss": 0.1058, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6700494289398193, "rewards/margins": 5.030571460723877, "rewards/rejected": -5.700620174407959, "step": 2310 }, { "epoch": 1.2140240711669283, "grad_norm": 23.913733296036085, "learning_rate": 3.7151222854338413e-07, "logits/chosen": -3.0494749546051025, "logits/rejected": -2.8941361904144287, "logps/chosen": -303.7664489746094, "logps/rejected": -314.77801513671875, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 0.13266456127166748, "rewards/margins": 6.055363655090332, "rewards/rejected": -5.922699451446533, "step": 2320 }, { "epoch": 1.2192569335426477, "grad_norm": 22.912357258166526, "learning_rate": 3.701795238544488e-07, "logits/chosen": -3.0133302211761475, "logits/rejected": -2.944756031036377, "logps/chosen": -291.0356750488281, "logps/rejected": -322.6231689453125, "loss": 0.0977, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2619327902793884, "rewards/margins": 5.150561332702637, "rewards/rejected": -5.412493705749512, "step": 2330 }, { "epoch": 1.2244897959183674, "grad_norm": 12.600218037639086, "learning_rate": 3.688423626253318e-07, "logits/chosen": -2.86317777633667, "logits/rejected": -2.9234671592712402, "logps/chosen": -205.64169311523438, "logps/rejected": -267.68408203125, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": -0.4902506470680237, "rewards/margins": 5.09896183013916, "rewards/rejected": -5.589212894439697, "step": 2340 }, { "epoch": 1.2297226582940868, "grad_norm": 18.15504015192863, "learning_rate": 3.675007944411253e-07, "logits/chosen": -3.0373623371124268, "logits/rejected": -2.9609227180480957, "logps/chosen": -286.0845947265625, "logps/rejected": -278.6300964355469, "loss": 0.1502, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.06021306663751602, "rewards/margins": 4.626088619232178, "rewards/rejected": -4.56587553024292, "step": 2350 }, { "epoch": 1.2349555206698064, "grad_norm": 16.32453139836447, "learning_rate": 3.6615486905034167e-07, "logits/chosen": -3.0100817680358887, "logits/rejected": -2.9484076499938965, "logps/chosen": -298.12603759765625, "logps/rejected": -295.726318359375, "loss": 0.0798, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3867092728614807, "rewards/margins": 4.279193878173828, "rewards/rejected": -4.665903091430664, "step": 2360 }, { "epoch": 1.2401883830455258, "grad_norm": 24.95834600094077, "learning_rate": 3.6480463636306846e-07, "logits/chosen": -3.031393527984619, "logits/rejected": -2.9798049926757812, "logps/chosen": -313.07208251953125, "logps/rejected": -337.24041748046875, "loss": 0.0986, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5317610502243042, "rewards/margins": 4.316946506500244, "rewards/rejected": -4.848707675933838, "step": 2370 }, { "epoch": 1.2454212454212454, "grad_norm": 21.77498359653708, "learning_rate": 3.634501464491183e-07, "logits/chosen": -3.0181055068969727, "logits/rejected": -2.9522647857666016, "logps/chosen": -243.27285766601562, "logps/rejected": -307.166015625, "loss": 0.087, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.3513970971107483, "rewards/margins": 4.9188337326049805, "rewards/rejected": -5.270231246948242, "step": 2380 }, { "epoch": 1.250654107796965, "grad_norm": 18.79354472746182, "learning_rate": 3.6209144953617175e-07, "logits/chosen": -2.804724931716919, "logits/rejected": -2.8318138122558594, "logps/chosen": -348.09234619140625, "logps/rejected": -440.3851623535156, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": -0.36119604110717773, "rewards/margins": 6.297250270843506, "rewards/rejected": -6.658445835113525, "step": 2390 }, { "epoch": 1.2558869701726845, "grad_norm": 33.47192035215277, "learning_rate": 3.607285960079146e-07, "logits/chosen": -3.0428807735443115, "logits/rejected": -2.9662954807281494, "logps/chosen": -316.5406799316406, "logps/rejected": -343.9951477050781, "loss": 0.1014, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.10287211090326309, "rewards/margins": 6.410679817199707, "rewards/rejected": -6.307806968688965, "step": 2400 }, { "epoch": 1.2611198325484039, "grad_norm": 57.89479100307633, "learning_rate": 3.593616364021701e-07, "logits/chosen": -3.063929796218872, "logits/rejected": -2.9722073078155518, "logps/chosen": -293.91949462890625, "logps/rejected": -339.8097839355469, "loss": 0.1156, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2667938768863678, "rewards/margins": 5.829390048980713, "rewards/rejected": -6.096183776855469, "step": 2410 }, { "epoch": 1.2663526949241235, "grad_norm": 40.901849548303474, "learning_rate": 3.5799062140902413e-07, "logits/chosen": -2.9695992469787598, "logits/rejected": -2.8691701889038086, "logps/chosen": -311.8653869628906, "logps/rejected": -302.96405029296875, "loss": 0.115, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3258931636810303, "rewards/margins": 5.126950263977051, "rewards/rejected": -5.452843189239502, "step": 2420 }, { "epoch": 1.2715855572998431, "grad_norm": 27.015625850620776, "learning_rate": 3.566156018689462e-07, "logits/chosen": -3.0149261951446533, "logits/rejected": -2.79899263381958, "logps/chosen": -273.6623840332031, "logps/rejected": -254.77432250976562, "loss": 0.1253, "rewards/accuracies": 1.0, "rewards/chosen": -1.2927415370941162, "rewards/margins": 4.26215124130249, "rewards/rejected": -5.554893493652344, "step": 2430 }, { "epoch": 1.2768184196755625, "grad_norm": 33.57403454983984, "learning_rate": 3.552366287709038e-07, "logits/chosen": -2.9264512062072754, "logits/rejected": -2.9936349391937256, "logps/chosen": -307.03106689453125, "logps/rejected": -345.5057678222656, "loss": 0.0906, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.39752739667892456, "rewards/margins": 6.203015327453613, "rewards/rejected": -6.6005425453186035, "step": 2440 }, { "epoch": 1.282051282051282, "grad_norm": 48.81356299428572, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -2.9586989879608154, "logits/rejected": -2.9582111835479736, "logps/chosen": -259.7229309082031, "logps/rejected": -297.52398681640625, "loss": 0.0816, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5779643654823303, "rewards/margins": 5.091158390045166, "rewards/rejected": -5.669122219085693, "step": 2450 }, { "epoch": 1.2872841444270016, "grad_norm": 16.728274429679868, "learning_rate": 3.524670265879353e-07, "logits/chosen": -2.963186264038086, "logits/rejected": -2.899862289428711, "logps/chosen": -227.5242156982422, "logps/rejected": -254.6035919189453, "loss": 0.1095, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5539234280586243, "rewards/margins": 4.831884384155273, "rewards/rejected": -5.385807991027832, "step": 2460 }, { "epoch": 1.2925170068027212, "grad_norm": 9.24670068641855, "learning_rate": 3.510765002063901e-07, "logits/chosen": -2.9469804763793945, "logits/rejected": -2.936998128890991, "logps/chosen": -253.8692626953125, "logps/rejected": -324.8531188964844, "loss": 0.0844, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8353679776191711, "rewards/margins": 5.073844909667969, "rewards/rejected": -5.909212589263916, "step": 2470 }, { "epoch": 1.2977498691784406, "grad_norm": 29.10829524372704, "learning_rate": 3.4968222566983367e-07, "logits/chosen": -3.0463242530822754, "logits/rejected": -2.9150710105895996, "logps/chosen": -256.89178466796875, "logps/rejected": -255.098876953125, "loss": 0.1317, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.0427782535552979, "rewards/margins": 4.410581111907959, "rewards/rejected": -5.453359127044678, "step": 2480 }, { "epoch": 1.30298273155416, "grad_norm": 19.643232257292567, "learning_rate": 3.482842546812543e-07, "logits/chosen": -3.018735647201538, "logits/rejected": -2.915055751800537, "logps/chosen": -340.3421325683594, "logps/rejected": -334.6133728027344, "loss": 0.0743, "rewards/accuracies": 1.0, "rewards/chosen": -0.4724584221839905, "rewards/margins": 5.241097927093506, "rewards/rejected": -5.713556289672852, "step": 2490 }, { "epoch": 1.3082155939298796, "grad_norm": 11.683549968360957, "learning_rate": 3.4688263908071307e-07, "logits/chosen": -2.93518328666687, "logits/rejected": -2.8668971061706543, "logps/chosen": -232.9616241455078, "logps/rejected": -271.49127197265625, "loss": 0.1007, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8176862597465515, "rewards/margins": 4.783071517944336, "rewards/rejected": -5.600757598876953, "step": 2500 }, { "epoch": 1.3134484563055993, "grad_norm": 17.311928746279335, "learning_rate": 3.454774308434222e-07, "logits/chosen": -2.9635722637176514, "logits/rejected": -2.9227237701416016, "logps/chosen": -243.4211883544922, "logps/rejected": -348.0363464355469, "loss": 0.0867, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.40717801451683044, "rewards/margins": 5.487346172332764, "rewards/rejected": -5.894524574279785, "step": 2510 }, { "epoch": 1.3186813186813187, "grad_norm": 23.105945810965956, "learning_rate": 3.4406868207781725e-07, "logits/chosen": -2.967461585998535, "logits/rejected": -2.9149880409240723, "logps/chosen": -243.19192504882812, "logps/rejected": -241.4561309814453, "loss": 0.1046, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.30437684059143066, "rewards/margins": 5.423561096191406, "rewards/rejected": -5.727938175201416, "step": 2520 }, { "epoch": 1.323914181057038, "grad_norm": 24.251384889199887, "learning_rate": 3.426564450236249e-07, "logits/chosen": -2.9932351112365723, "logits/rejected": -2.858457326889038, "logps/chosen": -265.11962890625, "logps/rejected": -266.43304443359375, "loss": 0.0964, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9620921015739441, "rewards/margins": 5.041953086853027, "rewards/rejected": -6.004045009613037, "step": 2530 }, { "epoch": 1.3291470434327577, "grad_norm": 107.79278190978536, "learning_rate": 3.4124077204992576e-07, "logits/chosen": -2.8453898429870605, "logits/rejected": -2.8205246925354004, "logps/chosen": -194.77041625976562, "logps/rejected": -279.3961181640625, "loss": 0.114, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.028129959478974342, "rewards/margins": 6.191204071044922, "rewards/rejected": -6.163074493408203, "step": 2540 }, { "epoch": 1.3343799058084773, "grad_norm": 13.559273580038889, "learning_rate": 3.398217156532125e-07, "logits/chosen": -3.0244803428649902, "logits/rejected": -2.917450428009033, "logps/chosen": -288.37860107421875, "logps/rejected": -313.8291015625, "loss": 0.0807, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.47130241990089417, "rewards/margins": 5.818126201629639, "rewards/rejected": -6.2894287109375, "step": 2550 }, { "epoch": 1.3396127681841967, "grad_norm": 15.109708656167296, "learning_rate": 3.383993284554431e-07, "logits/chosen": -2.998115301132202, "logits/rejected": -2.9511008262634277, "logps/chosen": -261.0711669921875, "logps/rejected": -302.484375, "loss": 0.0859, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.46448755264282227, "rewards/margins": 5.904187202453613, "rewards/rejected": -6.3686747550964355, "step": 2560 }, { "epoch": 1.3448456305599163, "grad_norm": 35.260147014871336, "learning_rate": 3.3697366320208955e-07, "logits/chosen": -2.934652090072632, "logits/rejected": -2.8837547302246094, "logps/chosen": -297.14849853515625, "logps/rejected": -319.05987548828125, "loss": 0.0797, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.470604807138443, "rewards/margins": 5.588192462921143, "rewards/rejected": -6.058797359466553, "step": 2570 }, { "epoch": 1.3500784929356358, "grad_norm": 43.17004286789083, "learning_rate": 3.355447727601816e-07, "logits/chosen": -2.917672634124756, "logits/rejected": -2.8100531101226807, "logps/chosen": -260.29156494140625, "logps/rejected": -310.83660888671875, "loss": 0.1092, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8943957090377808, "rewards/margins": 5.425764560699463, "rewards/rejected": -6.320160388946533, "step": 2580 }, { "epoch": 1.3553113553113554, "grad_norm": 33.66713831825435, "learning_rate": 3.3411271011634697e-07, "logits/chosen": -2.935502290725708, "logits/rejected": -2.963707208633423, "logps/chosen": -308.1151428222656, "logps/rejected": -371.8252258300781, "loss": 0.1231, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8715413808822632, "rewards/margins": 5.250386714935303, "rewards/rejected": -6.1219282150268555, "step": 2590 }, { "epoch": 1.3605442176870748, "grad_norm": 27.40014278776782, "learning_rate": 3.3267752837484587e-07, "logits/chosen": -2.906586170196533, "logits/rejected": -2.8765156269073486, "logps/chosen": -235.1721649169922, "logps/rejected": -272.4697265625, "loss": 0.1013, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5821476578712463, "rewards/margins": 4.716443061828613, "rewards/rejected": -5.298590660095215, "step": 2600 }, { "epoch": 1.3657770800627944, "grad_norm": 55.68300444548438, "learning_rate": 3.31239280755602e-07, "logits/chosen": -2.9758920669555664, "logits/rejected": -2.8897440433502197, "logps/chosen": -301.491455078125, "logps/rejected": -302.6290588378906, "loss": 0.1001, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6889128684997559, "rewards/margins": 4.537497520446777, "rewards/rejected": -5.226410865783691, "step": 2610 }, { "epoch": 1.3710099424385138, "grad_norm": 16.111513200501577, "learning_rate": 3.2979802059222936e-07, "logits/chosen": -2.9727206230163574, "logits/rejected": -2.8671393394470215, "logps/chosen": -290.49627685546875, "logps/rejected": -274.0350646972656, "loss": 0.0964, "rewards/accuracies": 1.0, "rewards/chosen": -0.45807400345802307, "rewards/margins": 4.405587196350098, "rewards/rejected": -4.86366081237793, "step": 2620 }, { "epoch": 1.3762428048142334, "grad_norm": 26.28131631460771, "learning_rate": 3.283538013300537e-07, "logits/chosen": -2.8408546447753906, "logits/rejected": -2.849194049835205, "logps/chosen": -219.34017944335938, "logps/rejected": -316.5408020019531, "loss": 0.0874, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6535199880599976, "rewards/margins": 5.10781192779541, "rewards/rejected": -5.761332035064697, "step": 2630 }, { "epoch": 1.3814756671899528, "grad_norm": 18.259198952666996, "learning_rate": 3.269066765241314e-07, "logits/chosen": -2.9982829093933105, "logits/rejected": -2.9487242698669434, "logps/chosen": -278.7436828613281, "logps/rejected": -291.2223815917969, "loss": 0.0822, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9618307948112488, "rewards/margins": 4.594454765319824, "rewards/rejected": -5.556285381317139, "step": 2640 }, { "epoch": 1.3867085295656725, "grad_norm": 36.366621910908954, "learning_rate": 3.254566998372634e-07, "logits/chosen": -2.8699817657470703, "logits/rejected": -2.916234254837036, "logps/chosen": -211.3368377685547, "logps/rejected": -306.00018310546875, "loss": 0.1176, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.0387979745864868, "rewards/margins": 5.514708995819092, "rewards/rejected": -6.553507328033447, "step": 2650 }, { "epoch": 1.3919413919413919, "grad_norm": 29.736135080920207, "learning_rate": 3.2400392503800477e-07, "logits/chosen": -2.950291395187378, "logits/rejected": -2.9478626251220703, "logps/chosen": -304.45733642578125, "logps/rejected": -410.5011291503906, "loss": 0.0844, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.504435658454895, "rewards/margins": 5.371453762054443, "rewards/rejected": -5.875889778137207, "step": 2660 }, { "epoch": 1.3971742543171115, "grad_norm": 12.153794736792438, "learning_rate": 3.225484059986715e-07, "logits/chosen": -2.9667274951934814, "logits/rejected": -2.868407726287842, "logps/chosen": -256.7066650390625, "logps/rejected": -299.7010192871094, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": -0.7283914685249329, "rewards/margins": 4.870224952697754, "rewards/rejected": -5.598616123199463, "step": 2670 }, { "epoch": 1.402407116692831, "grad_norm": 34.509030932105254, "learning_rate": 3.2109019669334215e-07, "logits/chosen": -2.903280735015869, "logits/rejected": -2.840728521347046, "logps/chosen": -333.5550842285156, "logps/rejected": -356.84002685546875, "loss": 0.1051, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.48240870237350464, "rewards/margins": 5.71586275100708, "rewards/rejected": -6.198271751403809, "step": 2680 }, { "epoch": 1.4076399790685505, "grad_norm": 19.341058180086034, "learning_rate": 3.19629351195857e-07, "logits/chosen": -2.9245972633361816, "logits/rejected": -2.850804328918457, "logps/chosen": -253.52145385742188, "logps/rejected": -319.34320068359375, "loss": 0.0965, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3571365475654602, "rewards/margins": 5.33168888092041, "rewards/rejected": -5.6888251304626465, "step": 2690 }, { "epoch": 1.41287284144427, "grad_norm": 19.097733259089207, "learning_rate": 3.1816592367781236e-07, "logits/chosen": -2.8811697959899902, "logits/rejected": -2.7609665393829346, "logps/chosen": -315.1808776855469, "logps/rejected": -303.431396484375, "loss": 0.089, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.0400402545928955, "rewards/margins": 4.914624214172363, "rewards/rejected": -5.954664707183838, "step": 2700 }, { "epoch": 1.4181057038199896, "grad_norm": 40.606766761513114, "learning_rate": 3.166999684065521e-07, "logits/chosen": -2.9121406078338623, "logits/rejected": -2.821132183074951, "logps/chosen": -253.940673828125, "logps/rejected": -273.3531799316406, "loss": 0.1109, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8028062582015991, "rewards/margins": 4.605751991271973, "rewards/rejected": -5.408558368682861, "step": 2710 }, { "epoch": 1.423338566195709, "grad_norm": 21.600944792401865, "learning_rate": 3.1523153974315497e-07, "logits/chosen": -2.928520441055298, "logits/rejected": -2.910478115081787, "logps/chosen": -265.21697998046875, "logps/rejected": -301.0665588378906, "loss": 0.1211, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5320097208023071, "rewards/margins": 5.225158214569092, "rewards/rejected": -5.757167339324951, "step": 2720 }, { "epoch": 1.4285714285714286, "grad_norm": 70.07105032870118, "learning_rate": 3.137606921404191e-07, "logits/chosen": -2.8750851154327393, "logits/rejected": -2.8132991790771484, "logps/chosen": -278.92315673828125, "logps/rejected": -272.9971923828125, "loss": 0.1503, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.8762138485908508, "rewards/margins": 4.639349460601807, "rewards/rejected": -5.515563011169434, "step": 2730 }, { "epoch": 1.433804290947148, "grad_norm": 35.488292288790966, "learning_rate": 3.1228748014084243e-07, "logits/chosen": -2.7600841522216797, "logits/rejected": -2.734055280685425, "logps/chosen": -281.13623046875, "logps/rejected": -294.22857666015625, "loss": 0.1084, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.576066792011261, "rewards/margins": 4.5645751953125, "rewards/rejected": -5.140642166137695, "step": 2740 }, { "epoch": 1.4390371533228676, "grad_norm": 8.524775376088275, "learning_rate": 3.108119583746005e-07, "logits/chosen": -2.8350164890289307, "logits/rejected": -2.825906753540039, "logps/chosen": -229.21047973632812, "logps/rejected": -290.7459716796875, "loss": 0.1212, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.04539037495851517, "rewards/margins": 5.1288251876831055, "rewards/rejected": -5.174216270446777, "step": 2750 }, { "epoch": 1.4442700156985873, "grad_norm": 26.98649449101546, "learning_rate": 3.093341815575202e-07, "logits/chosen": -2.8958609104156494, "logits/rejected": -2.82767915725708, "logps/chosen": -269.95672607421875, "logps/rejected": -245.73471069335938, "loss": 0.0853, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.39714497327804565, "rewards/margins": 4.872426986694336, "rewards/rejected": -5.269571781158447, "step": 2760 }, { "epoch": 1.4495028780743067, "grad_norm": 13.857584693435655, "learning_rate": 3.078542044890513e-07, "logits/chosen": -2.9566783905029297, "logits/rejected": -2.816641330718994, "logps/chosen": -321.45062255859375, "logps/rejected": -345.4382629394531, "loss": 0.1073, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4065060019493103, "rewards/margins": 5.966509819030762, "rewards/rejected": -6.373015403747559, "step": 2770 }, { "epoch": 1.454735740450026, "grad_norm": 22.158288946250593, "learning_rate": 3.0637208205023386e-07, "logits/chosen": -3.006734848022461, "logits/rejected": -2.8882689476013184, "logps/chosen": -303.35723876953125, "logps/rejected": -281.02789306640625, "loss": 0.1198, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.48174524307250977, "rewards/margins": 4.43752908706665, "rewards/rejected": -4.91927433013916, "step": 2780 }, { "epoch": 1.4599686028257457, "grad_norm": 16.35581141301091, "learning_rate": 3.0488786920166343e-07, "logits/chosen": -2.906308650970459, "logits/rejected": -2.948639392852783, "logps/chosen": -296.57342529296875, "logps/rejected": -368.0602722167969, "loss": 0.0851, "rewards/accuracies": 1.0, "rewards/chosen": -0.42739176750183105, "rewards/margins": 5.575040817260742, "rewards/rejected": -6.002431869506836, "step": 2790 }, { "epoch": 1.4652014652014653, "grad_norm": 36.482679085834555, "learning_rate": 3.034016209814529e-07, "logits/chosen": -2.930183172225952, "logits/rejected": -2.895362615585327, "logps/chosen": -260.2642517089844, "logps/rejected": -305.01275634765625, "loss": 0.116, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7408644556999207, "rewards/margins": 5.248217582702637, "rewards/rejected": -5.989082336425781, "step": 2800 }, { "epoch": 1.4704343275771847, "grad_norm": 23.697514691581986, "learning_rate": 3.0191339250319147e-07, "logits/chosen": -2.9409375190734863, "logits/rejected": -2.9658925533294678, "logps/chosen": -279.1907958984375, "logps/rejected": -357.61981201171875, "loss": 0.0702, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.04058970883488655, "rewards/margins": 5.941542625427246, "rewards/rejected": -5.9821319580078125, "step": 2810 }, { "epoch": 1.4756671899529041, "grad_norm": 11.042589224075924, "learning_rate": 3.004232389539011e-07, "logits/chosen": -2.992539405822754, "logits/rejected": -2.960721492767334, "logps/chosen": -268.1466979980469, "logps/rejected": -323.2642517089844, "loss": 0.0767, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4720631539821625, "rewards/margins": 6.277966499328613, "rewards/rejected": -6.750029563903809, "step": 2820 }, { "epoch": 1.4809000523286238, "grad_norm": 6.476873082677127, "learning_rate": 2.989312155919898e-07, "logits/chosen": -2.938168525695801, "logits/rejected": -2.870612859725952, "logps/chosen": -256.95855712890625, "logps/rejected": -326.80523681640625, "loss": 0.0788, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5215288996696472, "rewards/margins": 5.030642509460449, "rewards/rejected": -5.552170753479004, "step": 2830 }, { "epoch": 1.4861329147043434, "grad_norm": 26.81568647520585, "learning_rate": 2.9743737774520266e-07, "logits/chosen": -2.958479881286621, "logits/rejected": -2.932797908782959, "logps/chosen": -266.9199523925781, "logps/rejected": -320.0476379394531, "loss": 0.1228, "rewards/accuracies": 1.0, "rewards/chosen": -0.05112558603286743, "rewards/margins": 6.116800308227539, "rewards/rejected": -6.167925834655762, "step": 2840 }, { "epoch": 1.4913657770800628, "grad_norm": 21.719580832629315, "learning_rate": 2.959417808085702e-07, "logits/chosen": -2.9039673805236816, "logits/rejected": -2.9286551475524902, "logps/chosen": -220.92080688476562, "logps/rejected": -270.14959716796875, "loss": 0.0997, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9624674916267395, "rewards/margins": 5.130197048187256, "rewards/rejected": -6.09266471862793, "step": 2850 }, { "epoch": 1.4965986394557822, "grad_norm": 30.21390770999237, "learning_rate": 2.944444802423542e-07, "logits/chosen": -3.0419020652770996, "logits/rejected": -2.9683642387390137, "logps/chosen": -308.25799560546875, "logps/rejected": -375.5798645019531, "loss": 0.1022, "rewards/accuracies": 1.0, "rewards/chosen": -0.23521308600902557, "rewards/margins": 6.550291538238525, "rewards/rejected": -6.7855048179626465, "step": 2860 }, { "epoch": 1.5018315018315018, "grad_norm": 21.882979849226242, "learning_rate": 2.929455315699908e-07, "logits/chosen": -2.9220502376556396, "logits/rejected": -2.7800869941711426, "logps/chosen": -300.79278564453125, "logps/rejected": -354.15301513671875, "loss": 0.0972, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.29402869939804077, "rewards/margins": 6.519808769226074, "rewards/rejected": -6.81383752822876, "step": 2870 }, { "epoch": 1.5070643642072215, "grad_norm": 31.653884418139643, "learning_rate": 2.9144499037603204e-07, "logits/chosen": -3.00227689743042, "logits/rejected": -2.9132020473480225, "logps/chosen": -255.8629913330078, "logps/rejected": -290.1116943359375, "loss": 0.1057, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8617574572563171, "rewards/margins": 5.381239414215088, "rewards/rejected": -6.242997169494629, "step": 2880 }, { "epoch": 1.5122972265829409, "grad_norm": 34.849118269582725, "learning_rate": 2.899429123040843e-07, "logits/chosen": -3.0018563270568848, "logits/rejected": -2.970979928970337, "logps/chosen": -255.91732788085938, "logps/rejected": -318.36529541015625, "loss": 0.0944, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9821944236755371, "rewards/margins": 5.042538642883301, "rewards/rejected": -6.02473258972168, "step": 2890 }, { "epoch": 1.5175300889586603, "grad_norm": 46.54071762632803, "learning_rate": 2.884393530547452e-07, "logits/chosen": -3.0814383029937744, "logits/rejected": -2.992365837097168, "logps/chosen": -294.671630859375, "logps/rejected": -331.76922607421875, "loss": 0.0997, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.21671578288078308, "rewards/margins": 5.064117431640625, "rewards/rejected": -5.280832290649414, "step": 2900 }, { "epoch": 1.5227629513343799, "grad_norm": 26.870441053208463, "learning_rate": 2.869343683835376e-07, "logits/chosen": -2.988356113433838, "logits/rejected": -2.904080867767334, "logps/chosen": -237.4556427001953, "logps/rejected": -339.13824462890625, "loss": 0.1059, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7436215877532959, "rewards/margins": 5.817732334136963, "rewards/rejected": -6.5613532066345215, "step": 2910 }, { "epoch": 1.5279958137100995, "grad_norm": 25.170930077929835, "learning_rate": 2.8542801409884253e-07, "logits/chosen": -3.030796527862549, "logits/rejected": -2.96657657623291, "logps/chosen": -323.8079833984375, "logps/rejected": -369.0687561035156, "loss": 0.0628, "rewards/accuracies": 0.875, "rewards/chosen": -0.8993136286735535, "rewards/margins": 4.974839210510254, "rewards/rejected": -5.874153137207031, "step": 2920 }, { "epoch": 1.533228676085819, "grad_norm": 39.54768296664164, "learning_rate": 2.839203460598297e-07, "logits/chosen": -3.0432751178741455, "logits/rejected": -3.0220627784729004, "logps/chosen": -344.61285400390625, "logps/rejected": -379.151123046875, "loss": 0.1097, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7961816787719727, "rewards/margins": 5.541321277618408, "rewards/rejected": -6.337502479553223, "step": 2930 }, { "epoch": 1.5384615384615383, "grad_norm": 25.530340291257225, "learning_rate": 2.8241142017438557e-07, "logits/chosen": -3.0318961143493652, "logits/rejected": -3.0056843757629395, "logps/chosen": -310.2491149902344, "logps/rejected": -329.64678955078125, "loss": 0.0977, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0655846819281578, "rewards/margins": 5.999459266662598, "rewards/rejected": -6.0650434494018555, "step": 2940 }, { "epoch": 1.543694400837258, "grad_norm": 41.24978735485262, "learning_rate": 2.8090129239704083e-07, "logits/chosen": -2.983940601348877, "logits/rejected": -2.8969976902008057, "logps/chosen": -299.1850891113281, "logps/rejected": -264.98480224609375, "loss": 0.1346, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.2126175165176392, "rewards/margins": 4.951140403747559, "rewards/rejected": -6.16375732421875, "step": 2950 }, { "epoch": 1.5489272632129776, "grad_norm": 17.27780090415538, "learning_rate": 2.7939001872689496e-07, "logits/chosen": -2.9182181358337402, "logits/rejected": -2.8673245906829834, "logps/chosen": -207.96743774414062, "logps/rejected": -240.6229248046875, "loss": 0.0981, "rewards/accuracies": 0.875, "rewards/chosen": -1.0927234888076782, "rewards/margins": 4.25023889541626, "rewards/rejected": -5.342962741851807, "step": 2960 }, { "epoch": 1.554160125588697, "grad_norm": 50.52945720142169, "learning_rate": 2.778776552055398e-07, "logits/chosen": -2.9225635528564453, "logits/rejected": -2.7944629192352295, "logps/chosen": -299.3099060058594, "logps/rejected": -305.2068176269531, "loss": 0.088, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8133533596992493, "rewards/margins": 5.068005561828613, "rewards/rejected": -5.8813581466674805, "step": 2970 }, { "epoch": 1.5593929879644164, "grad_norm": 23.300585208783655, "learning_rate": 2.763642579149817e-07, "logits/chosen": -2.876394033432007, "logits/rejected": -2.867790699005127, "logps/chosen": -245.3839874267578, "logps/rejected": -303.235595703125, "loss": 0.0967, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5917602777481079, "rewards/margins": 4.809393405914307, "rewards/rejected": -5.401154041290283, "step": 2980 }, { "epoch": 1.564625850340136, "grad_norm": 56.539161921429496, "learning_rate": 2.748498829755615e-07, "logits/chosen": -2.895934581756592, "logits/rejected": -2.854820728302002, "logps/chosen": -255.66207885742188, "logps/rejected": -362.1599426269531, "loss": 0.0778, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.18918538093566895, "rewards/margins": 6.055611610412598, "rewards/rejected": -6.2447967529296875, "step": 2990 }, { "epoch": 1.5698587127158556, "grad_norm": 14.290114332565253, "learning_rate": 2.7333458654387344e-07, "logits/chosen": -2.9630067348480225, "logits/rejected": -2.9416165351867676, "logps/chosen": -293.8984375, "logps/rejected": -316.8224182128906, "loss": 0.0821, "rewards/accuracies": 1.0, "rewards/chosen": -0.13744406402111053, "rewards/margins": 5.585749626159668, "rewards/rejected": -5.723193168640137, "step": 3000 }, { "epoch": 1.575091575091575, "grad_norm": 21.14570422957965, "learning_rate": 2.718184248106828e-07, "logits/chosen": -3.051680326461792, "logits/rejected": -2.9450526237487793, "logps/chosen": -331.4350891113281, "logps/rejected": -368.21636962890625, "loss": 0.0831, "rewards/accuracies": 1.0, "rewards/chosen": -0.11088670790195465, "rewards/margins": 6.121740341186523, "rewards/rejected": -6.2326273918151855, "step": 3010 }, { "epoch": 1.5803244374672945, "grad_norm": 36.17733868879991, "learning_rate": 2.7030145399884275e-07, "logits/chosen": -2.9764156341552734, "logits/rejected": -2.8698744773864746, "logps/chosen": -359.0478515625, "logps/rejected": -355.5354309082031, "loss": 0.094, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.811800479888916, "rewards/margins": 5.112343788146973, "rewards/rejected": -5.9241437911987305, "step": 3020 }, { "epoch": 1.585557299843014, "grad_norm": 19.375449056619342, "learning_rate": 2.687837303612085e-07, "logits/chosen": -3.025543451309204, "logits/rejected": -2.9200291633605957, "logps/chosen": -338.224853515625, "logps/rejected": -366.6781311035156, "loss": 0.1101, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4581005573272705, "rewards/margins": 6.052212715148926, "rewards/rejected": -6.510312557220459, "step": 3030 }, { "epoch": 1.5907901622187337, "grad_norm": 40.570488375342656, "learning_rate": 2.672653101785519e-07, "logits/chosen": -2.8993771076202393, "logits/rejected": -2.878880262374878, "logps/chosen": -298.4730224609375, "logps/rejected": -344.69573974609375, "loss": 0.0846, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.36513882875442505, "rewards/margins": 5.7705979347229, "rewards/rejected": -6.13573694229126, "step": 3040 }, { "epoch": 1.5960230245944533, "grad_norm": 33.93109224773888, "learning_rate": 2.657462497574747e-07, "logits/chosen": -2.992415428161621, "logits/rejected": -2.9906678199768066, "logps/chosen": -233.6267547607422, "logps/rejected": -278.3908996582031, "loss": 0.0737, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.672626793384552, "rewards/margins": 4.652403831481934, "rewards/rejected": -5.32503080368042, "step": 3050 }, { "epoch": 1.6012558869701727, "grad_norm": 7.079706616490647, "learning_rate": 2.642266054283198e-07, "logits/chosen": -3.0305943489074707, "logits/rejected": -2.868263006210327, "logps/chosen": -357.4576721191406, "logps/rejected": -280.21820068359375, "loss": 0.087, "rewards/accuracies": 1.0, "rewards/chosen": -0.14905306696891785, "rewards/margins": 6.067292213439941, "rewards/rejected": -6.216345310211182, "step": 3060 }, { "epoch": 1.6064887493458921, "grad_norm": 25.727932089626385, "learning_rate": 2.627064335430829e-07, "logits/chosen": -2.9717965126037598, "logits/rejected": -2.867175579071045, "logps/chosen": -312.16046142578125, "logps/rejected": -330.35260009765625, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": -0.513081967830658, "rewards/margins": 5.674849987030029, "rewards/rejected": -6.187932014465332, "step": 3070 }, { "epoch": 1.6117216117216118, "grad_norm": 42.05611133530728, "learning_rate": 2.611857904733227e-07, "logits/chosen": -2.963825225830078, "logits/rejected": -2.8314619064331055, "logps/chosen": -303.1193542480469, "logps/rejected": -297.5630798339844, "loss": 0.0952, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9275169372558594, "rewards/margins": 5.41394567489624, "rewards/rejected": -6.341462135314941, "step": 3080 }, { "epoch": 1.6169544740973314, "grad_norm": 27.310098572105726, "learning_rate": 2.5966473260807076e-07, "logits/chosen": -3.0115184783935547, "logits/rejected": -2.9206669330596924, "logps/chosen": -342.8286437988281, "logps/rejected": -386.7396545410156, "loss": 0.0783, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.08905807882547379, "rewards/margins": 6.670069217681885, "rewards/rejected": -6.759127140045166, "step": 3090 }, { "epoch": 1.6221873364730508, "grad_norm": 32.491733301952486, "learning_rate": 2.5814331635173987e-07, "logits/chosen": -2.9356818199157715, "logits/rejected": -2.8898653984069824, "logps/chosen": -302.5807189941406, "logps/rejected": -346.0173034667969, "loss": 0.1289, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.47722846269607544, "rewards/margins": 4.573758125305176, "rewards/rejected": -5.050986289978027, "step": 3100 }, { "epoch": 1.6274201988487702, "grad_norm": 49.53738312565812, "learning_rate": 2.566215981220331e-07, "logits/chosen": -2.8828892707824707, "logits/rejected": -2.8117661476135254, "logps/chosen": -302.1572265625, "logps/rejected": -353.1871643066406, "loss": 0.1078, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1469515562057495, "rewards/margins": 4.853428840637207, "rewards/rejected": -6.000380516052246, "step": 3110 }, { "epoch": 1.6326530612244898, "grad_norm": 32.702914519446885, "learning_rate": 2.550996343478514e-07, "logits/chosen": -2.9141077995300293, "logits/rejected": -2.8881983757019043, "logps/chosen": -295.5565490722656, "logps/rejected": -338.3243103027344, "loss": 0.1177, "rewards/accuracies": 1.0, "rewards/chosen": -0.19302958250045776, "rewards/margins": 6.2570013999938965, "rewards/rejected": -6.450031280517578, "step": 3120 }, { "epoch": 1.6378859236002095, "grad_norm": 24.077426211372806, "learning_rate": 2.5357748146720076e-07, "logits/chosen": -2.914943218231201, "logits/rejected": -2.7965197563171387, "logps/chosen": -205.087646484375, "logps/rejected": -254.46963500976562, "loss": 0.0789, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7729593515396118, "rewards/margins": 4.912353515625, "rewards/rejected": -5.6853132247924805, "step": 3130 }, { "epoch": 1.6431187859759289, "grad_norm": 28.91850929895344, "learning_rate": 2.5205519592509993e-07, "logits/chosen": -2.9161150455474854, "logits/rejected": -2.8340632915496826, "logps/chosen": -267.1830749511719, "logps/rejected": -312.8226623535156, "loss": 0.1023, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8677878379821777, "rewards/margins": 5.543097972869873, "rewards/rejected": -6.410886287689209, "step": 3140 }, { "epoch": 1.6483516483516483, "grad_norm": 29.871922217911525, "learning_rate": 2.505328341714873e-07, "logits/chosen": -3.0294668674468994, "logits/rejected": -2.877382755279541, "logps/chosen": -311.2955322265625, "logps/rejected": -339.3755798339844, "loss": 0.0842, "rewards/accuracies": 1.0, "rewards/chosen": -0.32007771730422974, "rewards/margins": 6.197929382324219, "rewards/rejected": -6.518006801605225, "step": 3150 }, { "epoch": 1.653584510727368, "grad_norm": 32.499835233179134, "learning_rate": 2.4901045265912687e-07, "logits/chosen": -2.9892146587371826, "logits/rejected": -2.9267399311065674, "logps/chosen": -307.1473693847656, "logps/rejected": -369.3219299316406, "loss": 0.0978, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.25305670499801636, "rewards/margins": 6.500479698181152, "rewards/rejected": -6.753536224365234, "step": 3160 }, { "epoch": 1.6588173731030875, "grad_norm": 14.677736448239816, "learning_rate": 2.4748810784151555e-07, "logits/chosen": -2.9640157222747803, "logits/rejected": -2.928230047225952, "logps/chosen": -325.80535888671875, "logps/rejected": -312.01239013671875, "loss": 0.0769, "rewards/accuracies": 1.0, "rewards/chosen": -0.8766148686408997, "rewards/margins": 5.40153169631958, "rewards/rejected": -6.278146266937256, "step": 3170 }, { "epoch": 1.664050235478807, "grad_norm": 49.30642278846383, "learning_rate": 2.459658561707898e-07, "logits/chosen": -2.9555201530456543, "logits/rejected": -2.8968873023986816, "logps/chosen": -311.941650390625, "logps/rejected": -355.16729736328125, "loss": 0.0993, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7644907236099243, "rewards/margins": 5.166899681091309, "rewards/rejected": -5.931390285491943, "step": 3180 }, { "epoch": 1.6692830978545263, "grad_norm": 17.062819227980665, "learning_rate": 2.4444375409563145e-07, "logits/chosen": -2.9882798194885254, "logits/rejected": -2.884182929992676, "logps/chosen": -307.93035888671875, "logps/rejected": -334.9039611816406, "loss": 0.0804, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0185712575912476, "rewards/margins": 5.780345916748047, "rewards/rejected": -6.798916816711426, "step": 3190 }, { "epoch": 1.674515960230246, "grad_norm": 42.281130152900296, "learning_rate": 2.429218580591753e-07, "logits/chosen": -2.8452441692352295, "logits/rejected": -2.7602384090423584, "logps/chosen": -320.68359375, "logps/rejected": -298.3215026855469, "loss": 0.1154, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5944861173629761, "rewards/margins": 6.031280517578125, "rewards/rejected": -6.625766754150391, "step": 3200 }, { "epoch": 1.6797488226059656, "grad_norm": 54.203353531756704, "learning_rate": 2.414002244969158e-07, "logits/chosen": -2.8465254306793213, "logits/rejected": -2.816558361053467, "logps/chosen": -283.3945007324219, "logps/rejected": -333.31781005859375, "loss": 0.0977, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3947932720184326, "rewards/margins": 6.270691871643066, "rewards/rejected": -7.665485382080078, "step": 3210 }, { "epoch": 1.684981684981685, "grad_norm": 20.98359357019587, "learning_rate": 2.3987890983461403e-07, "logits/chosen": -2.9846692085266113, "logits/rejected": -2.8959670066833496, "logps/chosen": -313.11627197265625, "logps/rejected": -381.42254638671875, "loss": 0.0979, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8697175979614258, "rewards/margins": 5.869936943054199, "rewards/rejected": -6.739654541015625, "step": 3220 }, { "epoch": 1.6902145473574044, "grad_norm": 21.207181568501465, "learning_rate": 2.3835797048620564e-07, "logits/chosen": -2.9960780143737793, "logits/rejected": -2.9413774013519287, "logps/chosen": -286.8423156738281, "logps/rejected": -302.71209716796875, "loss": 0.1029, "rewards/accuracies": 1.0, "rewards/chosen": -0.8179828524589539, "rewards/margins": 5.232959270477295, "rewards/rejected": -6.050942420959473, "step": 3230 }, { "epoch": 1.695447409733124, "grad_norm": 47.05738660280381, "learning_rate": 2.368374628517088e-07, "logits/chosen": -2.837822675704956, "logits/rejected": -2.7839956283569336, "logps/chosen": -289.78485107421875, "logps/rejected": -316.15740966796875, "loss": 0.115, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6605669856071472, "rewards/margins": 5.9773759841918945, "rewards/rejected": -6.637942314147949, "step": 3240 }, { "epoch": 1.7006802721088436, "grad_norm": 7.618964512195328, "learning_rate": 2.3531744331513247e-07, "logits/chosen": -2.8787481784820557, "logits/rejected": -2.899895191192627, "logps/chosen": -238.6513214111328, "logps/rejected": -306.4695739746094, "loss": 0.0865, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7578551173210144, "rewards/margins": 5.690915584564209, "rewards/rejected": -6.448770046234131, "step": 3250 }, { "epoch": 1.705913134484563, "grad_norm": 69.86134605409512, "learning_rate": 2.3379796824238608e-07, "logits/chosen": -2.8772690296173096, "logits/rejected": -2.8589038848876953, "logps/chosen": -229.28482055664062, "logps/rejected": -260.36181640625, "loss": 0.1449, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.6403954029083252, "rewards/margins": 4.701403617858887, "rewards/rejected": -6.341798305511475, "step": 3260 }, { "epoch": 1.7111459968602825, "grad_norm": 16.142153894827, "learning_rate": 2.3227909397918894e-07, "logits/chosen": -3.0986921787261963, "logits/rejected": -3.0192179679870605, "logps/chosen": -329.4295654296875, "logps/rejected": -384.4919128417969, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": -0.39285629987716675, "rewards/margins": 6.682240962982178, "rewards/rejected": -7.07509708404541, "step": 3270 }, { "epoch": 1.716378859236002, "grad_norm": 33.00479175346524, "learning_rate": 2.3076087684898076e-07, "logits/chosen": -2.964064836502075, "logits/rejected": -2.8627841472625732, "logps/chosen": -271.48858642578125, "logps/rejected": -330.54461669921875, "loss": 0.1073, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9865821003913879, "rewards/margins": 5.312698841094971, "rewards/rejected": -6.299281120300293, "step": 3280 }, { "epoch": 1.7216117216117217, "grad_norm": 57.26343974965987, "learning_rate": 2.2924337315083353e-07, "logits/chosen": -2.988205909729004, "logits/rejected": -2.886157274246216, "logps/chosen": -368.3540954589844, "logps/rejected": -380.71881103515625, "loss": 0.0731, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3264088034629822, "rewards/margins": 5.783016681671143, "rewards/rejected": -6.1094255447387695, "step": 3290 }, { "epoch": 1.7268445839874411, "grad_norm": 31.206598153415626, "learning_rate": 2.277266391573633e-07, "logits/chosen": -2.967082977294922, "logits/rejected": -2.9385037422180176, "logps/chosen": -325.8502502441406, "logps/rejected": -333.3088684082031, "loss": 0.0828, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5762299299240112, "rewards/margins": 7.629581451416016, "rewards/rejected": -7.053351402282715, "step": 3300 }, { "epoch": 1.7320774463631605, "grad_norm": 64.27978297954681, "learning_rate": 2.2621073111264357e-07, "logits/chosen": -2.851121664047241, "logits/rejected": -2.8555855751037598, "logps/chosen": -268.4470520019531, "logps/rejected": -283.6726379394531, "loss": 0.0921, "rewards/accuracies": 1.0, "rewards/chosen": -0.6114974021911621, "rewards/margins": 5.214678764343262, "rewards/rejected": -5.826176643371582, "step": 3310 }, { "epoch": 1.7373103087388801, "grad_norm": 26.02484170235342, "learning_rate": 2.2469570523011993e-07, "logits/chosen": -2.885988235473633, "logits/rejected": -2.895848274230957, "logps/chosen": -264.5588684082031, "logps/rejected": -316.19061279296875, "loss": 0.0937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.045731544494629, "rewards/margins": 5.0829010009765625, "rewards/rejected": -6.128632545471191, "step": 3320 }, { "epoch": 1.7425431711145998, "grad_norm": 4.750316064133009, "learning_rate": 2.2318161769052525e-07, "logits/chosen": -2.949136257171631, "logits/rejected": -2.8640103340148926, "logps/chosen": -271.70391845703125, "logps/rejected": -336.8128356933594, "loss": 0.1005, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8743613958358765, "rewards/margins": 5.895020484924316, "rewards/rejected": -6.769381046295166, "step": 3330 }, { "epoch": 1.7477760334903192, "grad_norm": 11.805070754116622, "learning_rate": 2.2166852463979624e-07, "logits/chosen": -2.8649890422821045, "logits/rejected": -2.790555953979492, "logps/chosen": -262.340576171875, "logps/rejected": -276.14508056640625, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": -0.8610191345214844, "rewards/margins": 5.371983051300049, "rewards/rejected": -6.233001708984375, "step": 3340 }, { "epoch": 1.7530088958660386, "grad_norm": 13.21397712792492, "learning_rate": 2.20156482186992e-07, "logits/chosen": -2.890495777130127, "logits/rejected": -2.8718886375427246, "logps/chosen": -272.59881591796875, "logps/rejected": -327.79229736328125, "loss": 0.1008, "rewards/accuracies": 1.0, "rewards/chosen": -0.6845839619636536, "rewards/margins": 5.636344909667969, "rewards/rejected": -6.320928573608398, "step": 3350 }, { "epoch": 1.7582417582417582, "grad_norm": 20.12743667827297, "learning_rate": 2.1864554640221244e-07, "logits/chosen": -2.807590961456299, "logits/rejected": -2.851696491241455, "logps/chosen": -211.4408721923828, "logps/rejected": -335.7973937988281, "loss": 0.0949, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2595535516738892, "rewards/margins": 6.21151065826416, "rewards/rejected": -7.471064567565918, "step": 3360 }, { "epoch": 1.7634746206174778, "grad_norm": 8.735385443022608, "learning_rate": 2.1713577331452016e-07, "logits/chosen": -3.0013585090637207, "logits/rejected": -2.9123120307922363, "logps/chosen": -275.77886962890625, "logps/rejected": -291.35003662109375, "loss": 0.0866, "rewards/accuracies": 1.0, "rewards/chosen": -0.6517156958580017, "rewards/margins": 5.467103958129883, "rewards/rejected": -6.1188201904296875, "step": 3370 }, { "epoch": 1.7687074829931972, "grad_norm": 40.041296329924606, "learning_rate": 2.1562721890986199e-07, "logits/chosen": -2.887263774871826, "logits/rejected": -2.7689013481140137, "logps/chosen": -252.7852325439453, "logps/rejected": -268.9713439941406, "loss": 0.0958, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.098160982131958, "rewards/margins": 5.5457763671875, "rewards/rejected": -6.643937110900879, "step": 3380 }, { "epoch": 1.7739403453689166, "grad_norm": 8.039764847967916, "learning_rate": 2.1411993912899285e-07, "logits/chosen": -2.889521837234497, "logits/rejected": -2.9703779220581055, "logps/chosen": -252.558349609375, "logps/rejected": -386.41705322265625, "loss": 0.0995, "rewards/accuracies": 1.0, "rewards/chosen": -0.9323278665542603, "rewards/margins": 5.271166801452637, "rewards/rejected": -6.203495025634766, "step": 3390 }, { "epoch": 1.7791732077446363, "grad_norm": 39.24923233977659, "learning_rate": 2.126139898654021e-07, "logits/chosen": -2.883772611618042, "logits/rejected": -2.8640620708465576, "logps/chosen": -246.9475555419922, "logps/rejected": -304.64251708984375, "loss": 0.0988, "rewards/accuracies": 1.0, "rewards/chosen": -1.5902787446975708, "rewards/margins": 4.75054931640625, "rewards/rejected": -6.340827941894531, "step": 3400 }, { "epoch": 1.784406070120356, "grad_norm": 21.840692270294728, "learning_rate": 2.1110942696324012e-07, "logits/chosen": -3.0871424674987793, "logits/rejected": -3.006291151046753, "logps/chosen": -327.09423828125, "logps/rejected": -337.6763610839844, "loss": 0.1235, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5916340947151184, "rewards/margins": 5.287174701690674, "rewards/rejected": -5.878809928894043, "step": 3410 }, { "epoch": 1.7896389324960753, "grad_norm": 38.269884231852245, "learning_rate": 2.0960630621524762e-07, "logits/chosen": -2.8983089923858643, "logits/rejected": -2.8440237045288086, "logps/chosen": -318.80224609375, "logps/rejected": -290.29119873046875, "loss": 0.1027, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.36514362692832947, "rewards/margins": 6.031256675720215, "rewards/rejected": -6.396399974822998, "step": 3420 }, { "epoch": 1.7948717948717947, "grad_norm": 26.400177939321953, "learning_rate": 2.0810468336068697e-07, "logits/chosen": -2.9065942764282227, "logits/rejected": -2.9462809562683105, "logps/chosen": -249.7903289794922, "logps/rejected": -306.42303466796875, "loss": 0.0903, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.120842695236206, "rewards/margins": 5.753670692443848, "rewards/rejected": -6.874513149261475, "step": 3430 }, { "epoch": 1.8001046572475143, "grad_norm": 39.097512382355475, "learning_rate": 2.0660461408327535e-07, "logits/chosen": -3.0207207202911377, "logits/rejected": -2.9332714080810547, "logps/chosen": -302.96771240234375, "logps/rejected": -285.3617248535156, "loss": 0.0741, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5401545763015747, "rewards/margins": 5.207390308380127, "rewards/rejected": -5.747544765472412, "step": 3440 }, { "epoch": 1.805337519623234, "grad_norm": 15.60830504919868, "learning_rate": 2.0510615400911906e-07, "logits/chosen": -3.047731399536133, "logits/rejected": -2.9956910610198975, "logps/chosen": -280.5403747558594, "logps/rejected": -290.47357177734375, "loss": 0.1027, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.2553006410598755, "rewards/margins": 5.412297248840332, "rewards/rejected": -5.66759729385376, "step": 3450 }, { "epoch": 1.8105703819989536, "grad_norm": 28.338407000411944, "learning_rate": 2.0360935870465185e-07, "logits/chosen": -2.984647750854492, "logits/rejected": -2.8625540733337402, "logps/chosen": -340.5264892578125, "logps/rejected": -337.3761291503906, "loss": 0.0821, "rewards/accuracies": 1.0, "rewards/chosen": 0.17096763849258423, "rewards/margins": 5.962976455688477, "rewards/rejected": -5.792008399963379, "step": 3460 }, { "epoch": 1.815803244374673, "grad_norm": 55.714472737909716, "learning_rate": 2.021142836745739e-07, "logits/chosen": -2.9354255199432373, "logits/rejected": -2.861781358718872, "logps/chosen": -287.35028076171875, "logps/rejected": -312.83319091796875, "loss": 0.0997, "rewards/accuracies": 1.0, "rewards/chosen": -0.6262660622596741, "rewards/margins": 5.136072158813477, "rewards/rejected": -5.762337684631348, "step": 3470 }, { "epoch": 1.8210361067503924, "grad_norm": 45.02822639621488, "learning_rate": 2.0062098435979308e-07, "logits/chosen": -2.817512273788452, "logits/rejected": -2.794673204421997, "logps/chosen": -307.40716552734375, "logps/rejected": -294.8499450683594, "loss": 0.1064, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.026535153388977, "rewards/margins": 4.410679817199707, "rewards/rejected": -5.4372148513793945, "step": 3480 }, { "epoch": 1.826268969126112, "grad_norm": 12.449003154650825, "learning_rate": 1.9912951613536997e-07, "logits/chosen": -2.990734577178955, "logits/rejected": -2.887299060821533, "logps/chosen": -299.7632141113281, "logps/rejected": -290.27313232421875, "loss": 0.0878, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6649189591407776, "rewards/margins": 5.5867228507995605, "rewards/rejected": -6.25164270401001, "step": 3490 }, { "epoch": 1.8315018315018317, "grad_norm": 45.474732124176825, "learning_rate": 1.9763993430846392e-07, "logits/chosen": -3.0048184394836426, "logits/rejected": -2.83622145652771, "logps/chosen": -274.488037109375, "logps/rejected": -251.3792266845703, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": -0.8459029197692871, "rewards/margins": 5.043917655944824, "rewards/rejected": -5.889820575714111, "step": 3500 }, { "epoch": 1.836734693877551, "grad_norm": 10.492545544088514, "learning_rate": 1.9615229411628212e-07, "logits/chosen": -2.8946971893310547, "logits/rejected": -2.877978801727295, "logps/chosen": -210.0872344970703, "logps/rejected": -320.6335754394531, "loss": 0.0876, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1554878950119019, "rewards/margins": 5.252488136291504, "rewards/rejected": -6.4079766273498535, "step": 3510 }, { "epoch": 1.8419675562532705, "grad_norm": 12.96450345293325, "learning_rate": 1.946666507240314e-07, "logits/chosen": -2.9569857120513916, "logits/rejected": -2.902897357940674, "logps/chosen": -319.3989562988281, "logps/rejected": -347.49310302734375, "loss": 0.0941, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.141270399093628, "rewards/margins": 5.037647247314453, "rewards/rejected": -6.17891788482666, "step": 3520 }, { "epoch": 1.84720041862899, "grad_norm": 9.856710789511736, "learning_rate": 1.9318305922287268e-07, "logits/chosen": -2.910277843475342, "logits/rejected": -2.9003472328186035, "logps/chosen": -270.324462890625, "logps/rejected": -300.55841064453125, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": -0.6976989507675171, "rewards/margins": 5.929324626922607, "rewards/rejected": -6.627023220062256, "step": 3530 }, { "epoch": 1.8524332810047097, "grad_norm": 53.25569245574132, "learning_rate": 1.9170157462787762e-07, "logits/chosen": -2.995239019393921, "logits/rejected": -2.9091110229492188, "logps/chosen": -339.8304138183594, "logps/rejected": -309.0005798339844, "loss": 0.0986, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6631659269332886, "rewards/margins": 5.386892795562744, "rewards/rejected": -6.050058841705322, "step": 3540 }, { "epoch": 1.8576661433804291, "grad_norm": 19.41353091412009, "learning_rate": 1.902222518759891e-07, "logits/chosen": -3.0605530738830566, "logits/rejected": -2.9173171520233154, "logps/chosen": -360.20159912109375, "logps/rejected": -365.53131103515625, "loss": 0.1043, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4908387064933777, "rewards/margins": 5.567526817321777, "rewards/rejected": -6.058365821838379, "step": 3550 }, { "epoch": 1.8628990057561485, "grad_norm": 18.494349368581158, "learning_rate": 1.8874514582398368e-07, "logits/chosen": -2.947740316390991, "logits/rejected": -3.0022244453430176, "logps/chosen": -307.4051513671875, "logps/rejected": -347.0126037597656, "loss": 0.0843, "rewards/accuracies": 1.0, "rewards/chosen": -0.45441609621047974, "rewards/margins": 6.530531406402588, "rewards/rejected": -6.984947204589844, "step": 3560 }, { "epoch": 1.8681318681318682, "grad_norm": 14.39690947969377, "learning_rate": 1.8727031124643738e-07, "logits/chosen": -2.9303033351898193, "logits/rejected": -2.887834310531616, "logps/chosen": -233.7585906982422, "logps/rejected": -279.4310607910156, "loss": 0.079, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5730754137039185, "rewards/margins": 5.65393590927124, "rewards/rejected": -6.227011680603027, "step": 3570 }, { "epoch": 1.8733647305075878, "grad_norm": 26.14289315745987, "learning_rate": 1.8579780283369472e-07, "logits/chosen": -2.8734116554260254, "logits/rejected": -2.7617557048797607, "logps/chosen": -296.60650634765625, "logps/rejected": -271.07000732421875, "loss": 0.092, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.035385251045227, "rewards/margins": 4.872550964355469, "rewards/rejected": -5.9079365730285645, "step": 3580 }, { "epoch": 1.8785975928833072, "grad_norm": 14.735569299862146, "learning_rate": 1.8432767518984043e-07, "logits/chosen": -2.905574321746826, "logits/rejected": -2.8450348377227783, "logps/chosen": -301.3106994628906, "logps/rejected": -317.70989990234375, "loss": 0.1015, "rewards/accuracies": 1.0, "rewards/chosen": -0.8563224673271179, "rewards/margins": 5.4183349609375, "rewards/rejected": -6.274657249450684, "step": 3590 }, { "epoch": 1.8838304552590266, "grad_norm": 73.3213969659901, "learning_rate": 1.8285998283067478e-07, "logits/chosen": -3.000392436981201, "logits/rejected": -2.944305181503296, "logps/chosen": -271.4041748046875, "logps/rejected": -306.3927917480469, "loss": 0.1057, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7195259928703308, "rewards/margins": 6.0872602462768555, "rewards/rejected": -6.80678653717041, "step": 3600 }, { "epoch": 1.8890633176347462, "grad_norm": 48.843940004650975, "learning_rate": 1.8139478018169197e-07, "logits/chosen": -2.92897367477417, "logits/rejected": -2.8922219276428223, "logps/chosen": -254.1933135986328, "logps/rejected": -288.48748779296875, "loss": 0.089, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1321945190429688, "rewards/margins": 5.379899024963379, "rewards/rejected": -6.512093544006348, "step": 3610 }, { "epoch": 1.8942961800104658, "grad_norm": 17.54415385107424, "learning_rate": 1.799321215760617e-07, "logits/chosen": -2.925396203994751, "logits/rejected": -2.9108407497406006, "logps/chosen": -272.95965576171875, "logps/rejected": -288.2476501464844, "loss": 0.1046, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4124714136123657, "rewards/margins": 5.292130470275879, "rewards/rejected": -6.704601287841797, "step": 3620 }, { "epoch": 1.8995290423861853, "grad_norm": 34.98733154614837, "learning_rate": 1.7847206125261476e-07, "logits/chosen": -2.900238513946533, "logits/rejected": -2.906183958053589, "logps/chosen": -241.2446746826172, "logps/rejected": -275.1705627441406, "loss": 0.1103, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0103797912597656, "rewards/margins": 5.282234191894531, "rewards/rejected": -6.292613983154297, "step": 3630 }, { "epoch": 1.9047619047619047, "grad_norm": 38.61903114436829, "learning_rate": 1.7701465335383148e-07, "logits/chosen": -2.9888854026794434, "logits/rejected": -2.885939836502075, "logps/chosen": -283.04052734375, "logps/rejected": -282.02044677734375, "loss": 0.1054, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.103050947189331, "rewards/margins": 4.716671466827393, "rewards/rejected": -5.819721698760986, "step": 3640 }, { "epoch": 1.9099947671376243, "grad_norm": 38.76806910264381, "learning_rate": 1.7555995192383377e-07, "logits/chosen": -2.9130234718322754, "logits/rejected": -2.9830563068389893, "logps/chosen": -248.6520233154297, "logps/rejected": -423.2144470214844, "loss": 0.0936, "rewards/accuracies": 1.0, "rewards/chosen": -0.5841989517211914, "rewards/margins": 6.109734535217285, "rewards/rejected": -6.693933963775635, "step": 3650 }, { "epoch": 1.915227629513344, "grad_norm": 43.85723942564594, "learning_rate": 1.7410801090638166e-07, "logits/chosen": -2.953160285949707, "logits/rejected": -2.899895429611206, "logps/chosen": -305.0296325683594, "logps/rejected": -312.41845703125, "loss": 0.106, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6891551613807678, "rewards/margins": 5.613497734069824, "rewards/rejected": -6.3026533126831055, "step": 3660 }, { "epoch": 1.9204604918890633, "grad_norm": 18.240536529525468, "learning_rate": 1.7265888414287245e-07, "logits/chosen": -2.9555230140686035, "logits/rejected": -2.9417948722839355, "logps/chosen": -288.87841796875, "logps/rejected": -326.5533752441406, "loss": 0.0955, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6330430507659912, "rewards/margins": 6.530220985412598, "rewards/rejected": -7.163262844085693, "step": 3670 }, { "epoch": 1.9256933542647827, "grad_norm": 49.48937582379989, "learning_rate": 1.7121262537034396e-07, "logits/chosen": -3.0035765171051025, "logits/rejected": -2.9037609100341797, "logps/chosen": -313.08721923828125, "logps/rejected": -316.5696716308594, "loss": 0.1094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0678000450134277, "rewards/margins": 4.749475955963135, "rewards/rejected": -5.817276477813721, "step": 3680 }, { "epoch": 1.9309262166405023, "grad_norm": 33.578602411479636, "learning_rate": 1.697692882194826e-07, "logits/chosen": -2.8567192554473877, "logits/rejected": -2.8484914302825928, "logps/chosen": -229.86703491210938, "logps/rejected": -296.5517883300781, "loss": 0.0895, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9118927121162415, "rewards/margins": 5.025326251983643, "rewards/rejected": -5.93721866607666, "step": 3690 }, { "epoch": 1.936159079016222, "grad_norm": 32.31565122827746, "learning_rate": 1.6832892621263406e-07, "logits/chosen": -3.154453754425049, "logits/rejected": -3.0088248252868652, "logps/chosen": -345.74725341796875, "logps/rejected": -364.4647521972656, "loss": 0.1025, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.17001505196094513, "rewards/margins": 6.024691581726074, "rewards/rejected": -6.194705963134766, "step": 3700 }, { "epoch": 1.9413919413919414, "grad_norm": 15.161831349934443, "learning_rate": 1.668915927618183e-07, "logits/chosen": -2.9172425270080566, "logits/rejected": -2.9125847816467285, "logps/chosen": -216.3505096435547, "logps/rejected": -287.95440673828125, "loss": 0.0861, "rewards/accuracies": 1.0, "rewards/chosen": -0.781919538974762, "rewards/margins": 4.680075168609619, "rewards/rejected": -5.461994171142578, "step": 3710 }, { "epoch": 1.9466248037676608, "grad_norm": 14.158307101068146, "learning_rate": 1.6545734116674965e-07, "logits/chosen": -2.9995148181915283, "logits/rejected": -2.9643938541412354, "logps/chosen": -283.62249755859375, "logps/rejected": -275.6363830566406, "loss": 0.1, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.16857536137104034, "rewards/margins": 6.3334760665893555, "rewards/rejected": -6.164901256561279, "step": 3720 }, { "epoch": 1.9518576661433804, "grad_norm": 30.142917325257113, "learning_rate": 1.6402622461286e-07, "logits/chosen": -2.9114298820495605, "logits/rejected": -2.8543343544006348, "logps/chosen": -302.44757080078125, "logps/rejected": -310.2914123535156, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": -0.3990197777748108, "rewards/margins": 5.819601535797119, "rewards/rejected": -6.218621253967285, "step": 3730 }, { "epoch": 1.9570905285191, "grad_norm": 30.8236621378486, "learning_rate": 1.625982961693262e-07, "logits/chosen": -3.048753261566162, "logits/rejected": -2.8900704383850098, "logps/chosen": -333.42938232421875, "logps/rejected": -294.60888671875, "loss": 0.078, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.3006601929664612, "rewards/margins": 5.74398946762085, "rewards/rejected": -6.044649600982666, "step": 3740 }, { "epoch": 1.9623233908948194, "grad_norm": 43.44870422203161, "learning_rate": 1.6117360878710266e-07, "logits/chosen": -3.0620174407958984, "logits/rejected": -2.918151378631592, "logps/chosen": -310.9680480957031, "logps/rejected": -344.06298828125, "loss": 0.0999, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3079250156879425, "rewards/margins": 5.785721778869629, "rewards/rejected": -6.093646049499512, "step": 3750 }, { "epoch": 1.9675562532705388, "grad_norm": 30.101710729785086, "learning_rate": 1.5975221529695773e-07, "logits/chosen": -2.9273064136505127, "logits/rejected": -2.84016752243042, "logps/chosen": -226.0554656982422, "logps/rejected": -232.58676147460938, "loss": 0.1023, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9542840123176575, "rewards/margins": 4.591476917266846, "rewards/rejected": -5.5457611083984375, "step": 3760 }, { "epoch": 1.9727891156462585, "grad_norm": 16.51801980927947, "learning_rate": 1.5833416840751406e-07, "logits/chosen": -2.9456324577331543, "logits/rejected": -2.78047513961792, "logps/chosen": -241.0171356201172, "logps/rejected": -230.7863006591797, "loss": 0.0972, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.8832007646560669, "rewards/margins": 5.113748073577881, "rewards/rejected": -5.996949195861816, "step": 3770 }, { "epoch": 1.978021978021978, "grad_norm": 39.56856991971933, "learning_rate": 1.5691952070329493e-07, "logits/chosen": -3.033379077911377, "logits/rejected": -2.991994619369507, "logps/chosen": -329.7643737792969, "logps/rejected": -391.1466369628906, "loss": 0.1005, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2172931432723999, "rewards/margins": 6.207066059112549, "rewards/rejected": -6.424359321594238, "step": 3780 }, { "epoch": 1.9832548403976975, "grad_norm": 31.387293037507856, "learning_rate": 1.555083246427734e-07, "logits/chosen": -2.8786675930023193, "logits/rejected": -2.8804256916046143, "logps/chosen": -318.87255859375, "logps/rejected": -349.0558776855469, "loss": 0.0975, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7609188556671143, "rewards/margins": 6.44631290435791, "rewards/rejected": -7.2072319984436035, "step": 3790 }, { "epoch": 1.988487702773417, "grad_norm": 9.82231069234189, "learning_rate": 1.5410063255642767e-07, "logits/chosen": -2.8453922271728516, "logits/rejected": -2.837798595428467, "logps/chosen": -274.1583557128906, "logps/rejected": -309.01226806640625, "loss": 0.0943, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.737366259098053, "rewards/margins": 5.473329544067383, "rewards/rejected": -6.210696220397949, "step": 3800 }, { "epoch": 1.9937205651491365, "grad_norm": 11.960501454551116, "learning_rate": 1.5269649664480037e-07, "logits/chosen": -2.9213318824768066, "logits/rejected": -2.9113833904266357, "logps/chosen": -321.54302978515625, "logps/rejected": -363.2264099121094, "loss": 0.0989, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.983420193195343, "rewards/margins": 5.245428562164307, "rewards/rejected": -6.228848457336426, "step": 3810 }, { "epoch": 1.9989534275248562, "grad_norm": 19.391136085665703, "learning_rate": 1.5129596897656255e-07, "logits/chosen": -2.899928569793701, "logits/rejected": -2.820655107498169, "logps/chosen": -283.9640808105469, "logps/rejected": -293.7140808105469, "loss": 0.061, "rewards/accuracies": 1.0, "rewards/chosen": -0.36992591619491577, "rewards/margins": 5.5189313888549805, "rewards/rejected": -5.888857364654541, "step": 3820 }, { "epoch": 2.004186289900576, "grad_norm": 3.226443458037429, "learning_rate": 1.4989910148658324e-07, "logits/chosen": -2.9906039237976074, "logits/rejected": -2.929668664932251, "logps/chosen": -285.82086181640625, "logps/rejected": -338.1489562988281, "loss": 0.022, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3656575381755829, "rewards/margins": 5.876527786254883, "rewards/rejected": -6.242185115814209, "step": 3830 }, { "epoch": 2.009419152276295, "grad_norm": 6.854227823189048, "learning_rate": 1.485059459740035e-07, "logits/chosen": -2.936378240585327, "logits/rejected": -2.8414175510406494, "logps/chosen": -306.9588928222656, "logps/rejected": -365.2276306152344, "loss": 0.0162, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.621353805065155, "rewards/margins": 7.0284223556518555, "rewards/rejected": -7.649776458740234, "step": 3840 }, { "epoch": 2.0146520146520146, "grad_norm": 5.424786181354322, "learning_rate": 1.4711655410031536e-07, "logits/chosen": -2.928842782974243, "logits/rejected": -2.856654644012451, "logps/chosen": -245.53433227539062, "logps/rejected": -287.20050048828125, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -0.9247435331344604, "rewards/margins": 6.025231838226318, "rewards/rejected": -6.949975490570068, "step": 3850 }, { "epoch": 2.0198848770277342, "grad_norm": 7.975901095600039, "learning_rate": 1.4573097738744623e-07, "logits/chosen": -2.867645502090454, "logits/rejected": -2.864119291305542, "logps/chosen": -247.04833984375, "logps/rejected": -321.8243713378906, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -0.738558292388916, "rewards/margins": 6.551937103271484, "rewards/rejected": -7.2904953956604, "step": 3860 }, { "epoch": 2.025117739403454, "grad_norm": 5.006568696740622, "learning_rate": 1.4434926721584865e-07, "logits/chosen": -2.9452812671661377, "logits/rejected": -2.8177547454833984, "logps/chosen": -269.35235595703125, "logps/rejected": -339.67596435546875, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -0.36390841007232666, "rewards/margins": 7.587021827697754, "rewards/rejected": -7.950931549072266, "step": 3870 }, { "epoch": 2.030350601779173, "grad_norm": 1.7007580337206254, "learning_rate": 1.4297147482259424e-07, "logits/chosen": -2.9536361694335938, "logits/rejected": -2.890812635421753, "logps/chosen": -269.464599609375, "logps/rejected": -294.1587829589844, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -0.9002802968025208, "rewards/margins": 6.895692348480225, "rewards/rejected": -7.7959723472595215, "step": 3880 }, { "epoch": 2.0355834641548927, "grad_norm": 1.6710758465210216, "learning_rate": 1.4159765129947443e-07, "logits/chosen": -2.9906704425811768, "logits/rejected": -2.9642977714538574, "logps/chosen": -245.62216186523438, "logps/rejected": -300.7028503417969, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -0.7113342881202698, "rewards/margins": 7.821895599365234, "rewards/rejected": -8.533231735229492, "step": 3890 }, { "epoch": 2.0408163265306123, "grad_norm": 5.195669109368125, "learning_rate": 1.4022784759110576e-07, "logits/chosen": -2.863924741744995, "logits/rejected": -2.805095672607422, "logps/chosen": -281.0409240722656, "logps/rejected": -343.31829833984375, "loss": 0.0088, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8401035070419312, "rewards/margins": 6.752278804779053, "rewards/rejected": -8.592382431030273, "step": 3900 }, { "epoch": 2.046049188906332, "grad_norm": 21.589135033736866, "learning_rate": 1.3886211449304002e-07, "logits/chosen": -2.889181137084961, "logits/rejected": -2.8925070762634277, "logps/chosen": -249.3176727294922, "logps/rejected": -412.71124267578125, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -1.571118712425232, "rewards/margins": 7.503713130950928, "rewards/rejected": -9.07483196258545, "step": 3910 }, { "epoch": 2.051282051282051, "grad_norm": 3.3464316171626294, "learning_rate": 1.3750050264988172e-07, "logits/chosen": -2.8536133766174316, "logits/rejected": -2.9086737632751465, "logps/chosen": -188.2241973876953, "logps/rejected": -318.10552978515625, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -0.4670838415622711, "rewards/margins": 8.878015518188477, "rewards/rejected": -9.345098495483398, "step": 3920 }, { "epoch": 2.0565149136577707, "grad_norm": 1.150137338297148, "learning_rate": 1.3614306255340918e-07, "logits/chosen": -2.990967035293579, "logits/rejected": -2.8272769451141357, "logps/chosen": -289.8829040527344, "logps/rejected": -297.67401123046875, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -0.7753466367721558, "rewards/margins": 7.968071937561035, "rewards/rejected": -8.743417739868164, "step": 3930 }, { "epoch": 2.0617477760334904, "grad_norm": 2.6144403947073487, "learning_rate": 1.347898445407027e-07, "logits/chosen": -2.896030902862549, "logits/rejected": -2.824877977371216, "logps/chosen": -308.11328125, "logps/rejected": -363.9369201660156, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -0.7604565024375916, "rewards/margins": 8.344852447509766, "rewards/rejected": -9.105307579040527, "step": 3940 }, { "epoch": 2.06698063840921, "grad_norm": 1.5264593469434302, "learning_rate": 1.3344089879227768e-07, "logits/chosen": -2.9457814693450928, "logits/rejected": -2.884833335876465, "logps/chosen": -323.90606689453125, "logps/rejected": -346.5791015625, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -1.3230036497116089, "rewards/margins": 8.264123916625977, "rewards/rejected": -9.587126731872559, "step": 3950 }, { "epoch": 2.072213500784929, "grad_norm": 0.8181588675537088, "learning_rate": 1.3209627533022393e-07, "logits/chosen": -2.7985682487487793, "logits/rejected": -2.8037781715393066, "logps/chosen": -304.4876708984375, "logps/rejected": -359.4815673828125, "loss": 0.007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0158947706222534, "rewards/margins": 8.387866973876953, "rewards/rejected": -9.40376091003418, "step": 3960 }, { "epoch": 2.077446363160649, "grad_norm": 5.208914346563764, "learning_rate": 1.3075602401635056e-07, "logits/chosen": -2.8890557289123535, "logits/rejected": -2.8292429447174072, "logps/chosen": -227.6912078857422, "logps/rejected": -233.0723114013672, "loss": 0.0169, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9975049495697021, "rewards/margins": 6.517954349517822, "rewards/rejected": -8.515460014343262, "step": 3970 }, { "epoch": 2.0826792255363684, "grad_norm": 15.712567968328464, "learning_rate": 1.2942019455033715e-07, "logits/chosen": -2.915278673171997, "logits/rejected": -2.8793892860412598, "logps/chosen": -358.57940673828125, "logps/rejected": -383.87591552734375, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -1.4254333972930908, "rewards/margins": 8.060611724853516, "rewards/rejected": -9.486045837402344, "step": 3980 }, { "epoch": 2.087912087912088, "grad_norm": 1.0020260330031099, "learning_rate": 1.2808883646789088e-07, "logits/chosen": -2.9130358695983887, "logits/rejected": -2.8442845344543457, "logps/chosen": -269.8409729003906, "logps/rejected": -335.58087158203125, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -1.3813385963439941, "rewards/margins": 8.000204086303711, "rewards/rejected": -9.381542205810547, "step": 3990 }, { "epoch": 2.0931449502878072, "grad_norm": 2.1313305645032012, "learning_rate": 1.2676199913890933e-07, "logits/chosen": -2.8041138648986816, "logits/rejected": -2.7163589000701904, "logps/chosen": -295.2092590332031, "logps/rejected": -312.94775390625, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -1.6783853769302368, "rewards/margins": 6.666558265686035, "rewards/rejected": -8.344942092895508, "step": 4000 }, { "epoch": 2.0931449502878072, "eval_logits/chosen": -2.875235080718994, "eval_logits/rejected": -2.8350534439086914, "eval_logps/chosen": -302.49029541015625, "eval_logps/rejected": -333.1784362792969, "eval_loss": 0.7077480554580688, "eval_rewards/accuracies": 0.78125, "eval_rewards/chosen": -4.361080169677734, "eval_rewards/margins": 2.640265941619873, "eval_rewards/rejected": -7.001346588134766, "eval_runtime": 96.4702, "eval_samples_per_second": 20.732, "eval_steps_per_second": 0.332, "step": 4000 }, { "epoch": 2.098377812663527, "grad_norm": 1.5419449260767126, "learning_rate": 1.2543973176565012e-07, "logits/chosen": -2.8510260581970215, "logits/rejected": -2.8032736778259277, "logps/chosen": -224.27072143554688, "logps/rejected": -322.1229553222656, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -1.844692587852478, "rewards/margins": 8.92339038848877, "rewards/rejected": -10.768081665039062, "step": 4010 }, { "epoch": 2.1036106750392465, "grad_norm": 2.8771549994155796, "learning_rate": 1.2412208338090565e-07, "logits/chosen": -2.9542076587677, "logits/rejected": -2.9085967540740967, "logps/chosen": -346.3165283203125, "logps/rejected": -409.2415771484375, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -1.5699872970581055, "rewards/margins": 8.486124992370605, "rewards/rejected": -10.056112289428711, "step": 4020 }, { "epoch": 2.108843537414966, "grad_norm": 1.7958197870492556, "learning_rate": 1.228091028461858e-07, "logits/chosen": -2.924954891204834, "logits/rejected": -2.8718249797821045, "logps/chosen": -268.8287048339844, "logps/rejected": -378.38360595703125, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -1.8416907787322998, "rewards/margins": 8.17067813873291, "rewards/rejected": -10.012369155883789, "step": 4030 }, { "epoch": 2.1140763997906853, "grad_norm": 3.9722337679826154, "learning_rate": 1.2150083884990536e-07, "logits/chosen": -2.919137477874756, "logits/rejected": -2.8333396911621094, "logps/chosen": -289.49420166015625, "logps/rejected": -357.74322509765625, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -2.1199421882629395, "rewards/margins": 8.458976745605469, "rewards/rejected": -10.57891845703125, "step": 4040 }, { "epoch": 2.119309262166405, "grad_norm": 1.1008363327625712, "learning_rate": 1.201973399055788e-07, "logits/chosen": -3.006373643875122, "logits/rejected": -2.9574332237243652, "logps/chosen": -330.5584411621094, "logps/rejected": -372.254150390625, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -1.5476634502410889, "rewards/margins": 8.164642333984375, "rewards/rejected": -9.712306022644043, "step": 4050 }, { "epoch": 2.1245421245421245, "grad_norm": 0.815197076931593, "learning_rate": 1.1889865435002117e-07, "logits/chosen": -2.990003824234009, "logits/rejected": -2.9572081565856934, "logps/chosen": -292.25714111328125, "logps/rejected": -359.2685852050781, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -1.2002347707748413, "rewards/margins": 7.958550930023193, "rewards/rejected": -9.158784866333008, "step": 4060 }, { "epoch": 2.129774986917844, "grad_norm": 1.3162248645643395, "learning_rate": 1.1760483034155588e-07, "logits/chosen": -2.8831801414489746, "logits/rejected": -2.854194164276123, "logps/chosen": -285.760986328125, "logps/rejected": -368.67315673828125, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -2.3038716316223145, "rewards/margins": 8.89173698425293, "rewards/rejected": -11.195609092712402, "step": 4070 }, { "epoch": 2.1350078492935634, "grad_norm": 3.181904749517599, "learning_rate": 1.163159158582284e-07, "logits/chosen": -2.8032143115997314, "logits/rejected": -2.8025825023651123, "logps/chosen": -291.22113037109375, "logps/rejected": -366.7909851074219, "loss": 0.0183, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.4583861827850342, "rewards/margins": 9.16749382019043, "rewards/rejected": -10.625879287719727, "step": 4080 }, { "epoch": 2.140240711669283, "grad_norm": 5.137499238942955, "learning_rate": 1.1503195869602766e-07, "logits/chosen": -2.870023012161255, "logits/rejected": -2.7650625705718994, "logps/chosen": -276.405517578125, "logps/rejected": -336.3729248046875, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -1.8698413372039795, "rewards/margins": 9.982603073120117, "rewards/rejected": -11.852442741394043, "step": 4090 }, { "epoch": 2.1454735740450026, "grad_norm": 5.481998459620851, "learning_rate": 1.137530064671135e-07, "logits/chosen": -2.841693639755249, "logits/rejected": -2.909789800643921, "logps/chosen": -245.60482788085938, "logps/rejected": -352.0904235839844, "loss": 0.0128, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.7970342636108398, "rewards/margins": 7.849973201751709, "rewards/rejected": -9.647008895874023, "step": 4100 }, { "epoch": 2.1507064364207222, "grad_norm": 4.114957280500983, "learning_rate": 1.1247910659805063e-07, "logits/chosen": -2.928069591522217, "logits/rejected": -2.8762078285217285, "logps/chosen": -321.8515319824219, "logps/rejected": -304.36468505859375, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -2.0382261276245117, "rewards/margins": 8.376965522766113, "rewards/rejected": -10.415192604064941, "step": 4110 }, { "epoch": 2.155939298796442, "grad_norm": 1.9046806482625325, "learning_rate": 1.112103063280509e-07, "logits/chosen": -2.878873109817505, "logits/rejected": -2.7525992393493652, "logps/chosen": -263.1964111328125, "logps/rejected": -406.6435546875, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -1.250799298286438, "rewards/margins": 9.15456771850586, "rewards/rejected": -10.405366897583008, "step": 4120 }, { "epoch": 2.161172161172161, "grad_norm": 1.5065895848136046, "learning_rate": 1.099466527072207e-07, "logits/chosen": -2.850222110748291, "logits/rejected": -2.852191686630249, "logps/chosen": -228.3606414794922, "logps/rejected": -362.36175537109375, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -1.6445125341415405, "rewards/margins": 8.477560043334961, "rewards/rejected": -10.122072219848633, "step": 4130 }, { "epoch": 2.1664050235478807, "grad_norm": 2.594425613802903, "learning_rate": 1.0868819259481638e-07, "logits/chosen": -2.8431077003479004, "logits/rejected": -2.7107739448547363, "logps/chosen": -295.4981994628906, "logps/rejected": -291.9450988769531, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -2.481992483139038, "rewards/margins": 8.041691780090332, "rewards/rejected": -10.523683547973633, "step": 4140 }, { "epoch": 2.1716378859236003, "grad_norm": 6.731182131911597, "learning_rate": 1.0743497265750701e-07, "logits/chosen": -2.9597175121307373, "logits/rejected": -2.890862226486206, "logps/chosen": -282.6196594238281, "logps/rejected": -359.4171447753906, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -1.7797727584838867, "rewards/margins": 8.09226131439209, "rewards/rejected": -9.872034072875977, "step": 4150 }, { "epoch": 2.17687074829932, "grad_norm": 2.947186809527405, "learning_rate": 1.0618703936764359e-07, "logits/chosen": -2.9624671936035156, "logits/rejected": -2.8392415046691895, "logps/chosen": -317.45025634765625, "logps/rejected": -398.1184997558594, "loss": 0.0106, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7889065742492676, "rewards/margins": 9.177200317382812, "rewards/rejected": -11.966105461120605, "step": 4160 }, { "epoch": 2.182103610675039, "grad_norm": 1.6210552596245766, "learning_rate": 1.0494443900153557e-07, "logits/chosen": -2.9592700004577637, "logits/rejected": -2.8041563034057617, "logps/chosen": -310.41729736328125, "logps/rejected": -360.01031494140625, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -1.5765081644058228, "rewards/margins": 8.299886703491211, "rewards/rejected": -9.876394271850586, "step": 4170 }, { "epoch": 2.1873364730507587, "grad_norm": 0.8775807666257988, "learning_rate": 1.0370721763773507e-07, "logits/chosen": -2.9034318923950195, "logits/rejected": -2.754986524581909, "logps/chosen": -341.43121337890625, "logps/rejected": -355.4364318847656, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -1.5358606576919556, "rewards/margins": 9.34194278717041, "rewards/rejected": -10.877802848815918, "step": 4180 }, { "epoch": 2.1925693354264784, "grad_norm": 7.190182230593559, "learning_rate": 1.0247542115532845e-07, "logits/chosen": -2.8554604053497314, "logits/rejected": -2.8050451278686523, "logps/chosen": -298.0216369628906, "logps/rejected": -355.5791320800781, "loss": 0.0175, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.570692539215088, "rewards/margins": 8.322811126708984, "rewards/rejected": -10.893505096435547, "step": 4190 }, { "epoch": 2.197802197802198, "grad_norm": 19.891127844803, "learning_rate": 1.0124909523223418e-07, "logits/chosen": -2.8431711196899414, "logits/rejected": -2.79714035987854, "logps/chosen": -311.7491760253906, "logps/rejected": -366.1868591308594, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -2.247616767883301, "rewards/margins": 9.083130836486816, "rewards/rejected": -11.3307466506958, "step": 4200 }, { "epoch": 2.203035060177917, "grad_norm": 0.767010561032602, "learning_rate": 1.0002828534350987e-07, "logits/chosen": -2.9404544830322266, "logits/rejected": -2.848851203918457, "logps/chosen": -339.2194519042969, "logps/rejected": -364.5045471191406, "loss": 0.0175, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.409802198410034, "rewards/margins": 7.917397499084473, "rewards/rejected": -10.32719898223877, "step": 4210 }, { "epoch": 2.208267922553637, "grad_norm": 7.397743767922527, "learning_rate": 9.881303675966524e-08, "logits/chosen": -2.870842933654785, "logits/rejected": -2.7748990058898926, "logps/chosen": -298.0293273925781, "logps/rejected": -362.5297546386719, "loss": 0.0069, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.641525983810425, "rewards/margins": 8.677454948425293, "rewards/rejected": -11.318981170654297, "step": 4220 }, { "epoch": 2.2135007849293564, "grad_norm": 1.5980129800991065, "learning_rate": 9.760339454498393e-08, "logits/chosen": -2.725639820098877, "logits/rejected": -2.7300057411193848, "logps/chosen": -250.8970184326172, "logps/rejected": -319.5538024902344, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -2.5505714416503906, "rewards/margins": 8.494199752807617, "rewards/rejected": -11.044769287109375, "step": 4230 }, { "epoch": 2.218733647305076, "grad_norm": 2.7637120092611203, "learning_rate": 9.639940355585218e-08, "logits/chosen": -2.9450080394744873, "logits/rejected": -2.908446788787842, "logps/chosen": -297.3518981933594, "logps/rejected": -380.93487548828125, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -3.1906774044036865, "rewards/margins": 7.418745517730713, "rewards/rejected": -10.60942268371582, "step": 4240 }, { "epoch": 2.2239665096807952, "grad_norm": 15.838669159602834, "learning_rate": 9.52011084390954e-08, "logits/chosen": -2.878129005432129, "logits/rejected": -2.859032392501831, "logps/chosen": -286.35406494140625, "logps/rejected": -353.4107360839844, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -2.5251975059509277, "rewards/margins": 8.706304550170898, "rewards/rejected": -11.231501579284668, "step": 4250 }, { "epoch": 2.229199372056515, "grad_norm": 3.761275164623466, "learning_rate": 9.400855363032262e-08, "logits/chosen": -2.8847315311431885, "logits/rejected": -2.9110546112060547, "logps/chosen": -309.8160095214844, "logps/rejected": -395.5894470214844, "loss": 0.0135, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8557885885238647, "rewards/margins": 9.05466079711914, "rewards/rejected": -10.91045093536377, "step": 4260 }, { "epoch": 2.2344322344322345, "grad_norm": 1.3758252945018405, "learning_rate": 9.282178335227883e-08, "logits/chosen": -2.865054130554199, "logits/rejected": -2.8139474391937256, "logps/chosen": -275.11956787109375, "logps/rejected": -374.24786376953125, "loss": 0.0057, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.454589366912842, "rewards/margins": 8.777228355407715, "rewards/rejected": -11.231819152832031, "step": 4270 }, { "epoch": 2.239665096807954, "grad_norm": 4.598664260094266, "learning_rate": 9.164084161320471e-08, "logits/chosen": -2.8596701622009277, "logits/rejected": -2.7449417114257812, "logps/chosen": -279.70465087890625, "logps/rejected": -354.41314697265625, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -2.293121337890625, "rewards/margins": 9.623333930969238, "rewards/rejected": -11.916454315185547, "step": 4280 }, { "epoch": 2.2448979591836733, "grad_norm": 0.9729956891663278, "learning_rate": 9.046577220520518e-08, "logits/chosen": -2.862408399581909, "logits/rejected": -2.787109375, "logps/chosen": -279.13470458984375, "logps/rejected": -352.298828125, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -2.8487582206726074, "rewards/margins": 8.20508098602295, "rewards/rejected": -11.053838729858398, "step": 4290 }, { "epoch": 2.250130821559393, "grad_norm": 6.282293090699428, "learning_rate": 8.929661870262525e-08, "logits/chosen": -3.009525775909424, "logits/rejected": -2.9176905155181885, "logps/chosen": -389.9200439453125, "logps/rejected": -381.29046630859375, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -1.7698036432266235, "rewards/margins": 8.699227333068848, "rewards/rejected": -10.469030380249023, "step": 4300 }, { "epoch": 2.2553636839351126, "grad_norm": 15.657147640048068, "learning_rate": 8.813342446043423e-08, "logits/chosen": -2.9324398040771484, "logits/rejected": -2.8230299949645996, "logps/chosen": -290.99395751953125, "logps/rejected": -317.4901123046875, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": -2.367929220199585, "rewards/margins": 8.228096961975098, "rewards/rejected": -10.596026420593262, "step": 4310 }, { "epoch": 2.260596546310832, "grad_norm": 2.858292396647785, "learning_rate": 8.697623261261788e-08, "logits/chosen": -2.852550983428955, "logits/rejected": -2.8406786918640137, "logps/chosen": -264.0799865722656, "logps/rejected": -378.6510314941406, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -1.8443784713745117, "rewards/margins": 10.363582611083984, "rewards/rejected": -12.207961082458496, "step": 4320 }, { "epoch": 2.2658294086865514, "grad_norm": 1.4830353278631723, "learning_rate": 8.58250860705792e-08, "logits/chosen": -2.970691680908203, "logits/rejected": -2.912869453430176, "logps/chosen": -330.68994140625, "logps/rejected": -381.26727294921875, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -1.7346365451812744, "rewards/margins": 8.810033798217773, "rewards/rejected": -10.544671058654785, "step": 4330 }, { "epoch": 2.271062271062271, "grad_norm": 0.9684007577562029, "learning_rate": 8.468002752154671e-08, "logits/chosen": -2.988107442855835, "logits/rejected": -2.877903461456299, "logps/chosen": -324.2165832519531, "logps/rejected": -356.0824890136719, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -1.9134773015975952, "rewards/margins": 8.429335594177246, "rewards/rejected": -10.342813491821289, "step": 4340 }, { "epoch": 2.2762951334379906, "grad_norm": 5.166928976195506, "learning_rate": 8.354109942699208e-08, "logits/chosen": -2.9177370071411133, "logits/rejected": -2.8659799098968506, "logps/chosen": -293.4275817871094, "logps/rejected": -353.8664855957031, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -2.341238021850586, "rewards/margins": 8.192709922790527, "rewards/rejected": -10.533947944641113, "step": 4350 }, { "epoch": 2.2815279958137102, "grad_norm": 1.4727370183754902, "learning_rate": 8.240834402105524e-08, "logits/chosen": -2.843801736831665, "logits/rejected": -2.774552822113037, "logps/chosen": -319.27716064453125, "logps/rejected": -338.53692626953125, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -1.412759780883789, "rewards/margins": 8.236115455627441, "rewards/rejected": -9.648874282836914, "step": 4360 }, { "epoch": 2.2867608581894294, "grad_norm": 10.698831247775582, "learning_rate": 8.128180330897791e-08, "logits/chosen": -2.7989373207092285, "logits/rejected": -2.847008466720581, "logps/chosen": -301.494140625, "logps/rejected": -421.40972900390625, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -2.198242664337158, "rewards/margins": 10.001172065734863, "rewards/rejected": -12.199414253234863, "step": 4370 }, { "epoch": 2.291993720565149, "grad_norm": 1.6159310291539648, "learning_rate": 8.016151906554683e-08, "logits/chosen": -2.9029006958007812, "logits/rejected": -2.9015581607818604, "logps/chosen": -279.3041687011719, "logps/rejected": -447.77105712890625, "loss": 0.0131, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9287971258163452, "rewards/margins": 9.190216064453125, "rewards/rejected": -11.119011878967285, "step": 4380 }, { "epoch": 2.2972265829408687, "grad_norm": 2.135886196596052, "learning_rate": 7.90475328335439e-08, "logits/chosen": -2.8774006366729736, "logits/rejected": -2.81954288482666, "logps/chosen": -239.5826873779297, "logps/rejected": -311.48907470703125, "loss": 0.0162, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5247304439544678, "rewards/margins": 8.302604675292969, "rewards/rejected": -10.827333450317383, "step": 4390 }, { "epoch": 2.3024594453165883, "grad_norm": 7.217138056086161, "learning_rate": 7.793988592220568e-08, "logits/chosen": -2.865084171295166, "logits/rejected": -2.7769336700439453, "logps/chosen": -296.4810485839844, "logps/rejected": -340.00469970703125, "loss": 0.0182, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5750186443328857, "rewards/margins": 7.714239597320557, "rewards/rejected": -10.289258003234863, "step": 4400 }, { "epoch": 2.3076923076923075, "grad_norm": 1.4812452325086167, "learning_rate": 7.683861940569217e-08, "logits/chosen": -2.886091947555542, "logits/rejected": -2.826148509979248, "logps/chosen": -354.67462158203125, "logps/rejected": -368.5892639160156, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -2.558478832244873, "rewards/margins": 8.2522554397583, "rewards/rejected": -10.810735702514648, "step": 4410 }, { "epoch": 2.312925170068027, "grad_norm": 11.095467376164303, "learning_rate": 7.574377412156291e-08, "logits/chosen": -2.895280122756958, "logits/rejected": -2.7479138374328613, "logps/chosen": -291.3760986328125, "logps/rejected": -327.7812194824219, "loss": 0.018, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.644456386566162, "rewards/margins": 8.322864532470703, "rewards/rejected": -10.967321395874023, "step": 4420 }, { "epoch": 2.3181580324437467, "grad_norm": 2.567240285295665, "learning_rate": 7.465539066926322e-08, "logits/chosen": -2.8348584175109863, "logits/rejected": -2.8111917972564697, "logps/chosen": -307.2774963378906, "logps/rejected": -343.26385498046875, "loss": 0.0173, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.5507636070251465, "rewards/margins": 8.764577865600586, "rewards/rejected": -11.315340995788574, "step": 4430 }, { "epoch": 2.3233908948194664, "grad_norm": 6.8196986924338905, "learning_rate": 7.357350940861845e-08, "logits/chosen": -2.9440054893493652, "logits/rejected": -2.9104747772216797, "logps/chosen": -350.57159423828125, "logps/rejected": -448.69207763671875, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -2.7537646293640137, "rewards/margins": 8.89330005645752, "rewards/rejected": -11.647064208984375, "step": 4440 }, { "epoch": 2.328623757195186, "grad_norm": 3.4006570983074575, "learning_rate": 7.249817045833726e-08, "logits/chosen": -2.8378310203552246, "logits/rejected": -2.8005566596984863, "logps/chosen": -289.21942138671875, "logps/rejected": -330.5621337890625, "loss": 0.0161, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.4763882160186768, "rewards/margins": 8.542319297790527, "rewards/rejected": -11.018709182739258, "step": 4450 }, { "epoch": 2.333856619570905, "grad_norm": 1.1244912218077208, "learning_rate": 7.14294136945241e-08, "logits/chosen": -2.887730121612549, "logits/rejected": -2.8157246112823486, "logps/chosen": -296.88812255859375, "logps/rejected": -376.61163330078125, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -1.3376476764678955, "rewards/margins": 9.742364883422852, "rewards/rejected": -11.080012321472168, "step": 4460 }, { "epoch": 2.339089481946625, "grad_norm": 3.6228411833826217, "learning_rate": 7.036727874920043e-08, "logits/chosen": -2.729278326034546, "logits/rejected": -2.713383197784424, "logps/chosen": -284.9239196777344, "logps/rejected": -383.20172119140625, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -2.55851674079895, "rewards/margins": 9.046324729919434, "rewards/rejected": -11.604841232299805, "step": 4470 }, { "epoch": 2.3443223443223444, "grad_norm": 2.804591129673783, "learning_rate": 6.931180500883484e-08, "logits/chosen": -2.8510024547576904, "logits/rejected": -2.7999136447906494, "logps/chosen": -243.95718383789062, "logps/rejected": -292.40576171875, "loss": 0.0108, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.4016222953796387, "rewards/margins": 7.739932060241699, "rewards/rejected": -10.141555786132812, "step": 4480 }, { "epoch": 2.3495552066980636, "grad_norm": 1.4212992580674872, "learning_rate": 6.826303161288302e-08, "logits/chosen": -2.755215644836426, "logits/rejected": -2.673293352127075, "logps/chosen": -259.36334228515625, "logps/rejected": -344.40625, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -2.680428981781006, "rewards/margins": 9.432083129882812, "rewards/rejected": -12.11251163482666, "step": 4490 }, { "epoch": 2.3547880690737832, "grad_norm": 58.27312735591005, "learning_rate": 6.722099745233594e-08, "logits/chosen": -2.9853272438049316, "logits/rejected": -2.8632123470306396, "logps/chosen": -341.893798828125, "logps/rejected": -378.6851501464844, "loss": 0.0109, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.528996467590332, "rewards/margins": 8.846960067749023, "rewards/rejected": -11.375957489013672, "step": 4500 }, { "epoch": 2.360020931449503, "grad_norm": 14.51413658947118, "learning_rate": 6.618574116827786e-08, "logits/chosen": -2.8783841133117676, "logits/rejected": -2.8406574726104736, "logps/chosen": -267.320068359375, "logps/rejected": -354.52301025390625, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -2.8604700565338135, "rewards/margins": 8.928609848022461, "rewards/rejected": -11.789079666137695, "step": 4510 }, { "epoch": 2.3652537938252225, "grad_norm": 2.593079375528599, "learning_rate": 6.515730115045339e-08, "logits/chosen": -2.937530994415283, "logits/rejected": -2.8431241512298584, "logps/chosen": -336.4122619628906, "logps/rejected": -385.5653381347656, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -2.4173896312713623, "rewards/margins": 10.133063316345215, "rewards/rejected": -12.550455093383789, "step": 4520 }, { "epoch": 2.370486656200942, "grad_norm": 5.8560008226843925, "learning_rate": 6.413571553584399e-08, "logits/chosen": -2.826537609100342, "logits/rejected": -2.7623977661132812, "logps/chosen": -292.3869934082031, "logps/rejected": -360.14227294921875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -3.0917422771453857, "rewards/margins": 8.068608283996582, "rewards/rejected": -11.160350799560547, "step": 4530 }, { "epoch": 2.3757195185766613, "grad_norm": 5.531507200254094, "learning_rate": 6.312102220725346e-08, "logits/chosen": -2.9753031730651855, "logits/rejected": -2.833451986312866, "logps/chosen": -371.5586853027344, "logps/rejected": -396.01971435546875, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -2.391232967376709, "rewards/margins": 9.879590034484863, "rewards/rejected": -12.270822525024414, "step": 4540 }, { "epoch": 2.380952380952381, "grad_norm": 1.6363968615870823, "learning_rate": 6.21132587919036e-08, "logits/chosen": -2.927316665649414, "logits/rejected": -2.8438987731933594, "logps/chosen": -312.6570739746094, "logps/rejected": -389.4886169433594, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -2.3138856887817383, "rewards/margins": 9.4051513671875, "rewards/rejected": -11.719037055969238, "step": 4550 }, { "epoch": 2.3861852433281006, "grad_norm": 0.7728268823255452, "learning_rate": 6.111246266003859e-08, "logits/chosen": -2.786463737487793, "logits/rejected": -2.73313570022583, "logps/chosen": -343.6289978027344, "logps/rejected": -434.4911193847656, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -2.3142333030700684, "rewards/margins": 10.09544849395752, "rewards/rejected": -12.40968132019043, "step": 4560 }, { "epoch": 2.3914181057038197, "grad_norm": 1.940529315340631, "learning_rate": 6.011867092353934e-08, "logits/chosen": -2.891645908355713, "logits/rejected": -2.7753002643585205, "logps/chosen": -317.1549072265625, "logps/rejected": -330.22637939453125, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -2.583204746246338, "rewards/margins": 8.67894458770752, "rewards/rejected": -11.262149810791016, "step": 4570 }, { "epoch": 2.3966509680795394, "grad_norm": 1.5039771259847803, "learning_rate": 5.9131920434547235e-08, "logits/chosen": -2.7718305587768555, "logits/rejected": -2.8004114627838135, "logps/chosen": -352.2939147949219, "logps/rejected": -452.0146484375, "loss": 0.0145, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.55383038520813, "rewards/margins": 9.886983871459961, "rewards/rejected": -12.440813064575195, "step": 4580 }, { "epoch": 2.401883830455259, "grad_norm": 0.8277165840245028, "learning_rate": 5.8152247784097664e-08, "logits/chosen": -2.8936140537261963, "logits/rejected": -2.830893039703369, "logps/chosen": -351.01556396484375, "logps/rejected": -431.44818115234375, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -1.7904497385025024, "rewards/margins": 11.320137023925781, "rewards/rejected": -13.110588073730469, "step": 4590 }, { "epoch": 2.4071166928309786, "grad_norm": 3.985183306351277, "learning_rate": 5.717968930076289e-08, "logits/chosen": -2.8917033672332764, "logits/rejected": -2.85483980178833, "logps/chosen": -253.35891723632812, "logps/rejected": -345.45892333984375, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.432149648666382, "rewards/margins": 10.088820457458496, "rewards/rejected": -12.520971298217773, "step": 4600 }, { "epoch": 2.4123495552066982, "grad_norm": 0.6808395473992981, "learning_rate": 5.621428104930528e-08, "logits/chosen": -2.710055112838745, "logits/rejected": -2.665837049484253, "logps/chosen": -229.9737548828125, "logps/rejected": -356.55999755859375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -3.284646987915039, "rewards/margins": 10.69596004486084, "rewards/rejected": -13.980607986450195, "step": 4610 }, { "epoch": 2.4175824175824174, "grad_norm": 0.6015768056933132, "learning_rate": 5.525605882933965e-08, "logits/chosen": -2.8241562843322754, "logits/rejected": -2.81707763671875, "logps/chosen": -291.40899658203125, "logps/rejected": -374.1553955078125, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -2.6395621299743652, "rewards/margins": 9.293486595153809, "rewards/rejected": -11.933049201965332, "step": 4620 }, { "epoch": 2.422815279958137, "grad_norm": 32.33318363571996, "learning_rate": 5.4305058174005853e-08, "logits/chosen": -2.758427858352661, "logits/rejected": -2.7463533878326416, "logps/chosen": -408.1051330566406, "logps/rejected": -459.46038818359375, "loss": 0.0126, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7850792407989502, "rewards/margins": 11.271986961364746, "rewards/rejected": -13.05706787109375, "step": 4630 }, { "epoch": 2.4280481423338567, "grad_norm": 2.6330170525190257, "learning_rate": 5.33613143486511e-08, "logits/chosen": -2.8177361488342285, "logits/rejected": -2.6916282176971436, "logps/chosen": -347.5819091796875, "logps/rejected": -352.9397277832031, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -2.2173008918762207, "rewards/margins": 9.733224868774414, "rewards/rejected": -11.950526237487793, "step": 4640 }, { "epoch": 2.4332810047095763, "grad_norm": 0.8402037469904734, "learning_rate": 5.242486234952206e-08, "logits/chosen": -2.8267412185668945, "logits/rejected": -2.747302532196045, "logps/chosen": -312.5420837402344, "logps/rejected": -349.3916015625, "loss": 0.0063, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8949179649353027, "rewards/margins": 8.266252517700195, "rewards/rejected": -11.161170959472656, "step": 4650 }, { "epoch": 2.4385138670852955, "grad_norm": 0.5619502836731689, "learning_rate": 5.149573690246758e-08, "logits/chosen": -2.8437860012054443, "logits/rejected": -2.7873828411102295, "logps/chosen": -336.22540283203125, "logps/rejected": -380.4892578125, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -2.5367541313171387, "rewards/margins": 9.541874885559082, "rewards/rejected": -12.078627586364746, "step": 4660 }, { "epoch": 2.443746729461015, "grad_norm": 0.7099918602816323, "learning_rate": 5.057397246165052e-08, "logits/chosen": -2.8400425910949707, "logits/rejected": -2.7845816612243652, "logps/chosen": -388.25396728515625, "logps/rejected": -386.9708251953125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -3.137280225753784, "rewards/margins": 9.088190078735352, "rewards/rejected": -12.225470542907715, "step": 4670 }, { "epoch": 2.4489795918367347, "grad_norm": 3.188168874840576, "learning_rate": 4.9659603208270173e-08, "logits/chosen": -2.931820869445801, "logits/rejected": -2.7698841094970703, "logps/chosen": -374.2984619140625, "logps/rejected": -356.0014953613281, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -2.376450777053833, "rewards/margins": 8.353018760681152, "rewards/rejected": -10.729469299316406, "step": 4680 }, { "epoch": 2.4542124542124544, "grad_norm": 5.648447460535544, "learning_rate": 4.875266304929496e-08, "logits/chosen": -2.706265926361084, "logits/rejected": -2.6204583644866943, "logps/chosen": -253.626708984375, "logps/rejected": -316.3972473144531, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -2.9477574825286865, "rewards/margins": 8.929656982421875, "rewards/rejected": -11.877413749694824, "step": 4690 }, { "epoch": 2.4594453165881736, "grad_norm": 9.754913730476638, "learning_rate": 4.785318561620511e-08, "logits/chosen": -2.758218765258789, "logits/rejected": -2.763181447982788, "logps/chosen": -261.8508605957031, "logps/rejected": -377.4474792480469, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -3.8085179328918457, "rewards/margins": 9.278454780578613, "rewards/rejected": -13.0869722366333, "step": 4700 }, { "epoch": 2.464678178963893, "grad_norm": 8.3436239933347, "learning_rate": 4.696120426374503e-08, "logits/chosen": -2.674140453338623, "logits/rejected": -2.7273619174957275, "logps/chosen": -272.74053955078125, "logps/rejected": -384.50006103515625, "loss": 0.0132, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2627499103546143, "rewards/margins": 9.455053329467773, "rewards/rejected": -12.717803001403809, "step": 4710 }, { "epoch": 2.469911041339613, "grad_norm": 42.38356382127395, "learning_rate": 4.607675206868705e-08, "logits/chosen": -2.911252737045288, "logits/rejected": -2.8311667442321777, "logps/chosen": -294.817138671875, "logps/rejected": -335.45916748046875, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -2.9595112800598145, "rewards/margins": 9.083582878112793, "rewards/rejected": -12.043095588684082, "step": 4720 }, { "epoch": 2.4751439037153324, "grad_norm": 1.2686673568844424, "learning_rate": 4.519986182860452e-08, "logits/chosen": -2.8337135314941406, "logits/rejected": -2.70151948928833, "logps/chosen": -307.1358947753906, "logps/rejected": -323.981689453125, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -2.150207281112671, "rewards/margins": 8.236126899719238, "rewards/rejected": -10.386334419250488, "step": 4730 }, { "epoch": 2.4803767660910516, "grad_norm": 3.03528094244139, "learning_rate": 4.433056606065552e-08, "logits/chosen": -2.8454017639160156, "logits/rejected": -2.821680784225464, "logps/chosen": -258.31591796875, "logps/rejected": -362.5140075683594, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -2.541438579559326, "rewards/margins": 8.546899795532227, "rewards/rejected": -11.088338851928711, "step": 4740 }, { "epoch": 2.4856096284667712, "grad_norm": 0.8241877208372598, "learning_rate": 4.3468897000377427e-08, "logits/chosen": -3.0025548934936523, "logits/rejected": -2.9085915088653564, "logps/chosen": -306.0249328613281, "logps/rejected": -350.19537353515625, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -2.320474147796631, "rewards/margins": 8.933996200561523, "rewards/rejected": -11.254470825195312, "step": 4750 }, { "epoch": 2.490842490842491, "grad_norm": 4.436907888082089, "learning_rate": 4.2614886600491115e-08, "logits/chosen": -2.931757688522339, "logits/rejected": -2.88140869140625, "logps/chosen": -298.06732177734375, "logps/rejected": -395.2248840332031, "loss": 0.0076, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7518365383148193, "rewards/margins": 10.01350212097168, "rewards/rejected": -12.765339851379395, "step": 4760 }, { "epoch": 2.4960753532182105, "grad_norm": 5.945941565366146, "learning_rate": 4.1768566529716415e-08, "logits/chosen": -2.8417134284973145, "logits/rejected": -2.840275287628174, "logps/chosen": -267.5897216796875, "logps/rejected": -343.02490234375, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -3.633164167404175, "rewards/margins": 8.187626838684082, "rewards/rejected": -11.820791244506836, "step": 4770 }, { "epoch": 2.50130821559393, "grad_norm": 5.083289310081902, "learning_rate": 4.0929968171597526e-08, "logits/chosen": -2.779139995574951, "logits/rejected": -2.7463786602020264, "logps/chosen": -291.67840576171875, "logps/rejected": -307.4405212402344, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -2.2331254482269287, "rewards/margins": 9.132110595703125, "rewards/rejected": -11.365236282348633, "step": 4780 }, { "epoch": 2.5065410779696493, "grad_norm": 0.9473216420548509, "learning_rate": 4.009912262333942e-08, "logits/chosen": -2.891561269760132, "logits/rejected": -2.812431812286377, "logps/chosen": -287.28887939453125, "logps/rejected": -361.7596435546875, "loss": 0.0111, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.00136661529541, "rewards/margins": 9.062420845031738, "rewards/rejected": -13.063787460327148, "step": 4790 }, { "epoch": 2.511773940345369, "grad_norm": 9.085662209686403, "learning_rate": 3.927606069465442e-08, "logits/chosen": -2.7818572521209717, "logits/rejected": -2.6530632972717285, "logps/chosen": -328.9610900878906, "logps/rejected": -364.90081787109375, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -2.8505375385284424, "rewards/margins": 10.055088996887207, "rewards/rejected": -12.905624389648438, "step": 4800 }, { "epoch": 2.5170068027210886, "grad_norm": 2.428043162736643, "learning_rate": 3.8460812906620037e-08, "logits/chosen": -2.900851249694824, "logits/rejected": -2.807136058807373, "logps/chosen": -324.77069091796875, "logps/rejected": -391.6881408691406, "loss": 0.0063, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.6806442737579346, "rewards/margins": 9.193876266479492, "rewards/rejected": -11.874518394470215, "step": 4810 }, { "epoch": 2.5222396650968077, "grad_norm": 2.6082292477113556, "learning_rate": 3.765340949054696e-08, "logits/chosen": -2.837953567504883, "logits/rejected": -2.7410359382629395, "logps/chosen": -339.12506103515625, "logps/rejected": -346.45269775390625, "loss": 0.0118, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.548368215560913, "rewards/margins": 8.925511360168457, "rewards/rejected": -11.47387981414795, "step": 4820 }, { "epoch": 2.5274725274725274, "grad_norm": 2.1991439907866823, "learning_rate": 3.685388038685811e-08, "logits/chosen": -2.8498270511627197, "logits/rejected": -2.804253578186035, "logps/chosen": -378.69287109375, "logps/rejected": -446.31671142578125, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -3.290935516357422, "rewards/margins": 9.570276260375977, "rewards/rejected": -12.861211776733398, "step": 4830 }, { "epoch": 2.532705389848247, "grad_norm": 2.11471939224516, "learning_rate": 3.60622552439783e-08, "logits/chosen": -2.7870585918426514, "logits/rejected": -2.720236301422119, "logps/chosen": -295.3894348144531, "logps/rejected": -378.73602294921875, "loss": 0.0187, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.546204090118408, "rewards/margins": 10.001089096069336, "rewards/rejected": -12.547292709350586, "step": 4840 }, { "epoch": 2.5379382522239666, "grad_norm": 7.2545747478366485, "learning_rate": 3.527856341723479e-08, "logits/chosen": -2.7534120082855225, "logits/rejected": -2.7376205921173096, "logps/chosen": -249.927490234375, "logps/rejected": -397.0672607421875, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -4.229075908660889, "rewards/margins": 10.334157943725586, "rewards/rejected": -14.563234329223633, "step": 4850 }, { "epoch": 2.5431711145996863, "grad_norm": 3.6595042487718166, "learning_rate": 3.4502833967768816e-08, "logits/chosen": -2.823514461517334, "logits/rejected": -2.8060412406921387, "logps/chosen": -346.64697265625, "logps/rejected": -391.0193786621094, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -3.0811607837677, "rewards/margins": 9.75478744506836, "rewards/rejected": -12.83594799041748, "step": 4860 }, { "epoch": 2.5484039769754054, "grad_norm": 0.7385561728804345, "learning_rate": 3.373509566145793e-08, "logits/chosen": -2.84466814994812, "logits/rejected": -2.736631393432617, "logps/chosen": -397.4510192871094, "logps/rejected": -403.17218017578125, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -2.7583231925964355, "rewards/margins": 10.043733596801758, "rewards/rejected": -12.802057266235352, "step": 4870 }, { "epoch": 2.553636839351125, "grad_norm": 0.8413684854417715, "learning_rate": 3.2975376967849104e-08, "logits/chosen": -2.8707072734832764, "logits/rejected": -2.770503044128418, "logps/chosen": -288.4997863769531, "logps/rejected": -365.47943115234375, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -2.5793163776397705, "rewards/margins": 8.853147506713867, "rewards/rejected": -11.432462692260742, "step": 4880 }, { "epoch": 2.5588697017268447, "grad_norm": 1.1525087617951097, "learning_rate": 3.222370605910332e-08, "logits/chosen": -2.8407764434814453, "logits/rejected": -2.7939982414245605, "logps/chosen": -315.42645263671875, "logps/rejected": -372.0333251953125, "loss": 0.015, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9064146280288696, "rewards/margins": 10.422744750976562, "rewards/rejected": -12.329158782958984, "step": 4890 }, { "epoch": 2.564102564102564, "grad_norm": 7.111465694919429, "learning_rate": 3.1480110808950746e-08, "logits/chosen": -2.705634593963623, "logits/rejected": -2.7576904296875, "logps/chosen": -278.1351623535156, "logps/rejected": -410.1261291503906, "loss": 0.0093, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3511059284210205, "rewards/margins": 9.042301177978516, "rewards/rejected": -12.393407821655273, "step": 4900 }, { "epoch": 2.5693354264782835, "grad_norm": 0.47137948248085515, "learning_rate": 3.07446187916568e-08, "logits/chosen": -2.880405902862549, "logits/rejected": -2.8474526405334473, "logps/chosen": -311.53106689453125, "logps/rejected": -395.5743713378906, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -3.169724225997925, "rewards/margins": 9.470863342285156, "rewards/rejected": -12.640586853027344, "step": 4910 }, { "epoch": 2.574568288854003, "grad_norm": 1.3832571995039906, "learning_rate": 3.001725728100021e-08, "logits/chosen": -2.8667898178100586, "logits/rejected": -2.7911057472229004, "logps/chosen": -336.6524658203125, "logps/rejected": -351.16754150390625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -3.1967453956604004, "rewards/margins": 9.439695358276367, "rewards/rejected": -12.63644027709961, "step": 4920 }, { "epoch": 2.5798011512297228, "grad_norm": 3.1578463225175364, "learning_rate": 2.9298053249261238e-08, "logits/chosen": -2.788025379180908, "logits/rejected": -2.832801580429077, "logps/chosen": -231.68673706054688, "logps/rejected": -321.92730712890625, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -3.4032485485076904, "rewards/margins": 8.166478157043457, "rewards/rejected": -11.569726943969727, "step": 4930 }, { "epoch": 2.5850340136054424, "grad_norm": 12.282264400073192, "learning_rate": 2.8587033366221534e-08, "logits/chosen": -2.8174407482147217, "logits/rejected": -2.8020496368408203, "logps/chosen": -266.63604736328125, "logps/rejected": -365.22918701171875, "loss": 0.0229, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5576350688934326, "rewards/margins": 8.906564712524414, "rewards/rejected": -12.464200019836426, "step": 4940 }, { "epoch": 2.5902668759811616, "grad_norm": 2.50894962185282, "learning_rate": 2.7884223998175248e-08, "logits/chosen": -2.8689892292022705, "logits/rejected": -2.807825803756714, "logps/chosen": -271.1974182128906, "logps/rejected": -385.3880310058594, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -3.010326862335205, "rewards/margins": 9.469340324401855, "rewards/rejected": -12.479666709899902, "step": 4950 }, { "epoch": 2.595499738356881, "grad_norm": 2.433165665594961, "learning_rate": 2.718965120695141e-08, "logits/chosen": -2.889346122741699, "logits/rejected": -2.9202868938446045, "logps/chosen": -311.3741149902344, "logps/rejected": -406.0078430175781, "loss": 0.013, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7230963706970215, "rewards/margins": 9.145428657531738, "rewards/rejected": -11.868524551391602, "step": 4960 }, { "epoch": 2.600732600732601, "grad_norm": 2.2885838804822067, "learning_rate": 2.6503340748947083e-08, "logits/chosen": -2.8939590454101562, "logits/rejected": -2.9163591861724854, "logps/chosen": -309.7974853515625, "logps/rejected": -477.691650390625, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -2.2452571392059326, "rewards/margins": 11.169631958007812, "rewards/rejected": -13.414888381958008, "step": 4970 }, { "epoch": 2.60596546310832, "grad_norm": 3.251305215304493, "learning_rate": 2.5825318074172763e-08, "logits/chosen": -2.955244302749634, "logits/rejected": -2.842569351196289, "logps/chosen": -310.46728515625, "logps/rejected": -378.60400390625, "loss": 0.011, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7299909591674805, "rewards/margins": 9.322093963623047, "rewards/rejected": -12.052083969116211, "step": 4980 }, { "epoch": 2.6111983254840396, "grad_norm": 3.164278662107775, "learning_rate": 2.5155608325308358e-08, "logits/chosen": -2.884864091873169, "logits/rejected": -2.768899917602539, "logps/chosen": -350.0999755859375, "logps/rejected": -405.120361328125, "loss": 0.007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5267579555511475, "rewards/margins": 9.53526496887207, "rewards/rejected": -12.062021255493164, "step": 4990 }, { "epoch": 2.6164311878597593, "grad_norm": 0.5548614506749885, "learning_rate": 2.4494236336770695e-08, "logits/chosen": -2.9105136394500732, "logits/rejected": -2.8631362915039062, "logps/chosen": -299.0857849121094, "logps/rejected": -398.74346923828125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -3.1038718223571777, "rewards/margins": 9.347532272338867, "rewards/rejected": -12.45140552520752, "step": 5000 }, { "epoch": 2.621664050235479, "grad_norm": 7.089149011298612, "learning_rate": 2.3841226633792983e-08, "logits/chosen": -2.8454575538635254, "logits/rejected": -2.736299753189087, "logps/chosen": -344.36773681640625, "logps/rejected": -371.8687744140625, "loss": 0.0061, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1170573234558105, "rewards/margins": 8.554204940795898, "rewards/rejected": -11.671262741088867, "step": 5010 }, { "epoch": 2.6268969126111985, "grad_norm": 13.578184184139218, "learning_rate": 2.319660343151511e-08, "logits/chosen": -2.8417019844055176, "logits/rejected": -2.796388864517212, "logps/chosen": -292.62542724609375, "logps/rejected": -322.54315185546875, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -2.846231460571289, "rewards/margins": 8.591178894042969, "rewards/rejected": -11.437410354614258, "step": 5020 }, { "epoch": 2.6321297749869177, "grad_norm": 2.193353873211175, "learning_rate": 2.2560390634085715e-08, "logits/chosen": -2.665523052215576, "logits/rejected": -2.6812186241149902, "logps/chosen": -267.73931884765625, "logps/rejected": -435.908447265625, "loss": 0.0148, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5985560417175293, "rewards/margins": 10.346343040466309, "rewards/rejected": -13.944900512695312, "step": 5030 }, { "epoch": 2.6373626373626373, "grad_norm": 2.295853894631262, "learning_rate": 2.1932611833775843e-08, "logits/chosen": -2.8300464153289795, "logits/rejected": -2.7399096488952637, "logps/chosen": -275.0865173339844, "logps/rejected": -369.3577575683594, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -2.4362025260925293, "rewards/margins": 11.685420036315918, "rewards/rejected": -14.121623039245605, "step": 5040 }, { "epoch": 2.642595499738357, "grad_norm": 25.527802887037115, "learning_rate": 2.1313290310103897e-08, "logits/chosen": -2.824143171310425, "logits/rejected": -2.7426810264587402, "logps/chosen": -260.436279296875, "logps/rejected": -359.41937255859375, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -3.409104824066162, "rewards/margins": 8.756021499633789, "rewards/rejected": -12.165127754211426, "step": 5050 }, { "epoch": 2.647828362114076, "grad_norm": 1.9601288060994437, "learning_rate": 2.0702449028972696e-08, "logits/chosen": -2.7761893272399902, "logits/rejected": -2.805558204650879, "logps/chosen": -304.22271728515625, "logps/rejected": -400.43829345703125, "loss": 0.0133, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.714393138885498, "rewards/margins": 9.458888053894043, "rewards/rejected": -12.173280715942383, "step": 5060 }, { "epoch": 2.6530612244897958, "grad_norm": 0.5976545101138077, "learning_rate": 2.0100110641817547e-08, "logits/chosen": -2.8457772731781006, "logits/rejected": -2.720494031906128, "logps/chosen": -332.1907653808594, "logps/rejected": -381.53564453125, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -3.4612631797790527, "rewards/margins": 11.067359924316406, "rewards/rejected": -14.5286226272583, "step": 5070 }, { "epoch": 2.6582940868655154, "grad_norm": 3.830109824420053, "learning_rate": 1.9506297484766427e-08, "logits/chosen": -2.8959743976593018, "logits/rejected": -2.812014102935791, "logps/chosen": -413.66259765625, "logps/rejected": -344.56915283203125, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -2.8445847034454346, "rewards/margins": 9.176851272583008, "rewards/rejected": -12.021435737609863, "step": 5080 }, { "epoch": 2.663526949241235, "grad_norm": 6.348537796244769, "learning_rate": 1.8921031577811692e-08, "logits/chosen": -2.6365835666656494, "logits/rejected": -2.6018736362457275, "logps/chosen": -295.9486083984375, "logps/rejected": -375.812255859375, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -4.033811569213867, "rewards/margins": 9.356529235839844, "rewards/rejected": -13.390340805053711, "step": 5090 }, { "epoch": 2.6687598116169546, "grad_norm": 1.5489139818860118, "learning_rate": 1.834433462399351e-08, "logits/chosen": -2.8717682361602783, "logits/rejected": -2.779851198196411, "logps/chosen": -317.25079345703125, "logps/rejected": -384.0370788574219, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -3.50787353515625, "rewards/margins": 8.796591758728027, "rewards/rejected": -12.304466247558594, "step": 5100 }, { "epoch": 2.6739926739926743, "grad_norm": 1.4557206257129032, "learning_rate": 1.7776228008594962e-08, "logits/chosen": -2.876269817352295, "logits/rejected": -2.8361306190490723, "logps/chosen": -300.97393798828125, "logps/rejected": -440.462890625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -3.2657203674316406, "rewards/margins": 9.72622013092041, "rewards/rejected": -12.99194049835205, "step": 5110 }, { "epoch": 2.6792255363683934, "grad_norm": 5.977463984445349, "learning_rate": 1.721673279834926e-08, "logits/chosen": -2.8128435611724854, "logits/rejected": -2.728541851043701, "logps/chosen": -302.1065368652344, "logps/rejected": -368.9043273925781, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -4.328803062438965, "rewards/margins": 9.526824951171875, "rewards/rejected": -13.855628967285156, "step": 5120 }, { "epoch": 2.684458398744113, "grad_norm": 5.73641268650303, "learning_rate": 1.666586974065831e-08, "logits/chosen": -2.84373140335083, "logits/rejected": -2.828235149383545, "logps/chosen": -331.79217529296875, "logps/rejected": -447.68658447265625, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -3.2069172859191895, "rewards/margins": 10.006868362426758, "rewards/rejected": -13.213785171508789, "step": 5130 }, { "epoch": 2.6896912611198327, "grad_norm": 4.2247151951411395, "learning_rate": 1.6123659262823497e-08, "logits/chosen": -2.8010668754577637, "logits/rejected": -2.739643096923828, "logps/chosen": -312.5476379394531, "logps/rejected": -341.955322265625, "loss": 0.0099, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8190884590148926, "rewards/margins": 9.376364707946777, "rewards/rejected": -12.195453643798828, "step": 5140 }, { "epoch": 2.694924123495552, "grad_norm": 11.16415990233915, "learning_rate": 1.5590121471288104e-08, "logits/chosen": -2.7714552879333496, "logits/rejected": -2.802783489227295, "logps/chosen": -233.0016326904297, "logps/rejected": -359.1783752441406, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -2.5449230670928955, "rewards/margins": 10.992413520812988, "rewards/rejected": -13.537336349487305, "step": 5150 }, { "epoch": 2.7001569858712715, "grad_norm": 1.503626672329318, "learning_rate": 1.5065276150891787e-08, "logits/chosen": -2.7082366943359375, "logits/rejected": -2.686601400375366, "logps/chosen": -277.1235046386719, "logps/rejected": -385.08056640625, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -4.005666732788086, "rewards/margins": 9.918832778930664, "rewards/rejected": -13.92449951171875, "step": 5160 }, { "epoch": 2.705389848246991, "grad_norm": 1.4090626977254352, "learning_rate": 1.4549142764136768e-08, "logits/chosen": -2.7677760124206543, "logits/rejected": -2.6739232540130615, "logps/chosen": -286.0080871582031, "logps/rejected": -386.0721740722656, "loss": 0.0145, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9922473430633545, "rewards/margins": 9.870194435119629, "rewards/rejected": -13.862442016601562, "step": 5170 }, { "epoch": 2.7106227106227108, "grad_norm": 0.8865141036924529, "learning_rate": 1.4041740450466383e-08, "logits/chosen": -2.8138980865478516, "logits/rejected": -2.806488513946533, "logps/chosen": -298.46075439453125, "logps/rejected": -399.13458251953125, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -3.6525275707244873, "rewards/margins": 9.79006576538086, "rewards/rejected": -13.442593574523926, "step": 5180 }, { "epoch": 2.7158555729984304, "grad_norm": 0.9193863322580139, "learning_rate": 1.3543088025555094e-08, "logits/chosen": -2.7722737789154053, "logits/rejected": -2.755964994430542, "logps/chosen": -291.847412109375, "logps/rejected": -334.3979797363281, "loss": 0.011, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0274300575256348, "rewards/margins": 8.683589935302734, "rewards/rejected": -11.711019515991211, "step": 5190 }, { "epoch": 2.7210884353741496, "grad_norm": 2.107229046818004, "learning_rate": 1.3053203980610744e-08, "logits/chosen": -2.7237067222595215, "logits/rejected": -2.737853527069092, "logps/chosen": -341.97076416015625, "logps/rejected": -430.0436096191406, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.6192383766174316, "rewards/margins": 10.682634353637695, "rewards/rejected": -13.301872253417969, "step": 5200 }, { "epoch": 2.726321297749869, "grad_norm": 1.0234771320259723, "learning_rate": 1.2572106481689243e-08, "logits/chosen": -2.7983312606811523, "logits/rejected": -2.7162864208221436, "logps/chosen": -272.1782531738281, "logps/rejected": -335.5082702636719, "loss": 0.0156, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.998861312866211, "rewards/margins": 8.353187561035156, "rewards/rejected": -12.352048873901367, "step": 5210 }, { "epoch": 2.731554160125589, "grad_norm": 3.3071687441807374, "learning_rate": 1.2099813369020467e-08, "logits/chosen": -2.888310670852661, "logits/rejected": -2.829333782196045, "logps/chosen": -319.7552185058594, "logps/rejected": -417.4698791503906, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -2.899280548095703, "rewards/margins": 8.995077133178711, "rewards/rejected": -11.89435863494873, "step": 5220 }, { "epoch": 2.736787022501308, "grad_norm": 11.70145991036455, "learning_rate": 1.1636342156346846e-08, "logits/chosen": -2.864396572113037, "logits/rejected": -2.739699125289917, "logps/chosen": -289.35223388671875, "logps/rejected": -363.620849609375, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -3.317915439605713, "rewards/margins": 9.076977729797363, "rewards/rejected": -12.394891738891602, "step": 5230 }, { "epoch": 2.7420198848770276, "grad_norm": 5.849865065816767, "learning_rate": 1.1181710030274043e-08, "logits/chosen": -2.6535065174102783, "logits/rejected": -2.5681209564208984, "logps/chosen": -242.10403442382812, "logps/rejected": -323.10565185546875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -2.433626651763916, "rewards/margins": 10.48798656463623, "rewards/rejected": -12.921612739562988, "step": 5240 }, { "epoch": 2.7472527472527473, "grad_norm": 2.650317223777701, "learning_rate": 1.0735933849633561e-08, "logits/chosen": -2.844148635864258, "logits/rejected": -2.739320993423462, "logps/chosen": -338.79052734375, "logps/rejected": -353.47125244140625, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -2.8515937328338623, "rewards/margins": 10.031624794006348, "rewards/rejected": -12.883219718933105, "step": 5250 }, { "epoch": 2.752485609628467, "grad_norm": 2.1897635321662112, "learning_rate": 1.0299030144857445e-08, "logits/chosen": -2.7948997020721436, "logits/rejected": -2.8225739002227783, "logps/chosen": -260.75604248046875, "logps/rejected": -371.6756896972656, "loss": 0.0095, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.7150368690490723, "rewards/margins": 10.25691032409668, "rewards/rejected": -13.971946716308594, "step": 5260 }, { "epoch": 2.7577184720041865, "grad_norm": 1.3239883522002978, "learning_rate": 9.871015117365516e-09, "logits/chosen": -2.8112540245056152, "logits/rejected": -2.7941198348999023, "logps/chosen": -260.8721008300781, "logps/rejected": -327.1636047363281, "loss": 0.0087, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.7425599098205566, "rewards/margins": 8.199372291564941, "rewards/rejected": -11.941932678222656, "step": 5270 }, { "epoch": 2.7629513343799057, "grad_norm": 2.164622497139039, "learning_rate": 9.451904638964447e-09, "logits/chosen": -2.84222149848938, "logits/rejected": -2.7562568187713623, "logps/chosen": -342.762939453125, "logps/rejected": -362.12567138671875, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -2.9720563888549805, "rewards/margins": 9.207704544067383, "rewards/rejected": -12.17976188659668, "step": 5280 }, { "epoch": 2.7681841967556253, "grad_norm": 25.89974307650772, "learning_rate": 9.041714251259214e-09, "logits/chosen": -2.7124979496002197, "logits/rejected": -2.573781967163086, "logps/chosen": -322.4733581542969, "logps/rejected": -380.67266845703125, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -3.8477091789245605, "rewards/margins": 9.666685104370117, "rewards/rejected": -13.51439380645752, "step": 5290 }, { "epoch": 2.773417059131345, "grad_norm": 14.53554269120926, "learning_rate": 8.640459165076857e-09, "logits/chosen": -2.754391670227051, "logits/rejected": -2.842442035675049, "logps/chosen": -242.980712890625, "logps/rejected": -385.0867614746094, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -3.3377766609191895, "rewards/margins": 9.526866912841797, "rewards/rejected": -12.864644050598145, "step": 5300 }, { "epoch": 2.778649921507064, "grad_norm": 13.391763318968158, "learning_rate": 8.248154259902246e-09, "logits/chosen": -2.8207249641418457, "logits/rejected": -2.6634838581085205, "logps/chosen": -319.7501525878906, "logps/rejected": -325.0962219238281, "loss": 0.0109, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.0858235359191895, "rewards/margins": 8.200992584228516, "rewards/rejected": -12.286816596984863, "step": 5310 }, { "epoch": 2.7838827838827838, "grad_norm": 0.8131755661002786, "learning_rate": 7.86481408332651e-09, "logits/chosen": -2.840381145477295, "logits/rejected": -2.724891185760498, "logps/chosen": -261.87213134765625, "logps/rejected": -343.8888244628906, "loss": 0.0092, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.8374130725860596, "rewards/margins": 8.089051246643066, "rewards/rejected": -11.926464080810547, "step": 5320 }, { "epoch": 2.7891156462585034, "grad_norm": 1.5976711518306985, "learning_rate": 7.490452850507506e-09, "logits/chosen": -2.836515426635742, "logits/rejected": -2.796640634536743, "logps/chosen": -294.7411193847656, "logps/rejected": -334.3037109375, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -3.177727222442627, "rewards/margins": 8.285417556762695, "rewards/rejected": -11.46314525604248, "step": 5330 }, { "epoch": 2.794348508634223, "grad_norm": 5.838821184483033, "learning_rate": 7.1250844436426535e-09, "logits/chosen": -2.7620816230773926, "logits/rejected": -2.688870906829834, "logps/chosen": -265.0762023925781, "logps/rejected": -362.812744140625, "loss": 0.0104, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.230181694030762, "rewards/margins": 10.33739948272705, "rewards/rejected": -14.567581176757812, "step": 5340 }, { "epoch": 2.7995813710099426, "grad_norm": 4.819761170126577, "learning_rate": 6.768722411454153e-09, "logits/chosen": -2.764155626296997, "logits/rejected": -2.726027727127075, "logps/chosen": -276.2723083496094, "logps/rejected": -356.1758117675781, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -3.3997504711151123, "rewards/margins": 9.267010688781738, "rewards/rejected": -12.66676139831543, "step": 5350 }, { "epoch": 2.804814233385662, "grad_norm": 6.778823294329893, "learning_rate": 6.421379968686663e-09, "logits/chosen": -2.916551351547241, "logits/rejected": -2.7838337421417236, "logps/chosen": -401.6727600097656, "logps/rejected": -409.50726318359375, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -2.999206781387329, "rewards/margins": 8.48798656463623, "rewards/rejected": -11.487194061279297, "step": 5360 }, { "epoch": 2.8100470957613815, "grad_norm": 1.1061159774827645, "learning_rate": 6.083069995617113e-09, "logits/chosen": -2.7490057945251465, "logits/rejected": -2.626734733581543, "logps/chosen": -288.5465393066406, "logps/rejected": -363.09710693359375, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -3.640535354614258, "rewards/margins": 9.484700202941895, "rewards/rejected": -13.125234603881836, "step": 5370 }, { "epoch": 2.815279958137101, "grad_norm": 4.9735604622701866, "learning_rate": 5.753805037577192e-09, "logits/chosen": -2.678799867630005, "logits/rejected": -2.7331185340881348, "logps/chosen": -286.4404296875, "logps/rejected": -351.1804504394531, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -3.1906158924102783, "rewards/margins": 8.553506851196289, "rewards/rejected": -11.744123458862305, "step": 5380 }, { "epoch": 2.8205128205128203, "grad_norm": 1.7965041885206476, "learning_rate": 5.433597304488113e-09, "logits/chosen": -2.814871072769165, "logits/rejected": -2.698124885559082, "logps/chosen": -328.62579345703125, "logps/rejected": -441.0506286621094, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -2.7635135650634766, "rewards/margins": 10.54139232635498, "rewards/rejected": -13.304906845092773, "step": 5390 }, { "epoch": 2.82574568288854, "grad_norm": 2.0838353460515426, "learning_rate": 5.122458670407836e-09, "logits/chosen": -2.868966579437256, "logits/rejected": -2.70882248878479, "logps/chosen": -278.3230895996094, "logps/rejected": -293.0477600097656, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -2.8789265155792236, "rewards/margins": 8.013612747192383, "rewards/rejected": -10.892539978027344, "step": 5400 }, { "epoch": 2.8309785452642595, "grad_norm": 5.397402656575744, "learning_rate": 4.820400673090669e-09, "logits/chosen": -2.786712169647217, "logits/rejected": -2.8512330055236816, "logps/chosen": -354.48724365234375, "logps/rejected": -445.27667236328125, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -3.5950660705566406, "rewards/margins": 10.25749683380127, "rewards/rejected": -13.852563858032227, "step": 5410 }, { "epoch": 2.836211407639979, "grad_norm": 1.6194021837453196, "learning_rate": 4.5274345135595525e-09, "logits/chosen": -2.844299077987671, "logits/rejected": -2.80253529548645, "logps/chosen": -377.3062438964844, "logps/rejected": -440.7186584472656, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -2.4788155555725098, "rewards/margins": 11.439664840698242, "rewards/rejected": -13.918481826782227, "step": 5420 }, { "epoch": 2.8414442700156988, "grad_norm": 2.0692775916746555, "learning_rate": 4.243571055690648e-09, "logits/chosen": -2.9225494861602783, "logits/rejected": -2.9030351638793945, "logps/chosen": -377.549560546875, "logps/rejected": -445.652587890625, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -3.2073466777801514, "rewards/margins": 9.48175048828125, "rewards/rejected": -12.689098358154297, "step": 5430 }, { "epoch": 2.846677132391418, "grad_norm": 15.67835967006635, "learning_rate": 3.968820825810431e-09, "logits/chosen": -2.6517739295959473, "logits/rejected": -2.582855463027954, "logps/chosen": -281.6697082519531, "logps/rejected": -330.08782958984375, "loss": 0.0129, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.1345291137695312, "rewards/margins": 9.067094802856445, "rewards/rejected": -12.201624870300293, "step": 5440 }, { "epoch": 2.8519099947671376, "grad_norm": 13.54099538869596, "learning_rate": 3.7031940123053997e-09, "logits/chosen": -2.7190256118774414, "logits/rejected": -2.675426483154297, "logps/chosen": -266.8947448730469, "logps/rejected": -373.1979675292969, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -3.4514358043670654, "rewards/margins": 9.321605682373047, "rewards/rejected": -12.773040771484375, "step": 5450 }, { "epoch": 2.857142857142857, "grad_norm": 0.8779076390877316, "learning_rate": 3.4467004652442842e-09, "logits/chosen": -2.715454578399658, "logits/rejected": -2.647891044616699, "logps/chosen": -241.4197235107422, "logps/rejected": -336.53497314453125, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -2.9433608055114746, "rewards/margins": 9.216439247131348, "rewards/rejected": -12.159799575805664, "step": 5460 }, { "epoch": 2.8623757195185764, "grad_norm": 9.797299951299939, "learning_rate": 3.1993496960127653e-09, "logits/chosen": -2.8034377098083496, "logits/rejected": -2.750270366668701, "logps/chosen": -258.7193298339844, "logps/rejected": -329.56182861328125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -3.486586332321167, "rewards/margins": 9.221905708312988, "rewards/rejected": -12.708492279052734, "step": 5470 }, { "epoch": 2.867608581894296, "grad_norm": 20.09027444807325, "learning_rate": 2.9611508769606663e-09, "logits/chosen": -2.878671884536743, "logits/rejected": -2.901338577270508, "logps/chosen": -328.79840087890625, "logps/rejected": -388.0645446777344, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -3.583606719970703, "rewards/margins": 8.39130973815918, "rewards/rejected": -11.974916458129883, "step": 5480 }, { "epoch": 2.8728414442700156, "grad_norm": 3.0848783600899283, "learning_rate": 2.7321128410620344e-09, "logits/chosen": -2.7204818725585938, "logits/rejected": -2.5747714042663574, "logps/chosen": -263.08721923828125, "logps/rejected": -320.9984436035156, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -3.559948682785034, "rewards/margins": 8.933120727539062, "rewards/rejected": -12.493070602416992, "step": 5490 }, { "epoch": 2.8780743066457353, "grad_norm": 4.7464605686416474, "learning_rate": 2.5122440815873724e-09, "logits/chosen": -2.8075950145721436, "logits/rejected": -2.690380334854126, "logps/chosen": -354.9125061035156, "logps/rejected": -345.68389892578125, "loss": 0.0095, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.390374183654785, "rewards/margins": 10.080846786499023, "rewards/rejected": -12.471220016479492, "step": 5500 }, { "epoch": 2.883307169021455, "grad_norm": 1.793870559086304, "learning_rate": 2.301552751788838e-09, "logits/chosen": -2.6871190071105957, "logits/rejected": -2.752732753753662, "logps/chosen": -294.37451171875, "logps/rejected": -427.0953674316406, "loss": 0.013, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.813380241394043, "rewards/margins": 9.422821998596191, "rewards/rejected": -12.23620319366455, "step": 5510 }, { "epoch": 2.8885400313971745, "grad_norm": 17.676191723233796, "learning_rate": 2.1000466645978433e-09, "logits/chosen": -2.881307601928711, "logits/rejected": -2.8383331298828125, "logps/chosen": -245.75973510742188, "logps/rejected": -316.96099853515625, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -3.0730700492858887, "rewards/margins": 8.630271911621094, "rewards/rejected": -11.70334243774414, "step": 5520 }, { "epoch": 2.8937728937728937, "grad_norm": 5.529880001116928, "learning_rate": 1.9077332923353728e-09, "logits/chosen": -2.7960972785949707, "logits/rejected": -2.7708797454833984, "logps/chosen": -338.330322265625, "logps/rejected": -409.7406311035156, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -3.0374903678894043, "rewards/margins": 9.465030670166016, "rewards/rejected": -12.502520561218262, "step": 5530 }, { "epoch": 2.8990057561486133, "grad_norm": 16.38786627266072, "learning_rate": 1.7246197664347872e-09, "logits/chosen": -2.927804708480835, "logits/rejected": -2.8698925971984863, "logps/chosen": -327.8984375, "logps/rejected": -490.6431579589844, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -2.7280220985412598, "rewards/margins": 9.227019309997559, "rewards/rejected": -11.955041885375977, "step": 5540 }, { "epoch": 2.904238618524333, "grad_norm": 8.250248744378784, "learning_rate": 1.5507128771775346e-09, "logits/chosen": -2.738314151763916, "logits/rejected": -2.681565999984741, "logps/chosen": -301.24420166015625, "logps/rejected": -391.94708251953125, "loss": 0.0118, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.863482713699341, "rewards/margins": 9.247380256652832, "rewards/rejected": -13.110864639282227, "step": 5550 }, { "epoch": 2.909471480900052, "grad_norm": 3.5914370782109675, "learning_rate": 1.3860190734411858e-09, "logits/chosen": -2.8422555923461914, "logits/rejected": -2.746349811553955, "logps/chosen": -339.1573791503906, "logps/rejected": -415.498046875, "loss": 0.0156, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.85978364944458, "rewards/margins": 8.906829833984375, "rewards/rejected": -11.766613960266113, "step": 5560 }, { "epoch": 2.9147043432757718, "grad_norm": 8.527268687759902, "learning_rate": 1.2305444624604034e-09, "logits/chosen": -2.8913283348083496, "logits/rejected": -2.8799333572387695, "logps/chosen": -327.8100280761719, "logps/rejected": -404.73565673828125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.9308582544326782, "rewards/margins": 10.052740097045898, "rewards/rejected": -11.983599662780762, "step": 5570 }, { "epoch": 2.9199372056514914, "grad_norm": 1.3327517832949631, "learning_rate": 1.0842948096004835e-09, "logits/chosen": -2.8032429218292236, "logits/rejected": -2.7707908153533936, "logps/chosen": -279.34405517578125, "logps/rejected": -386.6402282714844, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -3.4768385887145996, "rewards/margins": 9.316797256469727, "rewards/rejected": -12.793634414672852, "step": 5580 }, { "epoch": 2.925170068027211, "grad_norm": 5.175567488948795, "learning_rate": 9.472755381434161e-10, "logits/chosen": -2.7950799465179443, "logits/rejected": -2.6412720680236816, "logps/chosen": -319.254638671875, "logps/rejected": -319.4075012207031, "loss": 0.0131, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0162642002105713, "rewards/margins": 9.741750717163086, "rewards/rejected": -12.758016586303711, "step": 5590 }, { "epoch": 2.9304029304029307, "grad_norm": 1.3267352579365341, "learning_rate": 8.194917290869907e-10, "logits/chosen": -2.832944393157959, "logits/rejected": -2.7671172618865967, "logps/chosen": -330.9451904296875, "logps/rejected": -394.5552062988281, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -2.670982837677002, "rewards/margins": 10.517029762268066, "rewards/rejected": -13.188011169433594, "step": 5600 }, { "epoch": 2.93563579277865, "grad_norm": 3.4316641224167936, "learning_rate": 7.009481209561685e-10, "logits/chosen": -2.8592875003814697, "logits/rejected": -2.814638376235962, "logps/chosen": -254.1782989501953, "logps/rejected": -382.9766845703125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.3267829418182373, "rewards/margins": 9.821563720703125, "rewards/rejected": -13.148347854614258, "step": 5610 }, { "epoch": 2.9408686551543695, "grad_norm": 0.5274414311742615, "learning_rate": 5.916491096275845e-10, "logits/chosen": -2.878842830657959, "logits/rejected": -2.8567473888397217, "logps/chosen": -317.9480895996094, "logps/rejected": -411.6915588378906, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -3.1916182041168213, "rewards/margins": 10.309640884399414, "rewards/rejected": -13.501260757446289, "step": 5620 }, { "epoch": 2.946101517530089, "grad_norm": 1.0873113714544391, "learning_rate": 4.915987481662887e-10, "logits/chosen": -2.7267587184906006, "logits/rejected": -2.6820521354675293, "logps/chosen": -257.244140625, "logps/rejected": -349.4520263671875, "loss": 0.0063, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.4769668579101562, "rewards/margins": 9.612253189086914, "rewards/rejected": -13.089221000671387, "step": 5630 }, { "epoch": 2.9513343799058083, "grad_norm": 1.733984676138519, "learning_rate": 4.0080074667570017e-10, "logits/chosen": -2.828990936279297, "logits/rejected": -2.750608444213867, "logps/chosen": -266.97711181640625, "logps/rejected": -423.57177734375, "loss": 0.0123, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.892110824584961, "rewards/margins": 10.241233825683594, "rewards/rejected": -13.133343696594238, "step": 5640 }, { "epoch": 2.956567242281528, "grad_norm": 1.1254085803227347, "learning_rate": 3.1925847215980017e-10, "logits/chosen": -2.874785900115967, "logits/rejected": -2.776005983352661, "logps/chosen": -299.16448974609375, "logps/rejected": -388.7731628417969, "loss": 0.0116, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.175328254699707, "rewards/margins": 9.321737289428711, "rewards/rejected": -12.497064590454102, "step": 5650 }, { "epoch": 2.9618001046572475, "grad_norm": 13.302242250220985, "learning_rate": 2.469749483985095e-10, "logits/chosen": -2.806129217147827, "logits/rejected": -2.7365965843200684, "logps/chosen": -292.08404541015625, "logps/rejected": -383.8038635253906, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -3.2635293006896973, "rewards/margins": 9.491117477416992, "rewards/rejected": -12.754648208618164, "step": 5660 }, { "epoch": 2.967032967032967, "grad_norm": 1.8196932333743627, "learning_rate": 1.8395285583530652e-10, "logits/chosen": -2.7935338020324707, "logits/rejected": -2.735308885574341, "logps/chosen": -317.51763916015625, "logps/rejected": -364.37823486328125, "loss": 0.0115, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.067131519317627, "rewards/margins": 9.694609642028809, "rewards/rejected": -12.761739730834961, "step": 5670 }, { "epoch": 2.9722658294086868, "grad_norm": 0.49677788304627546, "learning_rate": 1.3019453147805614e-10, "logits/chosen": -2.8335657119750977, "logits/rejected": -2.7130446434020996, "logps/chosen": -306.3954772949219, "logps/rejected": -406.932373046875, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -3.9488143920898438, "rewards/margins": 10.798115730285645, "rewards/rejected": -14.746930122375488, "step": 5680 }, { "epoch": 2.977498691784406, "grad_norm": 2.3716164342044324, "learning_rate": 8.570196881216297e-11, "logits/chosen": -2.6046571731567383, "logits/rejected": -2.628037691116333, "logps/chosen": -248.31570434570312, "logps/rejected": -374.7004089355469, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -2.5297515392303467, "rewards/margins": 10.090364456176758, "rewards/rejected": -12.620115280151367, "step": 5690 }, { "epoch": 2.9827315541601256, "grad_norm": 3.086099611382726, "learning_rate": 5.0476817726852194e-11, "logits/chosen": -2.780238628387451, "logits/rejected": -2.820456027984619, "logps/chosen": -335.79449462890625, "logps/rejected": -448.4767150878906, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -3.3475914001464844, "rewards/margins": 9.953985214233398, "rewards/rejected": -13.3015775680542, "step": 5700 }, { "epoch": 2.987964416535845, "grad_norm": 2.827267397402498, "learning_rate": 2.4520384453746712e-11, "logits/chosen": -2.717097043991089, "logits/rejected": -2.707285165786743, "logps/chosen": -333.8241271972656, "logps/rejected": -438.07757568359375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -3.721733570098877, "rewards/margins": 9.45947265625, "rewards/rejected": -13.181207656860352, "step": 5710 }, { "epoch": 2.9931972789115644, "grad_norm": 0.9277276750403733, "learning_rate": 7.833631518627815e-12, "logits/chosen": -2.697326421737671, "logits/rejected": -2.6795403957366943, "logps/chosen": -286.40985107421875, "logps/rejected": -389.8377990722656, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -3.0191245079040527, "rewards/margins": 10.504659652709961, "rewards/rejected": -13.523783683776855, "step": 5720 }, { "epoch": 2.998430141287284, "grad_norm": 8.99264071454252, "learning_rate": 4.1717770565830033e-13, "logits/chosen": -2.862884521484375, "logits/rejected": -2.812479019165039, "logps/chosen": -300.78485107421875, "logps/rejected": -326.2219543457031, "loss": 0.017, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0574347972869873, "rewards/margins": 8.262727737426758, "rewards/rejected": -11.320161819458008, "step": 5730 }, { "epoch": 3.0, "step": 5733, "total_flos": 0.0, "train_loss": 0.22357279954321543, "train_runtime": 32339.7878, "train_samples_per_second": 5.671, "train_steps_per_second": 0.177 } ], "logging_steps": 10, "max_steps": 5733, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }