{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 100, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.400390625, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.2547454833984375, "logits/rejected": -2.401865005493164, "logps/chosen": -53.759212493896484, "logps/rejected": -48.83185958862305, "loss": 0.6931, "pred_label": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "use_label": 10.0 }, { "epoch": 0.02, "grad_norm": 0.4609375, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.2421748638153076, "logits/rejected": -2.2769579887390137, "logps/chosen": -51.987098693847656, "logps/rejected": -64.96717071533203, "loss": 0.6929, "pred_label": 0.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.0019227324519306421, "rewards/margins": 0.0004911368596367538, "rewards/rejected": 0.0014315954176709056, "step": 10, "use_label": 90.0 }, { "epoch": 0.04, "grad_norm": 0.39453125, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -2.2521612644195557, "logits/rejected": -2.255767822265625, "logps/chosen": -62.4937629699707, "logps/rejected": -72.63874816894531, "loss": 0.6919, "pred_label": 0.0, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": 0.01600126549601555, "rewards/margins": 0.0011427802965044975, "rewards/rejected": 0.0148584870621562, "step": 20, "use_label": 242.0 }, { "epoch": 0.06, "grad_norm": 0.51171875, "learning_rate": 3.125e-06, "logits/chosen": -2.3423426151275635, "logits/rejected": -2.3549609184265137, "logps/chosen": -79.10475158691406, "logps/rejected": -98.8157958984375, "loss": 0.6897, "pred_label": 0.0, "rewards/accuracies": 0.28125, "rewards/chosen": 0.03137165680527687, "rewards/margins": 0.0032712810207158327, "rewards/rejected": 0.028100375086069107, "step": 30, "use_label": 402.0 }, { "epoch": 0.08, "grad_norm": 0.51953125, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.323338270187378, "logits/rejected": -2.3015079498291016, "logps/chosen": -82.85453796386719, "logps/rejected": -82.39984893798828, "loss": 0.6866, "pred_label": 0.0, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": 0.03337595611810684, "rewards/margins": 0.011919925920665264, "rewards/rejected": 0.021456023678183556, "step": 40, "use_label": 562.0 }, { "epoch": 0.1, "grad_norm": 0.671875, "learning_rate": 4.999731868769027e-06, "logits/chosen": -2.2404515743255615, "logits/rejected": -2.262972354888916, "logps/chosen": -67.89888000488281, "logps/rejected": -81.8695068359375, "loss": 0.6805, "pred_label": 0.0, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.009319942444562912, "rewards/margins": 0.030618786811828613, "rewards/rejected": -0.0212988443672657, "step": 50, "use_label": 722.0 }, { "epoch": 0.13, "grad_norm": 1.0234375, "learning_rate": 4.9903533134293035e-06, "logits/chosen": -2.2157275676727295, "logits/rejected": -2.155928134918213, "logps/chosen": -63.64031982421875, "logps/rejected": -73.28236389160156, "loss": 0.6752, "pred_label": 0.0, "rewards/accuracies": 0.3062500059604645, "rewards/chosen": -0.03914070501923561, "rewards/margins": 0.04399287328124046, "rewards/rejected": -0.08313358575105667, "step": 60, "use_label": 882.0 }, { "epoch": 0.15, "grad_norm": 0.859375, "learning_rate": 4.967625656594782e-06, "logits/chosen": -2.114478588104248, "logits/rejected": -2.1126065254211426, "logps/chosen": -70.76527404785156, "logps/rejected": -83.94652557373047, "loss": 0.6712, "pred_label": 0.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.15054164826869965, "rewards/margins": 0.030909737572073936, "rewards/rejected": -0.18145139515399933, "step": 70, "use_label": 1042.0 }, { "epoch": 0.17, "grad_norm": 1.1640625, "learning_rate": 4.93167072587771e-06, "logits/chosen": -2.2166943550109863, "logits/rejected": -2.1609182357788086, "logps/chosen": -54.8065185546875, "logps/rejected": -69.45613861083984, "loss": 0.6589, "pred_label": 0.4749999940395355, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": -0.06275613605976105, "rewards/margins": 0.10003063827753067, "rewards/rejected": -0.16278676688671112, "step": 80, "use_label": 1201.5250244140625 }, { "epoch": 0.19, "grad_norm": 1.8125, "learning_rate": 4.882681251368549e-06, "logits/chosen": -1.9692049026489258, "logits/rejected": -1.9792039394378662, "logps/chosen": -76.60871887207031, "logps/rejected": -96.53330993652344, "loss": 0.6564, "pred_label": 2.0999999046325684, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.18226662278175354, "rewards/margins": 0.09542477130889893, "rewards/rejected": -0.27769142389297485, "step": 90, "use_label": 1359.9000244140625 }, { "epoch": 0.21, "grad_norm": 2.171875, "learning_rate": 4.8209198325401815e-06, "logits/chosen": -1.9027693271636963, "logits/rejected": -1.8775581121444702, "logps/chosen": -92.94733428955078, "logps/rejected": -84.73824310302734, "loss": 0.6531, "pred_label": 4.0, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.12917451560497284, "rewards/margins": 0.07954015582799911, "rewards/rejected": -0.20871467888355255, "step": 100, "use_label": 1518.0 }, { "epoch": 0.21, "eval_logits/chosen": -1.7353737354278564, "eval_logits/rejected": -1.7198325395584106, "eval_logps/chosen": -80.33845520019531, "eval_logps/rejected": -106.64702606201172, "eval_loss": 0.6527961492538452, "eval_pred_label": 6.6875, "eval_rewards/accuracies": 0.36328125, "eval_rewards/chosen": -0.1642620712518692, "eval_rewards/margins": 0.13027876615524292, "eval_rewards/rejected": -0.2945408225059509, "eval_runtime": 125.2319, "eval_samples_per_second": 15.97, "eval_steps_per_second": 0.256, "eval_use_label": 1725.3125, "step": 100 }, { "epoch": 0.23, "grad_norm": 2.0, "learning_rate": 4.746717530629565e-06, "logits/chosen": -1.7974278926849365, "logits/rejected": -1.7697474956512451, "logps/chosen": -89.79286193847656, "logps/rejected": -113.6241455078125, "loss": 0.6479, "pred_label": 9.199999809265137, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.18692079186439514, "rewards/margins": 0.16341358423233032, "rewards/rejected": -0.3503343462944031, "step": 110, "use_label": 1928.800048828125 }, { "epoch": 0.25, "grad_norm": 2.890625, "learning_rate": 4.660472094042121e-06, "logits/chosen": -1.454304814338684, "logits/rejected": -1.3457725048065186, "logps/chosen": -109.3675537109375, "logps/rejected": -133.90725708007812, "loss": 0.6432, "pred_label": 14.949999809265137, "rewards/accuracies": 0.34375, "rewards/chosen": -0.3942197263240814, "rewards/margins": 0.21566259860992432, "rewards/rejected": -0.6098822951316833, "step": 120, "use_label": 2083.050048828125 }, { "epoch": 0.27, "grad_norm": 2.5625, "learning_rate": 4.5626458262912745e-06, "logits/chosen": -1.0859026908874512, "logits/rejected": -1.0426993370056152, "logps/chosen": -112.0394515991211, "logps/rejected": -139.61097717285156, "loss": 0.6391, "pred_label": 21.049999237060547, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.4626106321811676, "rewards/margins": 0.20503444969654083, "rewards/rejected": -0.6676451563835144, "step": 130, "use_label": 2236.949951171875 }, { "epoch": 0.29, "grad_norm": 2.484375, "learning_rate": 4.453763107901676e-06, "logits/chosen": -0.735418975353241, "logits/rejected": -0.8380192518234253, "logps/chosen": -138.07081604003906, "logps/rejected": -150.91665649414062, "loss": 0.6252, "pred_label": 31.399999618530273, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.5732325315475464, "rewards/margins": 0.1448771208524704, "rewards/rejected": -0.7181096076965332, "step": 140, "use_label": 2386.60009765625 }, { "epoch": 0.31, "grad_norm": 3.859375, "learning_rate": 4.33440758555951e-06, "logits/chosen": -0.48231878876686096, "logits/rejected": -0.43882569670677185, "logps/chosen": -117.69664001464844, "logps/rejected": -150.86083984375, "loss": 0.6219, "pred_label": 43.45000076293945, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.5254992246627808, "rewards/margins": 0.3047201633453369, "rewards/rejected": -0.8302194476127625, "step": 150, "use_label": 2534.550048828125 }, { "epoch": 0.33, "grad_norm": 2.890625, "learning_rate": 4.205219043576955e-06, "logits/chosen": -0.15186011791229248, "logits/rejected": -0.17336201667785645, "logps/chosen": -128.78500366210938, "logps/rejected": -159.26498413085938, "loss": 0.5982, "pred_label": 58.25, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.6445494294166565, "rewards/margins": 0.17397476732730865, "rewards/rejected": -0.818524181842804, "step": 160, "use_label": 2679.75 }, { "epoch": 0.36, "grad_norm": 3.328125, "learning_rate": 4.066889974440757e-06, "logits/chosen": 0.14322622120380402, "logits/rejected": 0.18100713193416595, "logps/chosen": -108.39127349853516, "logps/rejected": -140.55824279785156, "loss": 0.5938, "pred_label": 79.57499694824219, "rewards/accuracies": 0.3062500059604645, "rewards/chosen": -0.5288320779800415, "rewards/margins": 0.23454061150550842, "rewards/rejected": -0.7633727192878723, "step": 170, "use_label": 2818.425048828125 }, { "epoch": 0.38, "grad_norm": 3.0, "learning_rate": 3.92016186682789e-06, "logits/chosen": -0.20601686835289001, "logits/rejected": -0.09364790469408035, "logps/chosen": -105.94217681884766, "logps/rejected": -130.695556640625, "loss": 0.6262, "pred_label": 100.2750015258789, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.4315454065799713, "rewards/margins": 0.3125666677951813, "rewards/rejected": -0.7441121339797974, "step": 180, "use_label": 2957.72509765625 }, { "epoch": 0.4, "grad_norm": 2.734375, "learning_rate": 3.7658212309857576e-06, "logits/chosen": -0.34412023425102234, "logits/rejected": -0.07299783080816269, "logps/chosen": -107.5626449584961, "logps/rejected": -141.1322479248047, "loss": 0.6092, "pred_label": 121.05000305175781, "rewards/accuracies": 0.34375, "rewards/chosen": -0.48451024293899536, "rewards/margins": 0.28280580043792725, "rewards/rejected": -0.7673160433769226, "step": 190, "use_label": 3096.949951171875 }, { "epoch": 0.42, "grad_norm": 6.5, "learning_rate": 3.604695382782159e-06, "logits/chosen": 0.03128425031900406, "logits/rejected": 0.20205454528331757, "logps/chosen": -145.35342407226562, "logps/rejected": -162.05667114257812, "loss": 0.6041, "pred_label": 135.89999389648438, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.6367710828781128, "rewards/margins": 0.25234952569007874, "rewards/rejected": -0.8891205787658691, "step": 200, "use_label": 3242.10009765625 }, { "epoch": 0.42, "eval_logits/chosen": 0.886444091796875, "eval_logits/rejected": 0.9784458875656128, "eval_logps/chosen": -135.34742736816406, "eval_logps/rejected": -187.65963745117188, "eval_loss": 0.5936154723167419, "eval_pred_label": 167.40625, "eval_rewards/accuracies": 0.3515625, "eval_rewards/chosen": -0.7143516540527344, "eval_rewards/margins": 0.3903152644634247, "eval_rewards/rejected": -1.1046667098999023, "eval_runtime": 125.3006, "eval_samples_per_second": 15.962, "eval_steps_per_second": 0.255, "eval_use_label": 3420.59375, "step": 200 }, { "epoch": 0.44, "grad_norm": 3.796875, "learning_rate": 3.437648009023905e-06, "logits/chosen": 0.6729141473770142, "logits/rejected": 0.6579598188400269, "logps/chosen": -119.19351959228516, "logps/rejected": -159.00997924804688, "loss": 0.5936, "pred_label": 201.6999969482422, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.63218754529953, "rewards/margins": 0.3281194567680359, "rewards/rejected": -0.9603070020675659, "step": 210, "use_label": 3592.300048828125 }, { "epoch": 0.46, "grad_norm": 4.5, "learning_rate": 3.265574537815398e-06, "logits/chosen": 0.2854166626930237, "logits/rejected": 0.4488348066806793, "logps/chosen": -148.92379760742188, "logps/rejected": -161.19557189941406, "loss": 0.5938, "pred_label": 225.52499389648438, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6929510235786438, "rewards/margins": 0.2553243637084961, "rewards/rejected": -0.9482753872871399, "step": 220, "use_label": 3728.47509765625 }, { "epoch": 0.48, "grad_norm": 2.28125, "learning_rate": 3.089397338773569e-06, "logits/chosen": 0.00020002425299026072, "logits/rejected": 0.1493436098098755, "logps/chosen": -103.05213928222656, "logps/rejected": -136.05099487304688, "loss": 0.597, "pred_label": 252.02499389648438, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.3861756920814514, "rewards/margins": 0.3467464745044708, "rewards/rejected": -0.7329221963882446, "step": 230, "use_label": 3861.97509765625 }, { "epoch": 0.5, "grad_norm": 3.125, "learning_rate": 2.9100607788275547e-06, "logits/chosen": 0.49308425188064575, "logits/rejected": 0.44487372040748596, "logps/chosen": -109.46275329589844, "logps/rejected": -153.8666534423828, "loss": 0.584, "pred_label": 275.375, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.40430212020874023, "rewards/margins": 0.3921273946762085, "rewards/rejected": -0.7964295148849487, "step": 240, "use_label": 3998.625 }, { "epoch": 0.52, "grad_norm": 2.0625, "learning_rate": 2.72852616010567e-06, "logits/chosen": 0.3891890347003937, "logits/rejected": 0.47166162729263306, "logps/chosen": -122.5915298461914, "logps/rejected": -153.12493896484375, "loss": 0.5998, "pred_label": 301.875, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.4919242262840271, "rewards/margins": 0.3470008671283722, "rewards/rejected": -0.8389250636100769, "step": 250, "use_label": 4132.125 }, { "epoch": 0.54, "grad_norm": 2.171875, "learning_rate": 2.5457665670441937e-06, "logits/chosen": 0.4214790463447571, "logits/rejected": 0.4202333092689514, "logps/chosen": -116.09378814697266, "logps/rejected": -156.8458251953125, "loss": 0.592, "pred_label": 326.3500061035156, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.4998815953731537, "rewards/margins": 0.324097216129303, "rewards/rejected": -0.8239787817001343, "step": 260, "use_label": 4267.64990234375 }, { "epoch": 0.57, "grad_norm": 3.40625, "learning_rate": 2.3627616503391813e-06, "logits/chosen": 0.9609361886978149, "logits/rejected": 0.8760908246040344, "logps/chosen": -142.81573486328125, "logps/rejected": -170.10379028320312, "loss": 0.5888, "pred_label": 343.04998779296875, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.698999285697937, "rewards/margins": 0.30828553438186646, "rewards/rejected": -1.0072848796844482, "step": 270, "use_label": 4410.9501953125 }, { "epoch": 0.59, "grad_norm": 2.234375, "learning_rate": 2.1804923757009885e-06, "logits/chosen": 1.1657536029815674, "logits/rejected": 1.3259608745574951, "logps/chosen": -131.4703826904297, "logps/rejected": -156.4979248046875, "loss": 0.6007, "pred_label": 361.5, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.6596485376358032, "rewards/margins": 0.27613458037376404, "rewards/rejected": -0.9357832074165344, "step": 280, "use_label": 4552.5 }, { "epoch": 0.61, "grad_norm": 2.875, "learning_rate": 1.9999357655598894e-06, "logits/chosen": 0.9594011306762695, "logits/rejected": 0.9126796722412109, "logps/chosen": -144.55104064941406, "logps/rejected": -183.51065063476562, "loss": 0.5899, "pred_label": 386.07501220703125, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.7800258994102478, "rewards/margins": 0.2914672791957855, "rewards/rejected": -1.0714932680130005, "step": 290, "use_label": 4687.9248046875 }, { "epoch": 0.63, "grad_norm": 3.1875, "learning_rate": 1.8220596619089576e-06, "logits/chosen": 1.2753574848175049, "logits/rejected": 1.1057071685791016, "logps/chosen": -165.4674072265625, "logps/rejected": -223.6466064453125, "loss": 0.5763, "pred_label": 409.9750061035156, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.8787307739257812, "rewards/margins": 0.41653138399124146, "rewards/rejected": -1.295262098312378, "step": 300, "use_label": 4824.02490234375 }, { "epoch": 0.63, "eval_logits/chosen": 1.6598718166351318, "eval_logits/rejected": 1.7526323795318604, "eval_logps/chosen": -143.2136993408203, "eval_logps/rejected": -200.36146545410156, "eval_loss": 0.5773172974586487, "eval_pred_label": 452.71875, "eval_rewards/accuracies": 0.3515625, "eval_rewards/chosen": -0.7930145263671875, "eval_rewards/margins": 0.4386705756187439, "eval_rewards/rejected": -1.2316851615905762, "eval_runtime": 125.3512, "eval_samples_per_second": 15.955, "eval_steps_per_second": 0.255, "eval_use_label": 4991.28125, "step": 300 }, { "epoch": 0.65, "grad_norm": 3.0625, "learning_rate": 1.647817538357072e-06, "logits/chosen": 1.3793504238128662, "logits/rejected": 1.4072078466415405, "logps/chosen": -126.9173583984375, "logps/rejected": -186.46255493164062, "loss": 0.5633, "pred_label": 494.42498779296875, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.7132076025009155, "rewards/margins": 0.4689141809940338, "rewards/rejected": -1.1821218729019165, "step": 310, "use_label": 5155.5751953125 }, { "epoch": 0.67, "grad_norm": 4.21875, "learning_rate": 1.4781433892011132e-06, "logits/chosen": 1.2615296840667725, "logits/rejected": 1.4717950820922852, "logps/chosen": -163.67529296875, "logps/rejected": -205.421142578125, "loss": 0.5761, "pred_label": 523.5250244140625, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.9060484766960144, "rewards/margins": 0.4762052893638611, "rewards/rejected": -1.382253885269165, "step": 320, "use_label": 5286.47509765625 }, { "epoch": 0.69, "grad_norm": 2.359375, "learning_rate": 1.3139467229135999e-06, "logits/chosen": 1.4169238805770874, "logits/rejected": 1.4296729564666748, "logps/chosen": -150.2149200439453, "logps/rejected": -186.73570251464844, "loss": 0.5799, "pred_label": 550.125, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.8010675311088562, "rewards/margins": 0.3802093267440796, "rewards/rejected": -1.1812770366668701, "step": 330, "use_label": 5419.875 }, { "epoch": 0.71, "grad_norm": 3.15625, "learning_rate": 1.1561076868822756e-06, "logits/chosen": 0.9984269142150879, "logits/rejected": 0.9373771548271179, "logps/chosen": -161.85842895507812, "logps/rejected": -182.74703979492188, "loss": 0.5933, "pred_label": 567.2000122070312, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.771192193031311, "rewards/margins": 0.2911759614944458, "rewards/rejected": -1.0623681545257568, "step": 340, "use_label": 5562.7998046875 }, { "epoch": 0.73, "grad_norm": 2.59375, "learning_rate": 1.0054723495346484e-06, "logits/chosen": 0.83796626329422, "logits/rejected": 0.8520887494087219, "logps/chosen": -176.03054809570312, "logps/rejected": -217.10214233398438, "loss": 0.5863, "pred_label": 598.5750122070312, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.86613929271698, "rewards/margins": 0.4522012174129486, "rewards/rejected": -1.318340539932251, "step": 350, "use_label": 5691.4248046875 }, { "epoch": 0.75, "grad_norm": 2.234375, "learning_rate": 8.628481651367876e-07, "logits/chosen": 0.7010875940322876, "logits/rejected": 0.8413160443305969, "logps/chosen": -126.9655532836914, "logps/rejected": -182.5807342529297, "loss": 0.5885, "pred_label": 629.0, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.6332792043685913, "rewards/margins": 0.47024598717689514, "rewards/rejected": -1.103525161743164, "step": 360, "use_label": 5821.0 }, { "epoch": 0.77, "grad_norm": 2.4375, "learning_rate": 7.289996455765749e-07, "logits/chosen": 0.8454801440238953, "logits/rejected": 0.9659041166305542, "logps/chosen": -120.26502990722656, "logps/rejected": -170.44923400878906, "loss": 0.585, "pred_label": 655.625, "rewards/accuracies": 0.34375, "rewards/chosen": -0.5483022928237915, "rewards/margins": 0.4770358204841614, "rewards/rejected": -1.0253381729125977, "step": 370, "use_label": 5954.375 }, { "epoch": 0.8, "grad_norm": 3.5625, "learning_rate": 6.046442623320145e-07, "logits/chosen": 0.7346574664115906, "logits/rejected": 0.7028430104255676, "logps/chosen": -131.0785675048828, "logps/rejected": -188.57435607910156, "loss": 0.589, "pred_label": 685.5250244140625, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.6524317264556885, "rewards/margins": 0.3696710765361786, "rewards/rejected": -1.0221028327941895, "step": 380, "use_label": 6084.47509765625 }, { "epoch": 0.82, "grad_norm": 3.3125, "learning_rate": 4.904486005914027e-07, "logits/chosen": 1.1143369674682617, "logits/rejected": 0.8643951416015625, "logps/chosen": -179.11276245117188, "logps/rejected": -220.11068725585938, "loss": 0.5727, "pred_label": 717.625, "rewards/accuracies": 0.375, "rewards/chosen": -0.8629444241523743, "rewards/margins": 0.508463442325592, "rewards/rejected": -1.3714077472686768, "step": 390, "use_label": 6212.375 }, { "epoch": 0.84, "grad_norm": 3.25, "learning_rate": 3.8702478614051353e-07, "logits/chosen": 0.8043449521064758, "logits/rejected": 0.9917415380477905, "logps/chosen": -130.07017517089844, "logps/rejected": -163.469970703125, "loss": 0.5836, "pred_label": 747.5750122070312, "rewards/accuracies": 0.375, "rewards/chosen": -0.5757918953895569, "rewards/margins": 0.42427974939346313, "rewards/rejected": -1.0000715255737305, "step": 400, "use_label": 6342.4248046875 }, { "epoch": 0.84, "eval_logits/chosen": 1.75760817527771, "eval_logits/rejected": 1.8499951362609863, "eval_logps/chosen": -130.3719482421875, "eval_logps/rejected": -190.7267303466797, "eval_loss": 0.5768851041793823, "eval_pred_label": 782.8125, "eval_rewards/accuracies": 0.37109375, "eval_rewards/chosen": -0.6645968556404114, "eval_rewards/margins": 0.4707409739494324, "eval_rewards/rejected": -1.1353378295898438, "eval_runtime": 147.391, "eval_samples_per_second": 13.569, "eval_steps_per_second": 0.217, "eval_use_label": 6517.1875, "step": 400 }, { "epoch": 0.86, "grad_norm": 3.390625, "learning_rate": 2.9492720416985004e-07, "logits/chosen": 1.1002473831176758, "logits/rejected": 1.1428117752075195, "logps/chosen": -126.85247802734375, "logps/rejected": -170.77365112304688, "loss": 0.5838, "pred_label": 822.75, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.6542948484420776, "rewards/margins": 0.4562492370605469, "rewards/rejected": -1.1105440855026245, "step": 410, "use_label": 6683.25 }, { "epoch": 0.88, "grad_norm": 2.609375, "learning_rate": 2.1464952759020857e-07, "logits/chosen": 1.3246395587921143, "logits/rejected": 1.2824434041976929, "logps/chosen": -122.80003356933594, "logps/rejected": -138.56423950195312, "loss": 0.5822, "pred_label": 846.4249877929688, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": -0.6186091303825378, "rewards/margins": 0.25320303440093994, "rewards/rejected": -0.8718121647834778, "step": 420, "use_label": 6819.5751953125 }, { "epoch": 0.9, "grad_norm": 2.09375, "learning_rate": 1.4662207078575685e-07, "logits/chosen": 1.270193099975586, "logits/rejected": 1.253873348236084, "logps/chosen": -171.46336364746094, "logps/rejected": -207.75607299804688, "loss": 0.564, "pred_label": 873.5499877929688, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.7219651341438293, "rewards/margins": 0.5620242357254028, "rewards/rejected": -1.283989429473877, "step": 430, "use_label": 6952.4501953125 }, { "epoch": 0.92, "grad_norm": 2.578125, "learning_rate": 9.120948298936422e-08, "logits/chosen": 1.221411943435669, "logits/rejected": 1.397247552871704, "logps/chosen": -136.4575653076172, "logps/rejected": -193.40870666503906, "loss": 0.5736, "pred_label": 905.5750122070312, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.6955360770225525, "rewards/margins": 0.4879213869571686, "rewards/rejected": -1.183457612991333, "step": 440, "use_label": 7080.4248046875 }, { "epoch": 0.94, "grad_norm": 4.78125, "learning_rate": 4.870879364444109e-08, "logits/chosen": 1.6054052114486694, "logits/rejected": 1.3484258651733398, "logps/chosen": -148.17161560058594, "logps/rejected": -205.789306640625, "loss": 0.583, "pred_label": 930.4749755859375, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.7591380476951599, "rewards/margins": 0.4158584177494049, "rewards/rejected": -1.1749964952468872, "step": 450, "use_label": 7215.52490234375 }, { "epoch": 0.96, "grad_norm": 2.875, "learning_rate": 1.93478202307823e-08, "logits/chosen": 1.4640157222747803, "logits/rejected": 1.4903802871704102, "logps/chosen": -96.6323471069336, "logps/rejected": -150.8868865966797, "loss": 0.5814, "pred_label": 961.25, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.5051247477531433, "rewards/margins": 0.3702928125858307, "rewards/rejected": -0.8754175901412964, "step": 460, "use_label": 7344.75 }, { "epoch": 0.98, "grad_norm": 2.75, "learning_rate": 3.283947088983663e-09, "logits/chosen": 1.464422345161438, "logits/rejected": 1.2297132015228271, "logps/chosen": -130.30838012695312, "logps/rejected": -166.67605590820312, "loss": 0.5822, "pred_label": 982.8499755859375, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.6297920346260071, "rewards/margins": 0.34639838337898254, "rewards/rejected": -0.9761903882026672, "step": 470, "use_label": 7483.14990234375 }, { "epoch": 1.0, "step": 477, "total_flos": 0.0, "train_loss": 0.6110695428068533, "train_runtime": 9999.3279, "train_samples_per_second": 6.114, "train_steps_per_second": 0.048 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }