{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.662871673454485, "learning_rate": 1.3054830287206268e-08, "logits/chosen": -2.756725311279297, "logits/rejected": -2.8459553718566895, "logps/chosen": -183.9856719970703, "logps/rejected": -240.6292724609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 1.8534468064229688, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.8642470836639404, "logits/rejected": -2.739949941635132, "logps/chosen": -287.012451171875, "logps/rejected": -190.21405029296875, "loss": 0.6932, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": -3.983284841524437e-05, "rewards/margins": 2.287545612489339e-05, "rewards/margins_max": 0.0017433927860110998, "rewards/margins_min": -0.0018777759978547692, "rewards/margins_std": 0.0016894970322027802, "rewards/rejected": -6.2708328187e-05, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.964624168912796, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.9036612510681152, "logits/rejected": -2.835484027862549, "logps/chosen": -350.17779541015625, "logps/rejected": -270.02374267578125, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 8.417033677687868e-05, "rewards/margins": 0.0004610268515534699, "rewards/margins_max": 0.003191339550539851, "rewards/margins_min": -0.0020411338191479445, "rewards/margins_std": 0.0023524309508502483, "rewards/rejected": -0.0003768565657082945, "step": 20 }, { "epoch": 0.01, "grad_norm": 2.186801471896851, "learning_rate": 3.9164490861618804e-07, "logits/chosen": -2.8327291011810303, "logits/rejected": -2.8474068641662598, "logps/chosen": -251.2904510498047, "logps/rejected": -251.9474334716797, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.000294469587970525, "rewards/margins": 0.0004858509637415409, "rewards/margins_max": 0.003673089202493429, "rewards/margins_min": -0.0028143501840531826, "rewards/margins_std": 0.002925784792751074, "rewards/rejected": -0.00019138141942676157, "step": 30 }, { "epoch": 0.01, "grad_norm": 2.139844988975431, "learning_rate": 5.221932114882506e-07, "logits/chosen": -2.803647518157959, "logits/rejected": -2.8051300048828125, "logps/chosen": -225.2307891845703, "logps/rejected": -243.1562957763672, "loss": 0.6927, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00033599851303733885, "rewards/margins": 0.0008187214843928814, "rewards/margins_max": 0.004225071519613266, "rewards/margins_min": -0.002697467803955078, "rewards/margins_std": 0.0030108396895229816, "rewards/rejected": -0.00048272297135554254, "step": 40 }, { "epoch": 0.01, "grad_norm": 2.037838155382851, "learning_rate": 6.527415143603135e-07, "logits/chosen": -2.955798864364624, "logits/rejected": -2.9171156883239746, "logps/chosen": -341.2575988769531, "logps/rejected": -307.1576843261719, "loss": 0.6929, "rewards/accuracies": 0.625, "rewards/chosen": 0.00023354284348897636, "rewards/margins": 0.0001884822704596445, "rewards/margins_max": 0.004373225849121809, "rewards/margins_min": -0.004892234690487385, "rewards/margins_std": 0.004206312354654074, "rewards/rejected": 4.5060645788908005e-05, "step": 50 }, { "epoch": 0.02, "grad_norm": 1.9425370359879095, "learning_rate": 7.832898172323761e-07, "logits/chosen": -2.7383322715759277, "logits/rejected": -2.6866185665130615, "logps/chosen": -247.27554321289062, "logps/rejected": -251.7075958251953, "loss": 0.6921, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.001414769678376615, "rewards/margins": 0.0020836442708969116, "rewards/margins_max": 0.006741675548255444, "rewards/margins_min": -0.0017913647461682558, "rewards/margins_std": 0.0040467288345098495, "rewards/rejected": -0.0006688747671432793, "step": 60 }, { "epoch": 0.02, "grad_norm": 1.632266377648927, "learning_rate": 9.138381201044387e-07, "logits/chosen": -2.8499655723571777, "logits/rejected": -2.8122572898864746, "logps/chosen": -260.5271911621094, "logps/rejected": -244.722900390625, "loss": 0.6921, "rewards/accuracies": 0.625, "rewards/chosen": 0.002009602263569832, "rewards/margins": 0.0028479404281824827, "rewards/margins_max": 0.011465233750641346, "rewards/margins_min": -0.00462332321330905, "rewards/margins_std": 0.007065328769385815, "rewards/rejected": -0.000838338048197329, "step": 70 }, { "epoch": 0.02, "grad_norm": 2.503501538019639, "learning_rate": 1.0443864229765013e-06, "logits/chosen": -2.7461657524108887, "logits/rejected": -2.7756741046905518, "logps/chosen": -296.9421081542969, "logps/rejected": -234.82177734375, "loss": 0.6918, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0019799198489636183, "rewards/margins": 0.0037876334972679615, "rewards/margins_max": 0.011122383177280426, "rewards/margins_min": -0.003949443809688091, "rewards/margins_std": 0.007009509019553661, "rewards/rejected": -0.0018077135318890214, "step": 80 }, { "epoch": 0.02, "grad_norm": 1.819898631972371, "learning_rate": 1.1749347258485642e-06, "logits/chosen": -2.9606547355651855, "logits/rejected": -2.964646816253662, "logps/chosen": -356.0566101074219, "logps/rejected": -324.49285888671875, "loss": 0.6908, "rewards/accuracies": 0.75, "rewards/chosen": 0.0036299112252891064, "rewards/margins": 0.007626961916685104, "rewards/margins_max": 0.01806723326444626, "rewards/margins_min": -0.0030308912973850965, "rewards/margins_std": 0.009542394429445267, "rewards/rejected": -0.003997050225734711, "step": 90 }, { "epoch": 0.03, "grad_norm": 2.151878680542638, "learning_rate": 1.305483028720627e-06, "logits/chosen": -2.7032153606414795, "logits/rejected": -2.6796319484710693, "logps/chosen": -298.008544921875, "logps/rejected": -227.78213500976562, "loss": 0.6893, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0027445759624242783, "rewards/margins": 0.007853394374251366, "rewards/margins_max": 0.02053624577820301, "rewards/margins_min": -0.005504676606506109, "rewards/margins_std": 0.011628041043877602, "rewards/rejected": -0.005108819343149662, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": -2.819959878921509, "eval_logits/rejected": -2.78220796585083, "eval_logps/chosen": -284.2243957519531, "eval_logps/rejected": -262.69573974609375, "eval_loss": 0.6896771192550659, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": 0.0026296868454664946, "eval_rewards/margins": 0.008158940821886063, "eval_rewards/margins_max": 0.03619640693068504, "eval_rewards/margins_min": -0.017027264460921288, "eval_rewards/margins_std": 0.017587358132004738, "eval_rewards/rejected": -0.005529253743588924, "eval_runtime": 224.9828, "eval_samples_per_second": 8.89, "eval_steps_per_second": 0.28, "step": 100 }, { "epoch": 0.03, "grad_norm": 2.1731178514847396, "learning_rate": 1.4360313315926894e-06, "logits/chosen": -2.82112455368042, "logits/rejected": -2.808168888092041, "logps/chosen": -266.538330078125, "logps/rejected": -264.3177795410156, "loss": 0.6896, "rewards/accuracies": 0.625, "rewards/chosen": 0.002133150352165103, "rewards/margins": 0.005869985092431307, "rewards/margins_max": 0.026839394122362137, "rewards/margins_min": -0.016449477523565292, "rewards/margins_std": 0.02000562474131584, "rewards/rejected": -0.0037368338089436293, "step": 110 }, { "epoch": 0.03, "grad_norm": 2.152859440830417, "learning_rate": 1.5665796344647521e-06, "logits/chosen": -2.813021183013916, "logits/rejected": -2.7906970977783203, "logps/chosen": -249.02392578125, "logps/rejected": -240.3574981689453, "loss": 0.6898, "rewards/accuracies": 0.75, "rewards/chosen": -5.239611346041784e-05, "rewards/margins": 0.007097178604453802, "rewards/margins_max": 0.027792206034064293, "rewards/margins_min": -0.00938634667545557, "rewards/margins_std": 0.01686152257025242, "rewards/rejected": -0.007149574812501669, "step": 120 }, { "epoch": 0.03, "grad_norm": 7.187064248849445, "learning_rate": 1.6971279373368146e-06, "logits/chosen": -2.7727582454681396, "logits/rejected": -2.7303881645202637, "logps/chosen": -279.2630310058594, "logps/rejected": -394.32427978515625, "loss": 0.6871, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.002511128317564726, "rewards/margins": 0.015176082029938698, "rewards/margins_max": 0.06492110341787338, "rewards/margins_min": -0.018980490043759346, "rewards/margins_std": 0.038812872022390366, "rewards/rejected": -0.01266495417803526, "step": 130 }, { "epoch": 0.04, "grad_norm": 1.8565638472727632, "learning_rate": 1.8276762402088774e-06, "logits/chosen": -2.7534339427948, "logits/rejected": -2.7199196815490723, "logps/chosen": -233.44082641601562, "logps/rejected": -218.2131805419922, "loss": 0.6876, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0075444066897034645, "rewards/margins": 0.018097033724188805, "rewards/margins_max": 0.05744786188006401, "rewards/margins_min": -0.009086047299206257, "rewards/margins_std": 0.029222065582871437, "rewards/rejected": -0.01055262703448534, "step": 140 }, { "epoch": 0.04, "grad_norm": 1.877242273452714, "learning_rate": 1.9582245430809403e-06, "logits/chosen": -2.7686474323272705, "logits/rejected": -2.749394178390503, "logps/chosen": -218.5071563720703, "logps/rejected": -250.15121459960938, "loss": 0.6862, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0062075452879071236, "rewards/margins": 0.01664603129029274, "rewards/margins_max": 0.05084766075015068, "rewards/margins_min": -0.016823524609208107, "rewards/margins_std": 0.030566370114684105, "rewards/rejected": -0.010438486933708191, "step": 150 }, { "epoch": 0.04, "grad_norm": 1.7107220518423407, "learning_rate": 2.0887728459530026e-06, "logits/chosen": -2.826704263687134, "logits/rejected": -2.8188796043395996, "logps/chosen": -262.1845703125, "logps/rejected": -229.36599731445312, "loss": 0.6802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02239852584898472, "rewards/margins": 0.03223006799817085, "rewards/margins_max": 0.08928605914115906, "rewards/margins_min": -0.00262450217269361, "rewards/margins_std": 0.04135637357831001, "rewards/rejected": -0.009831544943153858, "step": 160 }, { "epoch": 0.04, "grad_norm": 1.932637542725359, "learning_rate": 2.2193211488250653e-06, "logits/chosen": -2.743253231048584, "logits/rejected": -2.7205233573913574, "logps/chosen": -243.86746215820312, "logps/rejected": -220.6022186279297, "loss": 0.6811, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0148014472797513, "rewards/margins": 0.02474537119269371, "rewards/margins_max": 0.07947840541601181, "rewards/margins_min": -0.033394504338502884, "rewards/margins_std": 0.04944537207484245, "rewards/rejected": -0.009943926706910133, "step": 170 }, { "epoch": 0.05, "grad_norm": 2.043683308204429, "learning_rate": 2.3498694516971284e-06, "logits/chosen": -2.7653040885925293, "logits/rejected": -2.7112600803375244, "logps/chosen": -311.2255859375, "logps/rejected": -246.7816162109375, "loss": 0.6776, "rewards/accuracies": 0.625, "rewards/chosen": 0.01917579583823681, "rewards/margins": 0.039689820259809494, "rewards/margins_max": 0.1179848313331604, "rewards/margins_min": -0.033915892243385315, "rewards/margins_std": 0.06523092091083527, "rewards/rejected": -0.020514028146862984, "step": 180 }, { "epoch": 0.05, "grad_norm": 2.2897679410402962, "learning_rate": 2.4804177545691907e-06, "logits/chosen": -2.8693976402282715, "logits/rejected": -2.814147472381592, "logps/chosen": -279.70611572265625, "logps/rejected": -214.2100830078125, "loss": 0.6751, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.032600294798612595, "rewards/margins": 0.049894630908966064, "rewards/margins_max": 0.16118279099464417, "rewards/margins_min": -0.053312502801418304, "rewards/margins_std": 0.09953655302524567, "rewards/rejected": -0.01729433797299862, "step": 190 }, { "epoch": 0.05, "grad_norm": 2.2366050848558463, "learning_rate": 2.610966057441254e-06, "logits/chosen": -2.789443254470825, "logits/rejected": -2.7356107234954834, "logps/chosen": -319.79071044921875, "logps/rejected": -257.9158020019531, "loss": 0.6681, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.032422568649053574, "rewards/margins": 0.06155448034405708, "rewards/margins_max": 0.15893979370594025, "rewards/margins_min": -0.01412617601454258, "rewards/margins_std": 0.07764186710119247, "rewards/rejected": -0.02913190983235836, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": -2.790609121322632, "eval_logits/rejected": -2.7520194053649902, "eval_logps/chosen": -282.86865234375, "eval_logps/rejected": -266.4324951171875, "eval_loss": 0.6688757538795471, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": 0.016186822205781937, "eval_rewards/margins": 0.059083521366119385, "eval_rewards/margins_max": 0.24039579927921295, "eval_rewards/margins_min": -0.11280132085084915, "eval_rewards/margins_std": 0.11628948152065277, "eval_rewards/rejected": -0.042896706610918045, "eval_runtime": 223.818, "eval_samples_per_second": 8.936, "eval_steps_per_second": 0.281, "step": 200 }, { "epoch": 0.05, "grad_norm": 2.8498054839561804, "learning_rate": 2.741514360313316e-06, "logits/chosen": -2.7659912109375, "logits/rejected": -2.7181544303894043, "logps/chosen": -337.74090576171875, "logps/rejected": -280.05279541015625, "loss": 0.6657, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.033216312527656555, "rewards/margins": 0.07913878560066223, "rewards/margins_max": 0.2607572376728058, "rewards/margins_min": -0.054263632744550705, "rewards/margins_std": 0.13968484103679657, "rewards/rejected": -0.04592246934771538, "step": 210 }, { "epoch": 0.06, "grad_norm": 2.064597479357664, "learning_rate": 2.872062663185379e-06, "logits/chosen": -2.7045555114746094, "logits/rejected": -2.646944761276245, "logps/chosen": -264.3805847167969, "logps/rejected": -303.3786315917969, "loss": 0.665, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.016354570165276527, "rewards/margins": 0.08234158158302307, "rewards/margins_max": 0.21536795794963837, "rewards/margins_min": -0.045636437833309174, "rewards/margins_std": 0.11874409765005112, "rewards/rejected": -0.0659870132803917, "step": 220 }, { "epoch": 0.06, "grad_norm": 2.4303378004591916, "learning_rate": 3.0026109660574416e-06, "logits/chosen": -2.8186447620391846, "logits/rejected": -2.791952610015869, "logps/chosen": -304.4600830078125, "logps/rejected": -258.8472595214844, "loss": 0.6541, "rewards/accuracies": 0.625, "rewards/chosen": 0.030647989362478256, "rewards/margins": 0.07532333582639694, "rewards/margins_max": 0.2734847664833069, "rewards/margins_min": -0.1370028853416443, "rewards/margins_std": 0.18644869327545166, "rewards/rejected": -0.044675346463918686, "step": 230 }, { "epoch": 0.06, "grad_norm": 2.7792356247093926, "learning_rate": 3.1331592689295043e-06, "logits/chosen": -2.6890745162963867, "logits/rejected": -2.567110300064087, "logps/chosen": -276.08404541015625, "logps/rejected": -237.46401977539062, "loss": 0.6352, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0010208163876086473, "rewards/margins": 0.12248816341161728, "rewards/margins_max": 0.3909008502960205, "rewards/margins_min": -0.11146117746829987, "rewards/margins_std": 0.2260712832212448, "rewards/rejected": -0.12146735191345215, "step": 240 }, { "epoch": 0.07, "grad_norm": 2.791941292383267, "learning_rate": 3.263707571801567e-06, "logits/chosen": -2.735582113265991, "logits/rejected": -2.757213592529297, "logps/chosen": -287.4705810546875, "logps/rejected": -272.218505859375, "loss": 0.6387, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.01997445337474346, "rewards/margins": 0.16604064404964447, "rewards/margins_max": 0.4254993498325348, "rewards/margins_min": -0.08604441583156586, "rewards/margins_std": 0.22610628604888916, "rewards/rejected": -0.18601509928703308, "step": 250 }, { "epoch": 0.07, "grad_norm": 3.942270688854851, "learning_rate": 3.3942558746736293e-06, "logits/chosen": -2.764155864715576, "logits/rejected": -2.730841875076294, "logps/chosen": -354.1364440917969, "logps/rejected": -347.0375671386719, "loss": 0.629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0299435555934906, "rewards/margins": 0.18580618500709534, "rewards/margins_max": 0.5874249935150146, "rewards/margins_min": -0.16159002482891083, "rewards/margins_std": 0.32245373725891113, "rewards/rejected": -0.21574974060058594, "step": 260 }, { "epoch": 0.07, "grad_norm": 5.17624408307966, "learning_rate": 3.524804177545692e-06, "logits/chosen": -2.728912591934204, "logits/rejected": -2.6732656955718994, "logps/chosen": -343.31219482421875, "logps/rejected": -328.41424560546875, "loss": 0.6102, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.17557600140571594, "rewards/margins": 0.10843143612146378, "rewards/margins_max": 0.45560845732688904, "rewards/margins_min": -0.2370292693376541, "rewards/margins_std": 0.313829243183136, "rewards/rejected": -0.28400740027427673, "step": 270 }, { "epoch": 0.07, "grad_norm": 4.6112164455980915, "learning_rate": 3.6553524804177547e-06, "logits/chosen": -2.8274950981140137, "logits/rejected": -2.7835533618927, "logps/chosen": -287.8620910644531, "logps/rejected": -271.0885009765625, "loss": 0.6409, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09691455215215683, "rewards/margins": 0.10122760385274887, "rewards/margins_max": 0.5490546226501465, "rewards/margins_min": -0.3631536364555359, "rewards/margins_std": 0.4078657627105713, "rewards/rejected": -0.1981421709060669, "step": 280 }, { "epoch": 0.08, "grad_norm": 3.8630514201890214, "learning_rate": 3.7859007832898174e-06, "logits/chosen": -2.7565560340881348, "logits/rejected": -2.7167704105377197, "logps/chosen": -334.09564208984375, "logps/rejected": -288.7518310546875, "loss": 0.6341, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3300151824951172, "rewards/margins": 0.15565776824951172, "rewards/margins_max": 0.4868637025356293, "rewards/margins_min": -0.15161745250225067, "rewards/margins_std": 0.28800469636917114, "rewards/rejected": -0.4856729507446289, "step": 290 }, { "epoch": 0.08, "grad_norm": 3.8818105099403044, "learning_rate": 3.9164490861618806e-06, "logits/chosen": -2.826871156692505, "logits/rejected": -2.7986605167388916, "logps/chosen": -322.3857421875, "logps/rejected": -284.9024353027344, "loss": 0.64, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2539197504520416, "rewards/margins": 0.19142434000968933, "rewards/margins_max": 0.554625391960144, "rewards/margins_min": -0.1596948802471161, "rewards/margins_std": 0.3315952718257904, "rewards/rejected": -0.44534412026405334, "step": 300 }, { "epoch": 0.08, "eval_logits/chosen": -2.764376163482666, "eval_logits/rejected": -2.7294154167175293, "eval_logps/chosen": -318.2889404296875, "eval_logps/rejected": -314.9070739746094, "eval_loss": 0.6292702555656433, "eval_rewards/accuracies": 0.704365074634552, "eval_rewards/chosen": -0.3380163908004761, "eval_rewards/margins": 0.18962617218494415, "eval_rewards/margins_max": 0.7935211062431335, "eval_rewards/margins_min": -0.3660690188407898, "eval_rewards/margins_std": 0.38797318935394287, "eval_rewards/rejected": -0.527642548084259, "eval_runtime": 223.8484, "eval_samples_per_second": 8.935, "eval_steps_per_second": 0.281, "step": 300 }, { "epoch": 0.08, "grad_norm": 5.903108275891145, "learning_rate": 4.046997389033943e-06, "logits/chosen": -2.8098020553588867, "logits/rejected": -2.760925769805908, "logps/chosen": -315.0646667480469, "logps/rejected": -357.49713134765625, "loss": 0.621, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.26933008432388306, "rewards/margins": 0.23711815476417542, "rewards/margins_max": 0.7644354104995728, "rewards/margins_min": -0.10883345454931259, "rewards/margins_std": 0.3865162432193756, "rewards/rejected": -0.5064482092857361, "step": 310 }, { "epoch": 0.08, "grad_norm": 2.6994225167015884, "learning_rate": 4.177545691906005e-06, "logits/chosen": -2.84281587600708, "logits/rejected": -2.8172855377197266, "logps/chosen": -311.79571533203125, "logps/rejected": -294.6600341796875, "loss": 0.6192, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.020808199420571327, "rewards/margins": 0.2602817416191101, "rewards/margins_max": 0.8408046960830688, "rewards/margins_min": -0.2624954283237457, "rewards/margins_std": 0.4814014434814453, "rewards/rejected": -0.2810899615287781, "step": 320 }, { "epoch": 0.09, "grad_norm": 9.898902261928386, "learning_rate": 4.308093994778068e-06, "logits/chosen": -2.7739417552948, "logits/rejected": -2.716782331466675, "logps/chosen": -297.8214111328125, "logps/rejected": -323.0215148925781, "loss": 0.6212, "rewards/accuracies": 0.75, "rewards/chosen": -0.10257480293512344, "rewards/margins": 0.20737743377685547, "rewards/margins_max": 0.6785110235214233, "rewards/margins_min": -0.12330939620733261, "rewards/margins_std": 0.363497257232666, "rewards/rejected": -0.3099522292613983, "step": 330 }, { "epoch": 0.09, "grad_norm": 4.086674056759532, "learning_rate": 4.4386422976501306e-06, "logits/chosen": -2.6274378299713135, "logits/rejected": -2.68204927444458, "logps/chosen": -333.5520935058594, "logps/rejected": -342.832275390625, "loss": 0.6088, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09552483260631561, "rewards/margins": 0.34924525022506714, "rewards/margins_max": 0.8854178190231323, "rewards/margins_min": -0.05615895241498947, "rewards/margins_std": 0.4259135127067566, "rewards/rejected": -0.44477003812789917, "step": 340 }, { "epoch": 0.09, "grad_norm": 5.671423019181341, "learning_rate": 4.569190600522193e-06, "logits/chosen": -2.630596876144409, "logits/rejected": -2.6135263442993164, "logps/chosen": -335.6517333984375, "logps/rejected": -290.89520263671875, "loss": 0.6053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.26570630073547363, "rewards/margins": 0.3015943765640259, "rewards/margins_max": 0.9441448450088501, "rewards/margins_min": -0.3666132092475891, "rewards/margins_std": 0.5693005919456482, "rewards/rejected": -0.5673006772994995, "step": 350 }, { "epoch": 0.09, "grad_norm": 3.031985793329538, "learning_rate": 4.699738903394257e-06, "logits/chosen": -2.808351516723633, "logits/rejected": -2.7869908809661865, "logps/chosen": -433.953369140625, "logps/rejected": -373.64154052734375, "loss": 0.6333, "rewards/accuracies": 0.75, "rewards/chosen": -0.5681406855583191, "rewards/margins": 0.30244189500808716, "rewards/margins_max": 0.7779091000556946, "rewards/margins_min": -0.031546033918857574, "rewards/margins_std": 0.36682647466659546, "rewards/rejected": -0.8705824613571167, "step": 360 }, { "epoch": 0.1, "grad_norm": 3.736528741183641, "learning_rate": 4.8302872062663196e-06, "logits/chosen": -2.4879283905029297, "logits/rejected": -2.5234522819519043, "logps/chosen": -291.1069030761719, "logps/rejected": -343.3514099121094, "loss": 0.6625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7694433331489563, "rewards/margins": 0.14212694764137268, "rewards/margins_max": 0.521259069442749, "rewards/margins_min": -0.218729168176651, "rewards/margins_std": 0.3344251811504364, "rewards/rejected": -0.9115701913833618, "step": 370 }, { "epoch": 0.1, "grad_norm": 2.8511554584273577, "learning_rate": 4.9608355091383814e-06, "logits/chosen": -2.620173931121826, "logits/rejected": -2.49788236618042, "logps/chosen": -366.0997009277344, "logps/rejected": -384.23211669921875, "loss": 0.6425, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8439491391181946, "rewards/margins": 0.13820630311965942, "rewards/margins_max": 0.577583909034729, "rewards/margins_min": -0.3748566508293152, "rewards/margins_std": 0.4467763900756836, "rewards/rejected": -0.982155442237854, "step": 380 }, { "epoch": 0.1, "grad_norm": 3.378257027585911, "learning_rate": 4.9999488562447675e-06, "logits/chosen": -2.782338857650757, "logits/rejected": -2.735119342803955, "logps/chosen": -352.7326965332031, "logps/rejected": -353.9287109375, "loss": 0.6106, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6351458430290222, "rewards/margins": 0.3058081567287445, "rewards/margins_max": 0.7065421938896179, "rewards/margins_min": -0.051259350031614304, "rewards/margins_std": 0.3443611264228821, "rewards/rejected": -0.9409539103507996, "step": 390 }, { "epoch": 0.1, "grad_norm": 2.9087700531403127, "learning_rate": 4.999698361256577e-06, "logits/chosen": -2.601405143737793, "logits/rejected": -2.606778144836426, "logps/chosen": -310.2796325683594, "logps/rejected": -304.1072692871094, "loss": 0.6335, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5515845417976379, "rewards/margins": 0.13566093146800995, "rewards/margins_max": 0.6920101046562195, "rewards/margins_min": -0.35084354877471924, "rewards/margins_std": 0.4679028391838074, "rewards/rejected": -0.6872454285621643, "step": 400 }, { "epoch": 0.1, "eval_logits/chosen": -2.741288900375366, "eval_logits/rejected": -2.703533887863159, "eval_logps/chosen": -322.2904052734375, "eval_logps/rejected": -330.1777648925781, "eval_loss": 0.6075928807258606, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": -0.37803083658218384, "eval_rewards/margins": 0.3023185133934021, "eval_rewards/margins_max": 1.243634819984436, "eval_rewards/margins_min": -0.5587248802185059, "eval_rewards/margins_std": 0.5973000526428223, "eval_rewards/rejected": -0.6803492903709412, "eval_runtime": 223.7903, "eval_samples_per_second": 8.937, "eval_steps_per_second": 0.282, "step": 400 }, { "epoch": 0.11, "grad_norm": 3.5569389172087837, "learning_rate": 4.999239142174581e-06, "logits/chosen": -2.6941208839416504, "logits/rejected": -2.5748767852783203, "logps/chosen": -348.00152587890625, "logps/rejected": -348.72857666015625, "loss": 0.6088, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2868252992630005, "rewards/margins": 0.2597336173057556, "rewards/margins_max": 0.8000469207763672, "rewards/margins_min": -0.3382663130760193, "rewards/margins_std": 0.49483031034469604, "rewards/rejected": -0.5465589761734009, "step": 410 }, { "epoch": 0.11, "grad_norm": 7.181670636240666, "learning_rate": 4.99857123734344e-06, "logits/chosen": -2.6335091590881348, "logits/rejected": -2.620431423187256, "logps/chosen": -315.80718994140625, "logps/rejected": -319.80047607421875, "loss": 0.5759, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4420655369758606, "rewards/margins": 0.4245140552520752, "rewards/margins_max": 1.0392305850982666, "rewards/margins_min": -0.18339061737060547, "rewards/margins_std": 0.5364870429039001, "rewards/rejected": -0.8665796518325806, "step": 420 }, { "epoch": 0.11, "grad_norm": 8.115739915435931, "learning_rate": 4.997694702533016e-06, "logits/chosen": -2.6538214683532715, "logits/rejected": -2.5392677783966064, "logps/chosen": -318.4049377441406, "logps/rejected": -283.79937744140625, "loss": 0.5841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.17481091618537903, "rewards/margins": 0.44675692915916443, "rewards/margins_max": 1.1195052862167358, "rewards/margins_min": -0.044527579098939896, "rewards/margins_std": 0.5284029245376587, "rewards/rejected": -0.6215678453445435, "step": 430 }, { "epoch": 0.12, "grad_norm": 5.011946330325816, "learning_rate": 4.996609610933713e-06, "logits/chosen": -2.6898980140686035, "logits/rejected": -2.6806278228759766, "logps/chosen": -310.4803771972656, "logps/rejected": -331.20758056640625, "loss": 0.5528, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.25639086961746216, "rewards/margins": 0.45017004013061523, "rewards/margins_max": 1.13104248046875, "rewards/margins_min": -0.10845420509576797, "rewards/margins_std": 0.5422754883766174, "rewards/rejected": -0.7065609097480774, "step": 440 }, { "epoch": 0.12, "grad_norm": 5.72056809171888, "learning_rate": 4.995316053150366e-06, "logits/chosen": -2.705965280532837, "logits/rejected": -2.656982183456421, "logps/chosen": -342.86444091796875, "logps/rejected": -317.276123046875, "loss": 0.5927, "rewards/accuracies": 0.625, "rewards/chosen": -0.5666399002075195, "rewards/margins": 0.26561588048934937, "rewards/margins_max": 1.2853853702545166, "rewards/margins_min": -0.5935114622116089, "rewards/margins_std": 0.8273480534553528, "rewards/rejected": -0.8322558403015137, "step": 450 }, { "epoch": 0.12, "grad_norm": 4.266329193676732, "learning_rate": 4.9938141371946815e-06, "logits/chosen": -2.467585325241089, "logits/rejected": -2.4676804542541504, "logps/chosen": -361.1650085449219, "logps/rejected": -401.39764404296875, "loss": 0.5747, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.096268892288208, "rewards/margins": 0.2714022994041443, "rewards/margins_max": 1.2240047454833984, "rewards/margins_min": -0.5820249915122986, "rewards/margins_std": 0.8145963549613953, "rewards/rejected": -1.367671251296997, "step": 460 }, { "epoch": 0.12, "grad_norm": 5.189727672569119, "learning_rate": 4.992103988476206e-06, "logits/chosen": -2.4503769874572754, "logits/rejected": -2.5443506240844727, "logps/chosen": -413.5054626464844, "logps/rejected": -440.5263671875, "loss": 0.603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2601969242095947, "rewards/margins": 0.3859338164329529, "rewards/margins_max": 1.1253424882888794, "rewards/margins_min": -0.40246525406837463, "rewards/margins_std": 0.721028745174408, "rewards/rejected": -1.6461305618286133, "step": 470 }, { "epoch": 0.13, "grad_norm": 6.710026529319383, "learning_rate": 4.990185749791866e-06, "logits/chosen": -2.5193355083465576, "logits/rejected": -2.462009906768799, "logps/chosen": -405.39312744140625, "logps/rejected": -408.20733642578125, "loss": 0.6264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2991012334823608, "rewards/margins": 0.37089332938194275, "rewards/margins_max": 1.2720310688018799, "rewards/margins_min": -0.6764611005783081, "rewards/margins_std": 0.8861900568008423, "rewards/rejected": -1.669994592666626, "step": 480 }, { "epoch": 0.13, "grad_norm": 7.122191026010628, "learning_rate": 4.9880595813140395e-06, "logits/chosen": -2.469597578048706, "logits/rejected": -2.4734954833984375, "logps/chosen": -444.6744079589844, "logps/rejected": -490.56695556640625, "loss": 0.5817, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2124580144882202, "rewards/margins": 0.5220259428024292, "rewards/margins_max": 1.4727394580841064, "rewards/margins_min": -0.15542791783809662, "rewards/margins_std": 0.7436097264289856, "rewards/rejected": -1.734484076499939, "step": 490 }, { "epoch": 0.13, "grad_norm": 5.213289064835668, "learning_rate": 4.985725660577184e-06, "logits/chosen": -2.3712992668151855, "logits/rejected": -2.3561947345733643, "logps/chosen": -398.34503173828125, "logps/rejected": -393.5196533203125, "loss": 0.5664, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9887760877609253, "rewards/margins": 0.48274001479148865, "rewards/margins_max": 1.2894638776779175, "rewards/margins_min": -0.16468007862567902, "rewards/margins_std": 0.6644682884216309, "rewards/rejected": -1.4715161323547363, "step": 500 }, { "epoch": 0.13, "eval_logits/chosen": -2.3929760456085205, "eval_logits/rejected": -2.3569815158843994, "eval_logps/chosen": -389.6617126464844, "eval_logps/rejected": -424.1661682128906, "eval_loss": 0.5693480372428894, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": -1.0517432689666748, "eval_rewards/margins": 0.568489670753479, "eval_rewards/margins_max": 2.149949789047241, "eval_rewards/margins_min": -0.8056244254112244, "eval_rewards/margins_std": 0.9737952947616577, "eval_rewards/rejected": -1.6202330589294434, "eval_runtime": 223.8303, "eval_samples_per_second": 8.935, "eval_steps_per_second": 0.281, "step": 500 }, { "epoch": 0.13, "grad_norm": 4.442250462587998, "learning_rate": 4.983184182463009e-06, "logits/chosen": -2.1839969158172607, "logits/rejected": -2.249932289123535, "logps/chosen": -411.88909912109375, "logps/rejected": -418.4384765625, "loss": 0.5415, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1960546970367432, "rewards/margins": 0.45294269919395447, "rewards/margins_max": 1.5394253730773926, "rewards/margins_min": -0.5983884334564209, "rewards/margins_std": 0.9396608471870422, "rewards/rejected": -1.6489975452423096, "step": 510 }, { "epoch": 0.14, "grad_norm": 7.132241835229047, "learning_rate": 4.980435359184203e-06, "logits/chosen": -2.2927334308624268, "logits/rejected": -2.222160816192627, "logps/chosen": -398.7985534667969, "logps/rejected": -428.216552734375, "loss": 0.5823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2454442977905273, "rewards/margins": 0.7350202798843384, "rewards/margins_max": 1.7782729864120483, "rewards/margins_min": -0.1742320954799652, "rewards/margins_std": 0.858984649181366, "rewards/rejected": -1.9804645776748657, "step": 520 }, { "epoch": 0.14, "grad_norm": 6.437852275801987, "learning_rate": 4.9774794202667236e-06, "logits/chosen": -2.4473366737365723, "logits/rejected": -2.4344115257263184, "logps/chosen": -412.31085205078125, "logps/rejected": -418.19647216796875, "loss": 0.5902, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8551918268203735, "rewards/margins": 0.4042881429195404, "rewards/margins_max": 1.4064571857452393, "rewards/margins_min": -0.3519786596298218, "rewards/margins_std": 0.7998281121253967, "rewards/rejected": -1.2594798803329468, "step": 530 }, { "epoch": 0.14, "grad_norm": 6.987568852557969, "learning_rate": 4.974316612530615e-06, "logits/chosen": -2.42439341545105, "logits/rejected": -2.4253077507019043, "logps/chosen": -394.7391357421875, "logps/rejected": -348.72528076171875, "loss": 0.5896, "rewards/accuracies": 0.5, "rewards/chosen": -1.0580438375473022, "rewards/margins": -0.08111296594142914, "rewards/margins_max": 0.9167228937149048, "rewards/margins_min": -1.1573469638824463, "rewards/margins_std": 0.9216461181640625, "rewards/rejected": -0.9769307971000671, "step": 540 }, { "epoch": 0.14, "grad_norm": 5.49630266238523, "learning_rate": 4.970947200069416e-06, "logits/chosen": -2.513308048248291, "logits/rejected": -2.4769692420959473, "logps/chosen": -368.7348327636719, "logps/rejected": -409.03131103515625, "loss": 0.572, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7056823968887329, "rewards/margins": 0.7211836576461792, "rewards/margins_max": 1.774468183517456, "rewards/margins_min": -0.28661051392555237, "rewards/margins_std": 0.8945043683052063, "rewards/rejected": -1.4268661737442017, "step": 550 }, { "epoch": 0.15, "grad_norm": 7.709949001039684, "learning_rate": 4.967371464228096e-06, "logits/chosen": -2.4540233612060547, "logits/rejected": -2.5126001834869385, "logps/chosen": -390.78582763671875, "logps/rejected": -394.2521667480469, "loss": 0.6125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9085943102836609, "rewards/margins": 0.38423341512680054, "rewards/margins_max": 1.3117491006851196, "rewards/margins_min": -0.7507814168930054, "rewards/margins_std": 0.962700366973877, "rewards/rejected": -1.2928277254104614, "step": 560 }, { "epoch": 0.15, "grad_norm": 6.234051000450727, "learning_rate": 4.963589703579569e-06, "logits/chosen": -2.384986400604248, "logits/rejected": -2.3592116832733154, "logps/chosen": -357.03466796875, "logps/rejected": -375.25152587890625, "loss": 0.5335, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8392063975334167, "rewards/margins": 0.628653347492218, "rewards/margins_max": 1.5715243816375732, "rewards/margins_min": -0.1336989402770996, "rewards/margins_std": 0.7848575115203857, "rewards/rejected": -1.4678598642349243, "step": 570 }, { "epoch": 0.15, "grad_norm": 6.09313053992866, "learning_rate": 4.9596022338997615e-06, "logits/chosen": -2.447239875793457, "logits/rejected": -2.4373862743377686, "logps/chosen": -456.0433654785156, "logps/rejected": -435.745361328125, "loss": 0.5866, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8575530052185059, "rewards/margins": 0.8354150652885437, "rewards/margins_max": 1.6716899871826172, "rewards/margins_min": 0.11878925561904907, "rewards/margins_std": 0.7200655937194824, "rewards/rejected": -1.6929680109024048, "step": 580 }, { "epoch": 0.15, "grad_norm": 4.140036256788041, "learning_rate": 4.955409388141243e-06, "logits/chosen": -2.3816285133361816, "logits/rejected": -2.3704938888549805, "logps/chosen": -359.50140380859375, "logps/rejected": -383.23297119140625, "loss": 0.5163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0055649280548096, "rewards/margins": 0.4314577579498291, "rewards/margins_max": 1.4042364358901978, "rewards/margins_min": -0.5469448566436768, "rewards/margins_std": 0.8515745997428894, "rewards/rejected": -1.4370226860046387, "step": 590 }, { "epoch": 0.16, "grad_norm": 7.65657833718797, "learning_rate": 4.951011516405429e-06, "logits/chosen": -2.463222026824951, "logits/rejected": -2.460261344909668, "logps/chosen": -495.2791442871094, "logps/rejected": -459.95233154296875, "loss": 0.5428, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1409215927124023, "rewards/margins": 0.42876744270324707, "rewards/margins_max": 1.6242196559906006, "rewards/margins_min": -0.5998972058296204, "rewards/margins_std": 1.0001561641693115, "rewards/rejected": -1.5696890354156494, "step": 600 }, { "epoch": 0.16, "eval_logits/chosen": -2.3340189456939697, "eval_logits/rejected": -2.3086748123168945, "eval_logps/chosen": -397.9946594238281, "eval_logps/rejected": -444.65264892578125, "eval_loss": 0.5504034161567688, "eval_rewards/accuracies": 0.7460317611694336, "eval_rewards/chosen": -1.1350733041763306, "eval_rewards/margins": 0.6900246739387512, "eval_rewards/margins_max": 2.52213191986084, "eval_rewards/margins_min": -0.8418879508972168, "eval_rewards/margins_std": 1.1085470914840698, "eval_rewards/rejected": -1.825097918510437, "eval_runtime": 223.7265, "eval_samples_per_second": 8.939, "eval_steps_per_second": 0.282, "step": 600 }, { "epoch": 0.16, "grad_norm": 7.120673221092373, "learning_rate": 4.946408985913344e-06, "logits/chosen": -2.2198569774627686, "logits/rejected": -2.2471392154693604, "logps/chosen": -460.6009826660156, "logps/rejected": -423.7510681152344, "loss": 0.5543, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1688897609710693, "rewards/margins": 0.3595579266548157, "rewards/margins_max": 1.338615894317627, "rewards/margins_min": -0.579300582408905, "rewards/margins_std": 0.8541596531867981, "rewards/rejected": -1.5284477472305298, "step": 610 }, { "epoch": 0.16, "grad_norm": 7.42399692117693, "learning_rate": 4.941602180974958e-06, "logits/chosen": -2.2426674365997314, "logits/rejected": -2.2062366008758545, "logps/chosen": -407.9687194824219, "logps/rejected": -440.90570068359375, "loss": 0.5276, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.416456937789917, "rewards/margins": 0.6837646961212158, "rewards/margins_max": 2.1622653007507324, "rewards/margins_min": -0.33323535323143005, "rewards/margins_std": 1.151943564414978, "rewards/rejected": -2.1002213954925537, "step": 620 }, { "epoch": 0.16, "grad_norm": 6.561467552713531, "learning_rate": 4.936591502957101e-06, "logits/chosen": -1.9268481731414795, "logits/rejected": -1.9559924602508545, "logps/chosen": -451.932373046875, "logps/rejected": -514.363525390625, "loss": 0.5514, "rewards/accuracies": 0.75, "rewards/chosen": -1.5733277797698975, "rewards/margins": 0.8057867288589478, "rewards/margins_max": 2.2768046855926514, "rewards/margins_min": -0.4499366879463196, "rewards/margins_std": 1.2055299282073975, "rewards/rejected": -2.3791141510009766, "step": 630 }, { "epoch": 0.17, "grad_norm": 17.670175647736922, "learning_rate": 4.931377370249946e-06, "logits/chosen": -1.904213547706604, "logits/rejected": -1.8482478857040405, "logps/chosen": -551.0469360351562, "logps/rejected": -547.3648071289062, "loss": 0.5212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0570592880249023, "rewards/margins": 0.8177971839904785, "rewards/margins_max": 2.221104145050049, "rewards/margins_min": -0.2894425690174103, "rewards/margins_std": 1.1095309257507324, "rewards/rejected": -2.87485671043396, "step": 640 }, { "epoch": 0.17, "grad_norm": 7.503253485402912, "learning_rate": 4.925960218232073e-06, "logits/chosen": -1.85946524143219, "logits/rejected": -1.8567167520523071, "logps/chosen": -486.20599365234375, "logps/rejected": -563.3365478515625, "loss": 0.5424, "rewards/accuracies": 0.75, "rewards/chosen": -2.075512409210205, "rewards/margins": 0.7706689834594727, "rewards/margins_max": 2.0721142292022705, "rewards/margins_min": -0.2392055094242096, "rewards/margins_std": 1.013484001159668, "rewards/rejected": -2.846181631088257, "step": 650 }, { "epoch": 0.17, "grad_norm": 11.026741814070904, "learning_rate": 4.920340499234116e-06, "logits/chosen": -1.9600328207015991, "logits/rejected": -1.9871832132339478, "logps/chosen": -332.79132080078125, "logps/rejected": -424.4923400878906, "loss": 0.5617, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9257528185844421, "rewards/margins": 0.7792068123817444, "rewards/margins_max": 2.2150683403015137, "rewards/margins_min": -0.6894339323043823, "rewards/margins_std": 1.2881157398223877, "rewards/rejected": -1.704959511756897, "step": 660 }, { "epoch": 0.18, "grad_norm": 5.859894833711455, "learning_rate": 4.914518682500995e-06, "logits/chosen": -2.002422332763672, "logits/rejected": -1.9537159204483032, "logps/chosen": -342.4452819824219, "logps/rejected": -405.4795837402344, "loss": 0.5544, "rewards/accuracies": 0.75, "rewards/chosen": -0.9491150975227356, "rewards/margins": 0.6390417814254761, "rewards/margins_max": 2.1160590648651123, "rewards/margins_min": -0.4366641640663147, "rewards/margins_std": 1.1581398248672485, "rewards/rejected": -1.5881569385528564, "step": 670 }, { "epoch": 0.18, "grad_norm": 5.9321065398801025, "learning_rate": 4.9084952541527315e-06, "logits/chosen": -1.8548431396484375, "logits/rejected": -1.8357175588607788, "logps/chosen": -475.42364501953125, "logps/rejected": -524.4793090820312, "loss": 0.522, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4602549076080322, "rewards/margins": 0.878754734992981, "rewards/margins_max": 2.140374183654785, "rewards/margins_min": -0.38484567403793335, "rewards/margins_std": 1.1169432401657104, "rewards/rejected": -2.3390095233917236, "step": 680 }, { "epoch": 0.18, "grad_norm": 7.914663549948711, "learning_rate": 4.902270717143858e-06, "logits/chosen": -1.8159208297729492, "logits/rejected": -1.784799337387085, "logps/chosen": -529.4669189453125, "logps/rejected": -559.7930908203125, "loss": 0.5321, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4507896900177, "rewards/margins": 0.7450770139694214, "rewards/margins_max": 1.9555778503417969, "rewards/margins_min": -0.48074856400489807, "rewards/margins_std": 1.096519112586975, "rewards/rejected": -3.195866823196411, "step": 690 }, { "epoch": 0.18, "grad_norm": 6.446240463501284, "learning_rate": 4.895845591221427e-06, "logits/chosen": -1.9118859767913818, "logits/rejected": -1.8599398136138916, "logps/chosen": -506.2217712402344, "logps/rejected": -551.4000244140625, "loss": 0.5696, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.339138984680176, "rewards/margins": 0.5787423849105835, "rewards/margins_max": 2.197732448577881, "rewards/margins_min": -1.0114357471466064, "rewards/margins_std": 1.4246641397476196, "rewards/rejected": -2.917881488800049, "step": 700 }, { "epoch": 0.18, "eval_logits/chosen": -2.0543744564056396, "eval_logits/rejected": -2.0233211517333984, "eval_logps/chosen": -445.20770263671875, "eval_logps/rejected": -491.5914306640625, "eval_loss": 0.5407054424285889, "eval_rewards/accuracies": 0.7301587462425232, "eval_rewards/chosen": -1.6072039604187012, "eval_rewards/margins": 0.6872818470001221, "eval_rewards/margins_max": 2.3967599868774414, "eval_rewards/margins_min": -0.8007528185844421, "eval_rewards/margins_std": 1.059094786643982, "eval_rewards/rejected": -2.2944858074188232, "eval_runtime": 223.8172, "eval_samples_per_second": 8.936, "eval_steps_per_second": 0.281, "step": 700 }, { "epoch": 0.19, "grad_norm": 7.857436682421419, "learning_rate": 4.8892204128816e-06, "logits/chosen": -2.032893657684326, "logits/rejected": -2.038665533065796, "logps/chosen": -377.6850280761719, "logps/rejected": -361.36993408203125, "loss": 0.5658, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.5392578840255737, "rewards/margins": 0.18441717326641083, "rewards/margins_max": 1.1020867824554443, "rewards/margins_min": -0.5870569348335266, "rewards/margins_std": 0.7355962991714478, "rewards/rejected": -1.723675012588501, "step": 710 }, { "epoch": 0.19, "grad_norm": 10.972140318059175, "learning_rate": 4.882395735324864e-06, "logits/chosen": -1.9157111644744873, "logits/rejected": -1.8870277404785156, "logps/chosen": -475.08624267578125, "logps/rejected": -485.651611328125, "loss": 0.4808, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.058393955230713, "rewards/margins": 0.773791491985321, "rewards/margins_max": 1.7518593072891235, "rewards/margins_min": -0.2601913511753082, "rewards/margins_std": 0.8955210447311401, "rewards/rejected": -1.8321853876113892, "step": 720 }, { "epoch": 0.19, "grad_norm": 7.178842654659006, "learning_rate": 4.87537212840983e-06, "logits/chosen": -1.9146709442138672, "logits/rejected": -1.867175817489624, "logps/chosen": -459.81976318359375, "logps/rejected": -489.9925842285156, "loss": 0.4842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4382277727127075, "rewards/margins": 0.9112623333930969, "rewards/margins_max": 2.0528979301452637, "rewards/margins_min": -0.26927846670150757, "rewards/margins_std": 1.022033452987671, "rewards/rejected": -2.34948992729187, "step": 730 }, { "epoch": 0.19, "grad_norm": 8.292513805673474, "learning_rate": 4.8681501786056545e-06, "logits/chosen": -1.8258020877838135, "logits/rejected": -1.821004867553711, "logps/chosen": -432.7748107910156, "logps/rejected": -599.3067626953125, "loss": 0.5598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.642817497253418, "rewards/margins": 1.0478721857070923, "rewards/margins_max": 2.9242184162139893, "rewards/margins_min": -0.39921122789382935, "rewards/margins_std": 1.5025067329406738, "rewards/rejected": -2.6906895637512207, "step": 740 }, { "epoch": 0.2, "grad_norm": 7.879975941810093, "learning_rate": 4.860730488943068e-06, "logits/chosen": -1.6117355823516846, "logits/rejected": -1.7028987407684326, "logps/chosen": -431.72467041015625, "logps/rejected": -509.7525329589844, "loss": 0.5617, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7846498489379883, "rewards/margins": 0.6251150369644165, "rewards/margins_max": 1.878993272781372, "rewards/margins_min": -0.5090195536613464, "rewards/margins_std": 1.0578628778457642, "rewards/rejected": -2.4097647666931152, "step": 750 }, { "epoch": 0.2, "grad_norm": 5.091557504802144, "learning_rate": 4.853113678964022e-06, "logits/chosen": -1.8858792781829834, "logits/rejected": -1.824402093887329, "logps/chosen": -407.0565185546875, "logps/rejected": -442.83282470703125, "loss": 0.477, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.094581961631775, "rewards/margins": 0.9017997980117798, "rewards/margins_max": 2.238558053970337, "rewards/margins_min": -0.08495251089334488, "rewards/margins_std": 1.0317590236663818, "rewards/rejected": -1.9963815212249756, "step": 760 }, { "epoch": 0.2, "grad_norm": 19.507208641034314, "learning_rate": 4.845300384669958e-06, "logits/chosen": -1.8328478336334229, "logits/rejected": -1.800824522972107, "logps/chosen": -455.3678283691406, "logps/rejected": -509.86370849609375, "loss": 0.5057, "rewards/accuracies": 0.75, "rewards/chosen": -1.4060834646224976, "rewards/margins": 0.975176215171814, "rewards/margins_max": 2.5600852966308594, "rewards/margins_min": -0.27949485182762146, "rewards/margins_std": 1.2581160068511963, "rewards/rejected": -2.3812596797943115, "step": 770 }, { "epoch": 0.2, "grad_norm": 15.221614246958275, "learning_rate": 4.837291258468701e-06, "logits/chosen": -2.0353169441223145, "logits/rejected": -1.9767030477523804, "logps/chosen": -450.42828369140625, "logps/rejected": -504.524169921875, "loss": 0.4792, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3853620290756226, "rewards/margins": 0.8491100072860718, "rewards/margins_max": 2.4746193885803223, "rewards/margins_min": -0.661260724067688, "rewards/margins_std": 1.3681740760803223, "rewards/rejected": -2.2344717979431152, "step": 780 }, { "epoch": 0.21, "grad_norm": 8.670857907696837, "learning_rate": 4.829086969119984e-06, "logits/chosen": -2.1320652961730957, "logits/rejected": -2.082036018371582, "logps/chosen": -425.1429138183594, "logps/rejected": -452.5391540527344, "loss": 0.5355, "rewards/accuracies": 0.625, "rewards/chosen": -1.363497018814087, "rewards/margins": 0.49704885482788086, "rewards/margins_max": 1.7286510467529297, "rewards/margins_min": -0.7384086847305298, "rewards/margins_std": 1.1164295673370361, "rewards/rejected": -1.8605457544326782, "step": 790 }, { "epoch": 0.21, "grad_norm": 7.833513244294361, "learning_rate": 4.820688201679605e-06, "logits/chosen": -2.104609251022339, "logits/rejected": -2.0922112464904785, "logps/chosen": -467.77166748046875, "logps/rejected": -486.08587646484375, "loss": 0.4864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1516042947769165, "rewards/margins": 1.1946066617965698, "rewards/margins_max": 2.9842381477355957, "rewards/margins_min": -0.4471331536769867, "rewards/margins_std": 1.4880039691925049, "rewards/rejected": -2.3462109565734863, "step": 800 }, { "epoch": 0.21, "eval_logits/chosen": -2.143479108810425, "eval_logits/rejected": -2.1125800609588623, "eval_logps/chosen": -432.715087890625, "eval_logps/rejected": -500.2979431152344, "eval_loss": 0.5377450585365295, "eval_rewards/accuracies": 0.738095223903656, "eval_rewards/chosen": -1.4822779893875122, "eval_rewards/margins": 0.8992730379104614, "eval_rewards/margins_max": 2.986936569213867, "eval_rewards/margins_min": -0.9704049825668335, "eval_rewards/margins_std": 1.3290927410125732, "eval_rewards/rejected": -2.3815507888793945, "eval_runtime": 223.7876, "eval_samples_per_second": 8.937, "eval_steps_per_second": 0.282, "step": 800 }, { "epoch": 0.21, "grad_norm": 16.136652085465055, "learning_rate": 4.8120956574422315e-06, "logits/chosen": -2.196211338043213, "logits/rejected": -2.1339006423950195, "logps/chosen": -469.78350830078125, "logps/rejected": -525.05517578125, "loss": 0.5077, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.614763617515564, "rewards/margins": 0.9444462060928345, "rewards/margins_max": 2.339083194732666, "rewards/margins_min": -0.48020973801612854, "rewards/margins_std": 1.2474253177642822, "rewards/rejected": -2.5592098236083984, "step": 810 }, { "epoch": 0.21, "grad_norm": 11.285292136383339, "learning_rate": 4.803310053882831e-06, "logits/chosen": -1.9518890380859375, "logits/rejected": -1.872637152671814, "logps/chosen": -445.0243225097656, "logps/rejected": -500.734375, "loss": 0.5505, "rewards/accuracies": 0.75, "rewards/chosen": -1.7508691549301147, "rewards/margins": 1.144101619720459, "rewards/margins_max": 2.68922758102417, "rewards/margins_min": -0.1493835300207138, "rewards/margins_std": 1.3218233585357666, "rewards/rejected": -2.894970417022705, "step": 820 }, { "epoch": 0.22, "grad_norm": 7.149898725739413, "learning_rate": 4.794332124596775e-06, "logits/chosen": -1.9895038604736328, "logits/rejected": -1.9935522079467773, "logps/chosen": -490.7037658691406, "logps/rejected": -535.5740966796875, "loss": 0.5937, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6808383464813232, "rewards/margins": 0.6647806167602539, "rewards/margins_max": 2.3293213844299316, "rewards/margins_min": -0.7533954381942749, "rewards/margins_std": 1.350891351699829, "rewards/rejected": -2.345618963241577, "step": 830 }, { "epoch": 0.22, "grad_norm": 8.087613994378707, "learning_rate": 4.785162619238575e-06, "logits/chosen": -2.117966651916504, "logits/rejected": -2.1223068237304688, "logps/chosen": -347.86956787109375, "logps/rejected": -349.9862976074219, "loss": 0.5682, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9266284704208374, "rewards/margins": 0.4781726002693176, "rewards/margins_max": 1.6303083896636963, "rewards/margins_min": -0.5450159907341003, "rewards/margins_std": 0.9929227828979492, "rewards/rejected": -1.4048011302947998, "step": 840 }, { "epoch": 0.22, "grad_norm": 8.21784208091132, "learning_rate": 4.775802303459288e-06, "logits/chosen": -2.2097675800323486, "logits/rejected": -2.153590440750122, "logps/chosen": -337.40789794921875, "logps/rejected": -406.9679870605469, "loss": 0.4941, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0512070655822754, "rewards/margins": 0.6299057006835938, "rewards/margins_max": 1.739553689956665, "rewards/margins_min": -0.17973919212818146, "rewards/margins_std": 0.852813720703125, "rewards/rejected": -1.6811126470565796, "step": 850 }, { "epoch": 0.23, "grad_norm": 12.19288117379807, "learning_rate": 4.766251958842589e-06, "logits/chosen": -2.1003265380859375, "logits/rejected": -2.1681132316589355, "logps/chosen": -271.9151306152344, "logps/rejected": -406.7418212890625, "loss": 0.5377, "rewards/accuracies": 0.625, "rewards/chosen": -1.2329720258712769, "rewards/margins": 0.7982263565063477, "rewards/margins_max": 2.596799850463867, "rewards/margins_min": -0.4915478229522705, "rewards/margins_std": 1.3920280933380127, "rewards/rejected": -2.031198501586914, "step": 860 }, { "epoch": 0.23, "grad_norm": 11.131314318051828, "learning_rate": 4.7565123828395066e-06, "logits/chosen": -2.061199426651001, "logits/rejected": -2.1265981197357178, "logps/chosen": -500.68804931640625, "logps/rejected": -522.8644409179688, "loss": 0.5464, "rewards/accuracies": 0.75, "rewards/chosen": -1.9369293451309204, "rewards/margins": 0.9344605207443237, "rewards/margins_max": 2.2368855476379395, "rewards/margins_min": -0.5030005574226379, "rewards/margins_std": 1.2202907800674438, "rewards/rejected": -2.871389865875244, "step": 870 }, { "epoch": 0.23, "grad_norm": 7.771539105502807, "learning_rate": 4.746584388701831e-06, "logits/chosen": -2.3022379875183105, "logits/rejected": -2.266432285308838, "logps/chosen": -422.40673828125, "logps/rejected": -526.6629638671875, "loss": 0.6014, "rewards/accuracies": 0.625, "rewards/chosen": -1.5710153579711914, "rewards/margins": 0.5998293161392212, "rewards/margins_max": 1.941954255104065, "rewards/margins_min": -0.8799946904182434, "rewards/margins_std": 1.2202316522598267, "rewards/rejected": -2.170844793319702, "step": 880 }, { "epoch": 0.23, "grad_norm": 6.442016794708065, "learning_rate": 4.736468805414218e-06, "logits/chosen": -2.272817611694336, "logits/rejected": -2.247520923614502, "logps/chosen": -454.64678955078125, "logps/rejected": -462.1295471191406, "loss": 0.5154, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.502676248550415, "rewards/margins": 0.6041428446769714, "rewards/margins_max": 1.3871121406555176, "rewards/margins_min": -0.07614180445671082, "rewards/margins_std": 0.6647244691848755, "rewards/rejected": -2.1068193912506104, "step": 890 }, { "epoch": 0.24, "grad_norm": 10.54006797117144, "learning_rate": 4.7261664776249595e-06, "logits/chosen": -2.1258766651153564, "logits/rejected": -2.156111478805542, "logps/chosen": -428.930908203125, "logps/rejected": -539.5088500976562, "loss": 0.542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9926055669784546, "rewards/margins": 0.6907774806022644, "rewards/margins_max": 2.3944778442382812, "rewards/margins_min": -0.7125498652458191, "rewards/margins_std": 1.3945897817611694, "rewards/rejected": -2.6833832263946533, "step": 900 }, { "epoch": 0.24, "eval_logits/chosen": -2.2023584842681885, "eval_logits/rejected": -2.1744332313537598, "eval_logps/chosen": -483.36138916015625, "eval_logps/rejected": -551.626220703125, "eval_loss": 0.5398516654968262, "eval_rewards/accuracies": 0.7301587462425232, "eval_rewards/chosen": -1.98874032497406, "eval_rewards/margins": 0.906093180179596, "eval_rewards/margins_max": 3.166734457015991, "eval_rewards/margins_min": -0.9489678144454956, "eval_rewards/margins_std": 1.3690062761306763, "eval_rewards/rejected": -2.89483380317688, "eval_runtime": 223.7868, "eval_samples_per_second": 8.937, "eval_steps_per_second": 0.282, "step": 900 }, { "epoch": 0.24, "grad_norm": 10.538147430622294, "learning_rate": 4.715678265575463e-06, "logits/chosen": -2.1057233810424805, "logits/rejected": -2.096747875213623, "logps/chosen": -459.4873046875, "logps/rejected": -539.7784423828125, "loss": 0.5158, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9052470922470093, "rewards/margins": 1.0579489469528198, "rewards/margins_max": 2.5211856365203857, "rewards/margins_min": -0.396616131067276, "rewards/margins_std": 1.2667295932769775, "rewards/rejected": -2.96319580078125, "step": 910 }, { "epoch": 0.24, "grad_norm": 7.182950113906346, "learning_rate": 4.705005045028415e-06, "logits/chosen": -2.2134530544281006, "logits/rejected": -2.143956422805786, "logps/chosen": -443.46527099609375, "logps/rejected": -486.8726501464844, "loss": 0.5519, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5311297178268433, "rewards/margins": 0.8069475293159485, "rewards/margins_max": 2.879056453704834, "rewards/margins_min": -0.7901986837387085, "rewards/margins_std": 1.6600391864776611, "rewards/rejected": -2.3380773067474365, "step": 920 }, { "epoch": 0.24, "grad_norm": 4.932091950264043, "learning_rate": 4.694147707194659e-06, "logits/chosen": -2.3232321739196777, "logits/rejected": -2.271859645843506, "logps/chosen": -437.90313720703125, "logps/rejected": -524.9970703125, "loss": 0.472, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0345693826675415, "rewards/margins": 1.2751846313476562, "rewards/margins_max": 2.851594924926758, "rewards/margins_min": -0.3139026463031769, "rewards/margins_std": 1.4101760387420654, "rewards/rejected": -2.3097543716430664, "step": 930 }, { "epoch": 0.25, "grad_norm": 8.14080587760905, "learning_rate": 4.683107158658782e-06, "logits/chosen": -2.381969928741455, "logits/rejected": -2.3919167518615723, "logps/chosen": -396.89178466796875, "logps/rejected": -435.490478515625, "loss": 0.4594, "rewards/accuracies": 0.75, "rewards/chosen": -0.8891837000846863, "rewards/margins": 0.6818939447402954, "rewards/margins_max": 2.222177743911743, "rewards/margins_min": -0.37940576672554016, "rewards/margins_std": 1.1597857475280762, "rewards/rejected": -1.571077823638916, "step": 940 }, { "epoch": 0.25, "grad_norm": 11.809089289346153, "learning_rate": 4.671884321303407e-06, "logits/chosen": -2.326172351837158, "logits/rejected": -2.3508529663085938, "logps/chosen": -374.9980773925781, "logps/rejected": -508.20977783203125, "loss": 0.5276, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0149739980697632, "rewards/margins": 0.9852820634841919, "rewards/margins_max": 2.872986316680908, "rewards/margins_min": -0.649988055229187, "rewards/margins_std": 1.5814167261123657, "rewards/rejected": -2.000255823135376, "step": 950 }, { "epoch": 0.25, "grad_norm": 6.4232372314886845, "learning_rate": 4.660480132232224e-06, "logits/chosen": -2.3392229080200195, "logits/rejected": -2.231630802154541, "logps/chosen": -514.9369506835938, "logps/rejected": -507.2276306152344, "loss": 0.5617, "rewards/accuracies": 0.625, "rewards/chosen": -1.448197603225708, "rewards/margins": 0.6242163777351379, "rewards/margins_max": 2.208193778991699, "rewards/margins_min": -0.6682778596878052, "rewards/margins_std": 1.3060455322265625, "rewards/rejected": -2.072413921356201, "step": 960 }, { "epoch": 0.25, "grad_norm": 8.687449064036828, "learning_rate": 4.6488955436917414e-06, "logits/chosen": -2.2683069705963135, "logits/rejected": -2.264864206314087, "logps/chosen": -489.6954040527344, "logps/rejected": -557.469970703125, "loss": 0.5207, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.588444709777832, "rewards/margins": 1.1935683488845825, "rewards/margins_max": 2.5073862075805664, "rewards/margins_min": -0.2541956603527069, "rewards/margins_std": 1.285449743270874, "rewards/rejected": -2.782012939453125, "step": 970 }, { "epoch": 0.26, "grad_norm": 8.607862613163029, "learning_rate": 4.6371315229917644e-06, "logits/chosen": -2.295264959335327, "logits/rejected": -2.303008556365967, "logps/chosen": -473.395263671875, "logps/rejected": -509.66778564453125, "loss": 0.5044, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6738195419311523, "rewards/margins": 1.0031205415725708, "rewards/margins_max": 2.3547239303588867, "rewards/margins_min": -0.3173818588256836, "rewards/margins_std": 1.2006018161773682, "rewards/rejected": -2.6769399642944336, "step": 980 }, { "epoch": 0.26, "grad_norm": 8.970531113975184, "learning_rate": 4.625189052424638e-06, "logits/chosen": -2.1793088912963867, "logits/rejected": -2.187732458114624, "logps/chosen": -508.7928771972656, "logps/rejected": -556.1360473632812, "loss": 0.4816, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6925113201141357, "rewards/margins": 0.9570150375366211, "rewards/margins_max": 2.130171775817871, "rewards/margins_min": -0.21927733719348907, "rewards/margins_std": 1.0393879413604736, "rewards/rejected": -2.649526596069336, "step": 990 }, { "epoch": 0.26, "grad_norm": 7.392176136692677, "learning_rate": 4.613069129183218e-06, "logits/chosen": -2.0926525592803955, "logits/rejected": -2.059450149536133, "logps/chosen": -479.4851989746094, "logps/rejected": -511.2705078125, "loss": 0.5518, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.421639919281006, "rewards/margins": 0.562799870967865, "rewards/margins_max": 1.65924072265625, "rewards/margins_min": -0.4800806939601898, "rewards/margins_std": 0.9537714719772339, "rewards/rejected": -2.984440326690674, "step": 1000 }, { "epoch": 0.26, "eval_logits/chosen": -2.1708016395568848, "eval_logits/rejected": -2.137993574142456, "eval_logps/chosen": -478.76190185546875, "eval_logps/rejected": -547.7310180664062, "eval_loss": 0.5299688577651978, "eval_rewards/accuracies": 0.7539682388305664, "eval_rewards/chosen": -1.9427460432052612, "eval_rewards/margins": 0.913135826587677, "eval_rewards/margins_max": 3.1137185096740723, "eval_rewards/margins_min": -0.9029161334037781, "eval_rewards/margins_std": 1.3265271186828613, "eval_rewards/rejected": -2.855881690979004, "eval_runtime": 223.8774, "eval_samples_per_second": 8.933, "eval_steps_per_second": 0.281, "step": 1000 }, { "epoch": 0.26, "grad_norm": 9.701738685437153, "learning_rate": 4.600772765277607e-06, "logits/chosen": -2.155801773071289, "logits/rejected": -2.0896427631378174, "logps/chosen": -465.2852478027344, "logps/rejected": -558.8109741210938, "loss": 0.5169, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.825250267982483, "rewards/margins": 0.9694963693618774, "rewards/margins_max": 2.520638942718506, "rewards/margins_min": -0.26592451333999634, "rewards/margins_std": 1.2553303241729736, "rewards/rejected": -2.7947466373443604, "step": 1010 }, { "epoch": 0.27, "grad_norm": 7.737448834818589, "learning_rate": 4.588300987450652e-06, "logits/chosen": -2.179655075073242, "logits/rejected": -2.065035820007324, "logps/chosen": -438.249267578125, "logps/rejected": -521.0628662109375, "loss": 0.5053, "rewards/accuracies": 0.625, "rewards/chosen": -1.715210199356079, "rewards/margins": 0.7605463266372681, "rewards/margins_max": 2.4746925830841064, "rewards/margins_min": -0.5442928075790405, "rewards/margins_std": 1.3775581121444702, "rewards/rejected": -2.4757561683654785, "step": 1020 }, { "epoch": 0.27, "grad_norm": 7.892404850315844, "learning_rate": 4.5756548370922136e-06, "logits/chosen": -2.1576123237609863, "logits/rejected": -2.173241138458252, "logps/chosen": -452.14544677734375, "logps/rejected": -530.881591796875, "loss": 0.5123, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5204532146453857, "rewards/margins": 0.8559392690658569, "rewards/margins_max": 2.878176212310791, "rewards/margins_min": -0.4689345359802246, "rewards/margins_std": 1.5012952089309692, "rewards/rejected": -2.3763926029205322, "step": 1030 }, { "epoch": 0.27, "grad_norm": 8.214942659443778, "learning_rate": 4.562835370152206e-06, "logits/chosen": -2.0492420196533203, "logits/rejected": -2.029965877532959, "logps/chosen": -424.9143981933594, "logps/rejected": -472.5741271972656, "loss": 0.5591, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2679744958877563, "rewards/margins": 0.8711051940917969, "rewards/margins_max": 2.3786683082580566, "rewards/margins_min": -0.42061057686805725, "rewards/margins_std": 1.2821840047836304, "rewards/rejected": -2.1390795707702637, "step": 1040 }, { "epoch": 0.27, "grad_norm": 8.929366672522933, "learning_rate": 4.54984365705243e-06, "logits/chosen": -2.187072992324829, "logits/rejected": -2.1421728134155273, "logps/chosen": -447.60174560546875, "logps/rejected": -477.4373474121094, "loss": 0.5012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2021777629852295, "rewards/margins": 0.6009781956672668, "rewards/margins_max": 1.9804699420928955, "rewards/margins_min": -0.5928093791007996, "rewards/margins_std": 1.178869366645813, "rewards/rejected": -1.8031558990478516, "step": 1050 }, { "epoch": 0.28, "grad_norm": 8.051533656010337, "learning_rate": 4.536680782597191e-06, "logits/chosen": -2.1374475955963135, "logits/rejected": -2.0453429222106934, "logps/chosen": -508.3507385253906, "logps/rejected": -595.1114501953125, "loss": 0.4479, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.544737458229065, "rewards/margins": 1.016281008720398, "rewards/margins_max": 2.32208514213562, "rewards/margins_min": -0.043042730540037155, "rewards/margins_std": 1.0638062953948975, "rewards/rejected": -2.561018466949463, "step": 1060 }, { "epoch": 0.28, "grad_norm": 13.014865298957188, "learning_rate": 4.523347845882718e-06, "logits/chosen": -2.0794129371643066, "logits/rejected": -2.057588815689087, "logps/chosen": -429.98046875, "logps/rejected": -553.5656127929688, "loss": 0.5455, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7069686651229858, "rewards/margins": 1.336508870124817, "rewards/margins_max": 3.380413055419922, "rewards/margins_min": -0.5239975452423096, "rewards/margins_std": 1.7525379657745361, "rewards/rejected": -3.0434775352478027, "step": 1070 }, { "epoch": 0.28, "grad_norm": 13.590932269031667, "learning_rate": 4.50984596020539e-06, "logits/chosen": -2.018005847930908, "logits/rejected": -1.8978986740112305, "logps/chosen": -525.7593383789062, "logps/rejected": -612.4932861328125, "loss": 0.5982, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0514798164367676, "rewards/margins": 0.8610676527023315, "rewards/margins_max": 2.8947465419769287, "rewards/margins_min": -1.0090887546539307, "rewards/margins_std": 1.6913036108016968, "rewards/rejected": -2.9125475883483887, "step": 1080 }, { "epoch": 0.29, "grad_norm": 4.571405368132969, "learning_rate": 4.4961762529687745e-06, "logits/chosen": -2.1091721057891846, "logits/rejected": -2.0303473472595215, "logps/chosen": -383.1018981933594, "logps/rejected": -438.295654296875, "loss": 0.4822, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2824275493621826, "rewards/margins": 0.9867317080497742, "rewards/margins_max": 2.5637738704681396, "rewards/margins_min": -0.5889317393302917, "rewards/margins_std": 1.4600626230239868, "rewards/rejected": -2.2691590785980225, "step": 1090 }, { "epoch": 0.29, "grad_norm": 15.653621247311975, "learning_rate": 4.482339865589492e-06, "logits/chosen": -2.0740816593170166, "logits/rejected": -2.0654828548431396, "logps/chosen": -423.0332946777344, "logps/rejected": -478.68804931640625, "loss": 0.5538, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3518441915512085, "rewards/margins": 0.7650083303451538, "rewards/margins_max": 2.781705141067505, "rewards/margins_min": -0.697391927242279, "rewards/margins_std": 1.5958625078201294, "rewards/rejected": -2.116852283477783, "step": 1100 }, { "epoch": 0.29, "eval_logits/chosen": -2.2234253883361816, "eval_logits/rejected": -2.1858971118927, "eval_logps/chosen": -395.77325439453125, "eval_logps/rejected": -460.2347412109375, "eval_loss": 0.536051332950592, "eval_rewards/accuracies": 0.7519841194152832, "eval_rewards/chosen": -1.1128593683242798, "eval_rewards/margins": 0.8680601119995117, "eval_rewards/margins_max": 3.0506019592285156, "eval_rewards/margins_min": -0.8555098176002502, "eval_rewards/margins_std": 1.2919068336486816, "eval_rewards/rejected": -1.9809192419052124, "eval_runtime": 223.8688, "eval_samples_per_second": 8.934, "eval_steps_per_second": 0.281, "step": 1100 }, { "epoch": 0.29, "grad_norm": 7.169651326702323, "learning_rate": 4.468337953401909e-06, "logits/chosen": -2.14406156539917, "logits/rejected": -2.1683297157287598, "logps/chosen": -402.5144958496094, "logps/rejected": -515.2345581054688, "loss": 0.468, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2976680994033813, "rewards/margins": 1.301887035369873, "rewards/margins_max": 2.293175220489502, "rewards/margins_min": 0.16100965440273285, "rewards/margins_std": 0.9626598358154297, "rewards/rejected": -2.599555015563965, "step": 1110 }, { "epoch": 0.29, "grad_norm": 9.339162867930426, "learning_rate": 4.45417168556166e-06, "logits/chosen": -2.105541706085205, "logits/rejected": -2.060274600982666, "logps/chosen": -439.5116271972656, "logps/rejected": -420.0951232910156, "loss": 0.5729, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6803901195526123, "rewards/margins": 0.47014689445495605, "rewards/margins_max": 1.7376962900161743, "rewards/margins_min": -0.6755861639976501, "rewards/margins_std": 1.042885184288025, "rewards/rejected": -2.1505370140075684, "step": 1120 }, { "epoch": 0.3, "grad_norm": 7.287039489329186, "learning_rate": 4.439842244948036e-06, "logits/chosen": -2.224477529525757, "logits/rejected": -2.1767616271972656, "logps/chosen": -445.6869201660156, "logps/rejected": -497.06329345703125, "loss": 0.4956, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5076476335525513, "rewards/margins": 0.906781792640686, "rewards/margins_max": 2.278581142425537, "rewards/margins_min": -0.4451374113559723, "rewards/margins_std": 1.229607105255127, "rewards/rejected": -2.4144294261932373, "step": 1130 }, { "epoch": 0.3, "grad_norm": 11.529566696243402, "learning_rate": 4.425350828065204e-06, "logits/chosen": -2.0859618186950684, "logits/rejected": -2.0451629161834717, "logps/chosen": -401.34857177734375, "logps/rejected": -451.3365173339844, "loss": 0.5371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7184416055679321, "rewards/margins": 0.8106294870376587, "rewards/margins_max": 2.360565662384033, "rewards/margins_min": -0.6427885890007019, "rewards/margins_std": 1.392290472984314, "rewards/rejected": -2.529071092605591, "step": 1140 }, { "epoch": 0.3, "grad_norm": 7.606390083671785, "learning_rate": 4.410698644942303e-06, "logits/chosen": -2.171191453933716, "logits/rejected": -2.148883819580078, "logps/chosen": -448.05615234375, "logps/rejected": -516.6838989257812, "loss": 0.4948, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3839561939239502, "rewards/margins": 1.2946058511734009, "rewards/margins_max": 2.7205588817596436, "rewards/margins_min": 0.014218026772141457, "rewards/margins_std": 1.1791636943817139, "rewards/rejected": -2.6785616874694824, "step": 1150 }, { "epoch": 0.3, "grad_norm": 9.441692709374825, "learning_rate": 4.395886919032406e-06, "logits/chosen": -2.024778366088867, "logits/rejected": -1.9976590871810913, "logps/chosen": -365.15301513671875, "logps/rejected": -453.542236328125, "loss": 0.4819, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7372894287109375, "rewards/margins": 0.8484255075454712, "rewards/margins_max": 1.926300287246704, "rewards/margins_min": -0.4521838128566742, "rewards/margins_std": 1.0774040222167969, "rewards/rejected": -2.5857152938842773, "step": 1160 }, { "epoch": 0.31, "grad_norm": 5.398293698903307, "learning_rate": 4.380916887110366e-06, "logits/chosen": -2.024618625640869, "logits/rejected": -1.9719196557998657, "logps/chosen": -438.81689453125, "logps/rejected": -527.51123046875, "loss": 0.4772, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9085757732391357, "rewards/margins": 0.8110491633415222, "rewards/margins_max": 2.5685856342315674, "rewards/margins_min": -0.7955909967422485, "rewards/margins_std": 1.470815896987915, "rewards/rejected": -2.7196249961853027, "step": 1170 }, { "epoch": 0.31, "grad_norm": 9.389951862703382, "learning_rate": 4.365789799169539e-06, "logits/chosen": -1.930235505104065, "logits/rejected": -1.8814465999603271, "logps/chosen": -466.1863708496094, "logps/rejected": -547.4100952148438, "loss": 0.4995, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9667508602142334, "rewards/margins": 1.0498183965682983, "rewards/margins_max": 2.7341551780700684, "rewards/margins_min": -0.19703975319862366, "rewards/margins_std": 1.2919727563858032, "rewards/rejected": -3.0165696144104004, "step": 1180 }, { "epoch": 0.31, "grad_norm": 15.0664178968389, "learning_rate": 4.350506918317416e-06, "logits/chosen": -1.997807502746582, "logits/rejected": -1.999418020248413, "logps/chosen": -419.49383544921875, "logps/rejected": -455.7763671875, "loss": 0.5558, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.585124135017395, "rewards/margins": 0.7856301069259644, "rewards/margins_max": 1.8834917545318604, "rewards/margins_min": -0.06860219687223434, "rewards/margins_std": 0.9045690298080444, "rewards/rejected": -2.3707540035247803, "step": 1190 }, { "epoch": 0.31, "grad_norm": 7.185208958910181, "learning_rate": 4.335069520670149e-06, "logits/chosen": -1.9295597076416016, "logits/rejected": -1.8495397567749023, "logps/chosen": -361.71209716796875, "logps/rejected": -433.90618896484375, "loss": 0.5482, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3275094032287598, "rewards/margins": 0.5705887079238892, "rewards/margins_max": 1.9119141101837158, "rewards/margins_min": -0.4970221519470215, "rewards/margins_std": 1.0814253091812134, "rewards/rejected": -1.8980979919433594, "step": 1200 }, { "epoch": 0.31, "eval_logits/chosen": -2.069587469100952, "eval_logits/rejected": -2.0283474922180176, "eval_logps/chosen": -410.9884033203125, "eval_logps/rejected": -478.3761901855469, "eval_loss": 0.534513533115387, "eval_rewards/accuracies": 0.7797619104385376, "eval_rewards/chosen": -1.2650103569030762, "eval_rewards/margins": 0.897322952747345, "eval_rewards/margins_max": 3.0597646236419678, "eval_rewards/margins_min": -0.8738881945610046, "eval_rewards/margins_std": 1.2932322025299072, "eval_rewards/rejected": -2.1623332500457764, "eval_runtime": 223.8671, "eval_samples_per_second": 8.934, "eval_steps_per_second": 0.281, "step": 1200 }, { "epoch": 0.32, "grad_norm": 13.906771189320308, "learning_rate": 4.319478895246e-06, "logits/chosen": -2.059779167175293, "logits/rejected": -2.062770128250122, "logps/chosen": -456.4762268066406, "logps/rejected": -488.0137634277344, "loss": 0.5148, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4821711778640747, "rewards/margins": 0.7420075535774231, "rewards/margins_max": 2.421123504638672, "rewards/margins_min": -0.7212496995925903, "rewards/margins_std": 1.3457313776016235, "rewards/rejected": -2.2241785526275635, "step": 1210 }, { "epoch": 0.32, "grad_norm": 9.230829116431407, "learning_rate": 4.303736343857704e-06, "logits/chosen": -2.1094727516174316, "logits/rejected": -2.052184581756592, "logps/chosen": -469.91253662109375, "logps/rejected": -569.4459838867188, "loss": 0.5603, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8304897546768188, "rewards/margins": 1.169433355331421, "rewards/margins_max": 3.2296535968780518, "rewards/margins_min": -0.6863353848457336, "rewards/margins_std": 1.751387357711792, "rewards/rejected": -2.9999232292175293, "step": 1220 }, { "epoch": 0.32, "grad_norm": 12.337844092489819, "learning_rate": 4.287843181003772e-06, "logits/chosen": -2.068594217300415, "logits/rejected": -2.0841150283813477, "logps/chosen": -438.87091064453125, "logps/rejected": -511.75274658203125, "loss": 0.5693, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6375892162322998, "rewards/margins": 0.7606214284896851, "rewards/margins_max": 2.128434419631958, "rewards/margins_min": -0.48179203271865845, "rewards/margins_std": 1.1427290439605713, "rewards/rejected": -2.3982107639312744, "step": 1230 }, { "epoch": 0.32, "grad_norm": 6.914766471930062, "learning_rate": 4.27180073375873e-06, "logits/chosen": -1.8894243240356445, "logits/rejected": -1.905521035194397, "logps/chosen": -374.7861328125, "logps/rejected": -446.35614013671875, "loss": 0.4592, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1893283128738403, "rewards/margins": 1.1314045190811157, "rewards/margins_max": 2.3633322715759277, "rewards/margins_min": -0.007747170515358448, "rewards/margins_std": 1.0903398990631104, "rewards/rejected": -2.320732831954956, "step": 1240 }, { "epoch": 0.33, "grad_norm": 17.42486661534314, "learning_rate": 4.255610341662304e-06, "logits/chosen": -1.9167416095733643, "logits/rejected": -1.8940188884735107, "logps/chosen": -423.37860107421875, "logps/rejected": -517.34716796875, "loss": 0.4914, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5117312669754028, "rewards/margins": 1.1153199672698975, "rewards/margins_max": 3.2856788635253906, "rewards/margins_min": -0.6743953824043274, "rewards/margins_std": 1.847245216369629, "rewards/rejected": -2.6270511150360107, "step": 1250 }, { "epoch": 0.33, "grad_norm": 11.626595336391945, "learning_rate": 4.2392733566075764e-06, "logits/chosen": -1.899097204208374, "logits/rejected": -1.9072374105453491, "logps/chosen": -399.4856262207031, "logps/rejected": -510.67547607421875, "loss": 0.518, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6539478302001953, "rewards/margins": 1.4515551328659058, "rewards/margins_max": 2.9055306911468506, "rewards/margins_min": 0.17859798669815063, "rewards/margins_std": 1.2765491008758545, "rewards/rejected": -3.1055026054382324, "step": 1260 }, { "epoch": 0.33, "grad_norm": 8.618465436037205, "learning_rate": 4.2227911427280975e-06, "logits/chosen": -1.9961669445037842, "logits/rejected": -2.006063461303711, "logps/chosen": -416.897216796875, "logps/rejected": -490.12713623046875, "loss": 0.526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2700196504592896, "rewards/margins": 0.6870089769363403, "rewards/margins_max": 2.146787166595459, "rewards/margins_min": -0.45557695627212524, "rewards/margins_std": 1.208411693572998, "rewards/rejected": -1.9570287466049194, "step": 1270 }, { "epoch": 0.33, "grad_norm": 5.0388079438489335, "learning_rate": 4.206165076283983e-06, "logits/chosen": -1.8056881427764893, "logits/rejected": -1.863918662071228, "logps/chosen": -375.0386962890625, "logps/rejected": -462.05841064453125, "loss": 0.5285, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5050007104873657, "rewards/margins": 0.6704990267753601, "rewards/margins_max": 1.9943397045135498, "rewards/margins_min": -0.7822316884994507, "rewards/margins_std": 1.2196003198623657, "rewards/rejected": -2.175499677658081, "step": 1280 }, { "epoch": 0.34, "grad_norm": 11.404749607011668, "learning_rate": 4.189396545546995e-06, "logits/chosen": -1.9960153102874756, "logits/rejected": -1.941457748413086, "logps/chosen": -421.644287109375, "logps/rejected": -504.11944580078125, "loss": 0.5126, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7621005773544312, "rewards/margins": 0.8665353655815125, "rewards/margins_max": 2.0453314781188965, "rewards/margins_min": -0.2223828285932541, "rewards/margins_std": 1.046208143234253, "rewards/rejected": -2.628635883331299, "step": 1290 }, { "epoch": 0.34, "grad_norm": 11.19925008715532, "learning_rate": 4.172486950684627e-06, "logits/chosen": -1.9701306819915771, "logits/rejected": -1.9696992635726929, "logps/chosen": -377.73065185546875, "logps/rejected": -483.0760803222656, "loss": 0.5325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6942758560180664, "rewards/margins": 0.8614457845687866, "rewards/margins_max": 2.5126285552978516, "rewards/margins_min": -0.8989812731742859, "rewards/margins_std": 1.4873483180999756, "rewards/rejected": -2.5557217597961426, "step": 1300 }, { "epoch": 0.34, "eval_logits/chosen": -2.0645854473114014, "eval_logits/rejected": -2.0318853855133057, "eval_logps/chosen": -419.38128662109375, "eval_logps/rejected": -487.63275146484375, "eval_loss": 0.5236928462982178, "eval_rewards/accuracies": 0.7539682388305664, "eval_rewards/chosen": -1.3489394187927246, "eval_rewards/margins": 0.9059598445892334, "eval_rewards/margins_max": 2.928480625152588, "eval_rewards/margins_min": -0.899970293045044, "eval_rewards/margins_std": 1.268829107284546, "eval_rewards/rejected": -2.254899263381958, "eval_runtime": 223.8998, "eval_samples_per_second": 8.933, "eval_steps_per_second": 0.281, "step": 1300 }, { "epoch": 0.34, "grad_norm": 12.151540751153869, "learning_rate": 4.155437703643182e-06, "logits/chosen": -1.9826958179473877, "logits/rejected": -1.9572885036468506, "logps/chosen": -414.15191650390625, "logps/rejected": -453.84515380859375, "loss": 0.4819, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1796549558639526, "rewards/margins": 0.675355851650238, "rewards/margins_max": 1.6184173822402954, "rewards/margins_min": -0.36879628896713257, "rewards/margins_std": 0.885777473449707, "rewards/rejected": -1.855010986328125, "step": 1310 }, { "epoch": 0.35, "grad_norm": 4.5357332347220884, "learning_rate": 4.138250228029882e-06, "logits/chosen": -2.0476813316345215, "logits/rejected": -1.9941253662109375, "logps/chosen": -357.07440185546875, "logps/rejected": -394.1534118652344, "loss": 0.5234, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8876716494560242, "rewards/margins": 0.447756826877594, "rewards/margins_max": 1.9389146566390991, "rewards/margins_min": -0.7519620060920715, "rewards/margins_std": 1.2010565996170044, "rewards/rejected": -1.3354284763336182, "step": 1320 }, { "epoch": 0.35, "grad_norm": 11.361385707961777, "learning_rate": 4.120925958993994e-06, "logits/chosen": -2.061697483062744, "logits/rejected": -2.083584785461426, "logps/chosen": -355.8453369140625, "logps/rejected": -438.54248046875, "loss": 0.5336, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8062282800674438, "rewards/margins": 0.9374510645866394, "rewards/margins_max": 2.081394672393799, "rewards/margins_min": -0.09482260793447495, "rewards/margins_std": 1.0059430599212646, "rewards/rejected": -1.7436792850494385, "step": 1330 }, { "epoch": 0.35, "grad_norm": 8.17415923948518, "learning_rate": 4.103466343106999e-06, "logits/chosen": -1.7707080841064453, "logits/rejected": -1.7096986770629883, "logps/chosen": -462.4939880371094, "logps/rejected": -470.41290283203125, "loss": 0.5308, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2166173458099365, "rewards/margins": 0.8438111543655396, "rewards/margins_max": 2.3620476722717285, "rewards/margins_min": -0.7409253716468811, "rewards/margins_std": 1.4221327304840088, "rewards/rejected": -2.0604286193847656, "step": 1340 }, { "epoch": 0.35, "grad_norm": 7.1374755858957455, "learning_rate": 4.085872838241797e-06, "logits/chosen": -2.003756523132324, "logits/rejected": -2.0223968029022217, "logps/chosen": -397.8870849609375, "logps/rejected": -440.9349060058594, "loss": 0.5145, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0153266191482544, "rewards/margins": 0.9000941514968872, "rewards/margins_max": 2.74387788772583, "rewards/margins_min": -0.9317896962165833, "rewards/margins_std": 1.5992034673690796, "rewards/rejected": -1.9154205322265625, "step": 1350 }, { "epoch": 0.36, "grad_norm": 5.50888799893983, "learning_rate": 4.06814691345098e-06, "logits/chosen": -1.7533172369003296, "logits/rejected": -1.7561861276626587, "logps/chosen": -388.37200927734375, "logps/rejected": -434.9646911621094, "loss": 0.5333, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7110679149627686, "rewards/margins": 0.6225708723068237, "rewards/margins_max": 2.168288469314575, "rewards/margins_min": -0.8465573191642761, "rewards/margins_std": 1.3425003290176392, "rewards/rejected": -2.3336386680603027, "step": 1360 }, { "epoch": 0.36, "grad_norm": 7.858577012794086, "learning_rate": 4.050290048844171e-06, "logits/chosen": -2.0332846641540527, "logits/rejected": -2.0102474689483643, "logps/chosen": -412.0020446777344, "logps/rejected": -440.4637145996094, "loss": 0.5367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3085713386535645, "rewards/margins": 0.6355219483375549, "rewards/margins_max": 2.065211534500122, "rewards/margins_min": -0.8187443017959595, "rewards/margins_std": 1.2678159475326538, "rewards/rejected": -1.9440934658050537, "step": 1370 }, { "epoch": 0.36, "grad_norm": 9.184695691678112, "learning_rate": 4.032303735464422e-06, "logits/chosen": -2.028918981552124, "logits/rejected": -2.0585622787475586, "logps/chosen": -406.294921875, "logps/rejected": -503.935791015625, "loss": 0.4599, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1892098188400269, "rewards/margins": 1.137403130531311, "rewards/margins_max": 2.7278029918670654, "rewards/margins_min": -0.06611676514148712, "rewards/margins_std": 1.2650660276412964, "rewards/rejected": -2.326613187789917, "step": 1380 }, { "epoch": 0.36, "grad_norm": 11.893931506842181, "learning_rate": 4.014189475163727e-06, "logits/chosen": -2.051037549972534, "logits/rejected": -1.9636198282241821, "logps/chosen": -467.09967041015625, "logps/rejected": -523.681396484375, "loss": 0.5962, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.976880431175232, "rewards/margins": 1.1242797374725342, "rewards/margins_max": 2.6586754322052, "rewards/margins_min": -0.5520067811012268, "rewards/margins_std": 1.4198178052902222, "rewards/rejected": -3.1011600494384766, "step": 1390 }, { "epoch": 0.37, "grad_norm": 6.345930354311145, "learning_rate": 3.995948780477605e-06, "logits/chosen": -1.9968942403793335, "logits/rejected": -1.9305909872055054, "logps/chosen": -475.56085205078125, "logps/rejected": -509.7543029785156, "loss": 0.5647, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9790456295013428, "rewards/margins": 0.910565972328186, "rewards/margins_max": 2.3425941467285156, "rewards/margins_min": -0.9012687802314758, "rewards/margins_std": 1.4871008396148682, "rewards/rejected": -2.8896117210388184, "step": 1400 }, { "epoch": 0.37, "eval_logits/chosen": -2.0808329582214355, "eval_logits/rejected": -2.0498783588409424, "eval_logps/chosen": -465.0506896972656, "eval_logps/rejected": -539.43212890625, "eval_loss": 0.5171133875846863, "eval_rewards/accuracies": 0.773809552192688, "eval_rewards/chosen": -1.805633544921875, "eval_rewards/margins": 0.9672591686248779, "eval_rewards/margins_max": 3.031038999557495, "eval_rewards/margins_min": -0.919063150882721, "eval_rewards/margins_std": 1.3055399656295776, "eval_rewards/rejected": -2.772892713546753, "eval_runtime": 223.8236, "eval_samples_per_second": 8.936, "eval_steps_per_second": 0.281, "step": 1400 }, { "epoch": 0.37, "grad_norm": 6.816244513661429, "learning_rate": 3.977583174498816e-06, "logits/chosen": -1.972318410873413, "logits/rejected": -2.0125961303710938, "logps/chosen": -400.193603515625, "logps/rejected": -509.456787109375, "loss": 0.4944, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6972711086273193, "rewards/margins": 0.9939779043197632, "rewards/margins_max": 2.771601915359497, "rewards/margins_min": -0.3052237927913666, "rewards/margins_std": 1.357157826423645, "rewards/rejected": -2.691248893737793, "step": 1410 }, { "epoch": 0.37, "grad_norm": 7.9524244438354845, "learning_rate": 3.959094190750172e-06, "logits/chosen": -2.0595555305480957, "logits/rejected": -1.9970805644989014, "logps/chosen": -430.1922912597656, "logps/rejected": -461.574951171875, "loss": 0.5267, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8115689754486084, "rewards/margins": 0.8633396029472351, "rewards/margins_max": 1.8406412601470947, "rewards/margins_min": -0.153560608625412, "rewards/margins_std": 0.8802895545959473, "rewards/rejected": -2.6749088764190674, "step": 1420 }, { "epoch": 0.37, "grad_norm": 6.973053428813933, "learning_rate": 3.9404833730564975e-06, "logits/chosen": -2.043400764465332, "logits/rejected": -2.091839551925659, "logps/chosen": -410.3401794433594, "logps/rejected": -524.8173217773438, "loss": 0.5199, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.006483554840088, "rewards/margins": 0.9542981386184692, "rewards/margins_max": 2.1359915733337402, "rewards/margins_min": -0.15348154306411743, "rewards/margins_std": 0.9994543790817261, "rewards/rejected": -2.960782051086426, "step": 1430 }, { "epoch": 0.38, "grad_norm": 7.6814829531078646, "learning_rate": 3.921752275415712e-06, "logits/chosen": -2.0621848106384277, "logits/rejected": -2.0505011081695557, "logps/chosen": -468.862548828125, "logps/rejected": -570.4606323242188, "loss": 0.4706, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.397446870803833, "rewards/margins": 1.0605485439300537, "rewards/margins_max": 2.5342929363250732, "rewards/margins_min": -0.3334781527519226, "rewards/margins_std": 1.2648417949676514, "rewards/rejected": -2.457995653152466, "step": 1440 }, { "epoch": 0.38, "grad_norm": 6.920663559691752, "learning_rate": 3.902902461869079e-06, "logits/chosen": -2.01850962638855, "logits/rejected": -1.92391836643219, "logps/chosen": -473.7357482910156, "logps/rejected": -524.8150634765625, "loss": 0.4816, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5427310466766357, "rewards/margins": 1.042557954788208, "rewards/margins_max": 2.4326138496398926, "rewards/margins_min": -0.3859913945198059, "rewards/margins_std": 1.2757055759429932, "rewards/rejected": -2.5852887630462646, "step": 1450 }, { "epoch": 0.38, "grad_norm": 8.32572722136922, "learning_rate": 3.883935506370605e-06, "logits/chosen": -1.9135633707046509, "logits/rejected": -1.8493874073028564, "logps/chosen": -495.3169860839844, "logps/rejected": -568.623779296875, "loss": 0.4405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9543468952178955, "rewards/margins": 1.3293737173080444, "rewards/margins_max": 3.064760684967041, "rewards/margins_min": -0.2759544253349304, "rewards/margins_std": 1.509350299835205, "rewards/rejected": -3.2837207317352295, "step": 1460 }, { "epoch": 0.38, "grad_norm": 8.941151973230207, "learning_rate": 3.864852992655617e-06, "logits/chosen": -1.8653805255889893, "logits/rejected": -1.83843994140625, "logps/chosen": -465.6788024902344, "logps/rejected": -584.3350830078125, "loss": 0.4704, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.1350064277648926, "rewards/margins": 1.3587796688079834, "rewards/margins_max": 2.793201446533203, "rewards/margins_min": 0.12348320335149765, "rewards/margins_std": 1.1955536603927612, "rewards/rejected": -3.493785858154297, "step": 1470 }, { "epoch": 0.39, "grad_norm": 16.37087238501615, "learning_rate": 3.845656514108516e-06, "logits/chosen": -1.9313055276870728, "logits/rejected": -1.8489468097686768, "logps/chosen": -444.101318359375, "logps/rejected": -591.8146362304688, "loss": 0.4697, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.059746026992798, "rewards/margins": 1.1716073751449585, "rewards/margins_max": 2.602508068084717, "rewards/margins_min": 0.07555189728736877, "rewards/margins_std": 1.1485735177993774, "rewards/rejected": -3.231353282928467, "step": 1480 }, { "epoch": 0.39, "grad_norm": 6.909663534847042, "learning_rate": 3.826347673629738e-06, "logits/chosen": -1.8883874416351318, "logits/rejected": -1.9392074346542358, "logps/chosen": -482.60870361328125, "logps/rejected": -553.0533447265625, "loss": 0.6008, "rewards/accuracies": 0.75, "rewards/chosen": -2.0737993717193604, "rewards/margins": 1.0460944175720215, "rewards/margins_max": 3.09870982170105, "rewards/margins_min": -0.5803232789039612, "rewards/margins_std": 1.6999595165252686, "rewards/rejected": -3.119893789291382, "step": 1490 }, { "epoch": 0.39, "grad_norm": 7.161552497013512, "learning_rate": 3.8069280835019062e-06, "logits/chosen": -2.228004217147827, "logits/rejected": -2.1793220043182373, "logps/chosen": -354.55267333984375, "logps/rejected": -397.83502197265625, "loss": 0.5458, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1488611698150635, "rewards/margins": 0.826683521270752, "rewards/margins_max": 2.199134349822998, "rewards/margins_min": -0.49612870812416077, "rewards/margins_std": 1.236102819442749, "rewards/rejected": -1.9755443334579468, "step": 1500 }, { "epoch": 0.39, "eval_logits/chosen": -2.178837537765503, "eval_logits/rejected": -2.1490015983581543, "eval_logps/chosen": -424.5413818359375, "eval_logps/rejected": -492.93994140625, "eval_loss": 0.5139278769493103, "eval_rewards/accuracies": 0.7658730149269104, "eval_rewards/chosen": -1.4005405902862549, "eval_rewards/margins": 0.9074305295944214, "eval_rewards/margins_max": 2.881455421447754, "eval_rewards/margins_min": -0.9357819557189941, "eval_rewards/margins_std": 1.2687016725540161, "eval_rewards/rejected": -2.3079710006713867, "eval_runtime": 223.7758, "eval_samples_per_second": 8.938, "eval_steps_per_second": 0.282, "step": 1500 }, { "epoch": 0.4, "grad_norm": 10.626824435503195, "learning_rate": 3.7873993652552077e-06, "logits/chosen": -2.212127923965454, "logits/rejected": -2.1580100059509277, "logps/chosen": -454.6893005371094, "logps/rejected": -499.0372619628906, "loss": 0.5255, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.506762146949768, "rewards/margins": 0.8889607191085815, "rewards/margins_max": 2.3762519359588623, "rewards/margins_min": -0.6295033693313599, "rewards/margins_std": 1.3176841735839844, "rewards/rejected": -2.3957228660583496, "step": 1510 }, { "epoch": 0.4, "grad_norm": 6.774002350835221, "learning_rate": 3.7677631495319953e-06, "logits/chosen": -2.1804816722869873, "logits/rejected": -2.1550958156585693, "logps/chosen": -412.61083984375, "logps/rejected": -454.95623779296875, "loss": 0.501, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2692925930023193, "rewards/margins": 1.10371994972229, "rewards/margins_max": 2.362191677093506, "rewards/margins_min": -0.6144388914108276, "rewards/margins_std": 1.3707691431045532, "rewards/rejected": -2.3730125427246094, "step": 1520 }, { "epoch": 0.4, "grad_norm": 6.747507270845818, "learning_rate": 3.748021075950633e-06, "logits/chosen": -2.08042049407959, "logits/rejected": -2.1243879795074463, "logps/chosen": -429.24365234375, "logps/rejected": -468.5777282714844, "loss": 0.4686, "rewards/accuracies": 0.75, "rewards/chosen": -1.51658034324646, "rewards/margins": 0.7647795081138611, "rewards/margins_max": 1.939208745956421, "rewards/margins_min": -0.41800689697265625, "rewards/margins_std": 1.0857430696487427, "rewards/rejected": -2.281359910964966, "step": 1530 }, { "epoch": 0.4, "grad_norm": 11.818091951041897, "learning_rate": 3.7281747929685824e-06, "logits/chosen": -2.080784797668457, "logits/rejected": -2.0647099018096924, "logps/chosen": -569.0308837890625, "logps/rejected": -623.1643676757812, "loss": 0.542, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9131240844726562, "rewards/margins": 0.5408791303634644, "rewards/margins_max": 1.6401796340942383, "rewards/margins_min": -0.478179931640625, "rewards/margins_std": 0.9260808825492859, "rewards/rejected": -2.4540035724639893, "step": 1540 }, { "epoch": 0.41, "grad_norm": 10.060654370239636, "learning_rate": 3.7082259577447604e-06, "logits/chosen": -2.1240007877349854, "logits/rejected": -2.180757999420166, "logps/chosen": -482.7552795410156, "logps/rejected": -566.1520385742188, "loss": 0.5157, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7559068202972412, "rewards/margins": 1.088444471359253, "rewards/margins_max": 2.7113475799560547, "rewards/margins_min": -0.2167981117963791, "rewards/margins_std": 1.3477070331573486, "rewards/rejected": -2.8443515300750732, "step": 1550 }, { "epoch": 0.41, "grad_norm": 8.37581528068826, "learning_rate": 3.6881762360011688e-06, "logits/chosen": -2.070563793182373, "logits/rejected": -2.0653250217437744, "logps/chosen": -491.2327575683594, "logps/rejected": -544.5135498046875, "loss": 0.5522, "rewards/accuracies": 0.625, "rewards/chosen": -2.208178997039795, "rewards/margins": 0.7078807950019836, "rewards/margins_max": 2.584287166595459, "rewards/margins_min": -0.8918627500534058, "rewards/margins_std": 1.5139882564544678, "rewards/rejected": -2.916059970855713, "step": 1560 }, { "epoch": 0.41, "grad_norm": 8.147820453925949, "learning_rate": 3.668027301883802e-06, "logits/chosen": -2.1793155670166016, "logits/rejected": -2.197404623031616, "logps/chosen": -528.0721435546875, "logps/rejected": -523.2158203125, "loss": 0.491, "rewards/accuracies": 0.75, "rewards/chosen": -1.6871109008789062, "rewards/margins": 0.9497402906417847, "rewards/margins_max": 2.2161099910736084, "rewards/margins_min": -0.4706048369407654, "rewards/margins_std": 1.206055998802185, "rewards/rejected": -2.6368513107299805, "step": 1570 }, { "epoch": 0.41, "grad_norm": 10.319218706822639, "learning_rate": 3.64778083782286e-06, "logits/chosen": -2.1771371364593506, "logits/rejected": -2.233433961868286, "logps/chosen": -410.7242126464844, "logps/rejected": -451.05596923828125, "loss": 0.5243, "rewards/accuracies": 0.75, "rewards/chosen": -1.4275891780853271, "rewards/margins": 0.5667048692703247, "rewards/margins_max": 1.929448127746582, "rewards/margins_min": -0.4389796853065491, "rewards/margins_std": 1.0566939115524292, "rewards/rejected": -1.9942939281463623, "step": 1580 }, { "epoch": 0.42, "grad_norm": 7.857353663403275, "learning_rate": 3.627438534392268e-06, "logits/chosen": -2.2144012451171875, "logits/rejected": -2.259129762649536, "logps/chosen": -440.6143493652344, "logps/rejected": -476.25885009765625, "loss": 0.5274, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.326859474182129, "rewards/margins": 0.5321208238601685, "rewards/margins_max": 2.1661665439605713, "rewards/margins_min": -0.7159594297409058, "rewards/margins_std": 1.2664140462875366, "rewards/rejected": -1.8589801788330078, "step": 1590 }, { "epoch": 0.42, "grad_norm": 5.877566811698236, "learning_rate": 3.607002090168506e-06, "logits/chosen": -2.068042516708374, "logits/rejected": -2.124753713607788, "logps/chosen": -359.75830078125, "logps/rejected": -388.678466796875, "loss": 0.4935, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0795767307281494, "rewards/margins": 0.7632505297660828, "rewards/margins_max": 2.173703193664551, "rewards/margins_min": -0.24904045462608337, "rewards/margins_std": 1.074561595916748, "rewards/rejected": -1.8428270816802979, "step": 1600 }, { "epoch": 0.42, "eval_logits/chosen": -2.1057608127593994, "eval_logits/rejected": -2.0721218585968018, "eval_logps/chosen": -425.83367919921875, "eval_logps/rejected": -504.05157470703125, "eval_loss": 0.5159334540367126, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": -1.4134632349014282, "eval_rewards/margins": 1.0056246519088745, "eval_rewards/margins_max": 3.194700002670288, "eval_rewards/margins_min": -0.8547495007514954, "eval_rewards/margins_std": 1.3593833446502686, "eval_rewards/rejected": -2.4190876483917236, "eval_runtime": 223.8382, "eval_samples_per_second": 8.935, "eval_steps_per_second": 0.281, "step": 1600 }, { "epoch": 0.42, "grad_norm": 11.499224429307654, "learning_rate": 3.586473211588787e-06, "logits/chosen": -1.8698234558105469, "logits/rejected": -1.9080699682235718, "logps/chosen": -437.06365966796875, "logps/rejected": -508.5115661621094, "loss": 0.5604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.619508147239685, "rewards/margins": 0.9831677675247192, "rewards/margins_max": 2.4984354972839355, "rewards/margins_min": -0.5899962186813354, "rewards/margins_std": 1.3955860137939453, "rewards/rejected": -2.6026759147644043, "step": 1610 }, { "epoch": 0.42, "grad_norm": 8.164780171805377, "learning_rate": 3.5658536128085623e-06, "logits/chosen": -1.9961984157562256, "logits/rejected": -1.9654277563095093, "logps/chosen": -464.88623046875, "logps/rejected": -485.64599609375, "loss": 0.5767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.7909812927246094, "rewards/margins": 0.6215311288833618, "rewards/margins_max": 2.0692086219787598, "rewards/margins_min": -0.7079448103904724, "rewards/margins_std": 1.2543765306472778, "rewards/rejected": -2.4125123023986816, "step": 1620 }, { "epoch": 0.43, "grad_norm": 9.546904168966282, "learning_rate": 3.545145015558399e-06, "logits/chosen": -2.21730375289917, "logits/rejected": -2.130976915359497, "logps/chosen": -461.41790771484375, "logps/rejected": -528.2985229492188, "loss": 0.4593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.637798547744751, "rewards/margins": 1.0834376811981201, "rewards/margins_max": 2.2293896675109863, "rewards/margins_min": -0.19908545911312103, "rewards/margins_std": 1.0699645280838013, "rewards/rejected": -2.721235752105713, "step": 1630 }, { "epoch": 0.43, "grad_norm": 9.070134597334789, "learning_rate": 3.5243491490002056e-06, "logits/chosen": -2.1969339847564697, "logits/rejected": -2.1817612648010254, "logps/chosen": -426.12164306640625, "logps/rejected": -473.22802734375, "loss": 0.4997, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4333113431930542, "rewards/margins": 0.9363875389099121, "rewards/margins_max": 2.291294574737549, "rewards/margins_min": -0.4111050069332123, "rewards/margins_std": 1.1765506267547607, "rewards/rejected": -2.369699001312256, "step": 1640 }, { "epoch": 0.43, "grad_norm": 8.884501783906748, "learning_rate": 3.503467749582857e-06, "logits/chosen": -2.183750629425049, "logits/rejected": -2.071338176727295, "logps/chosen": -532.7021484375, "logps/rejected": -544.22412109375, "loss": 0.518, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4182589054107666, "rewards/margins": 1.160953164100647, "rewards/margins_max": 2.595498561859131, "rewards/margins_min": -0.4770776331424713, "rewards/margins_std": 1.3554044961929321, "rewards/rejected": -2.579211950302124, "step": 1650 }, { "epoch": 0.43, "grad_norm": 12.940291393837384, "learning_rate": 3.4825025608971947e-06, "logits/chosen": -2.0827746391296387, "logits/rejected": -2.0338656902313232, "logps/chosen": -430.08538818359375, "logps/rejected": -588.5921630859375, "loss": 0.4789, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7897369861602783, "rewards/margins": 1.236501932144165, "rewards/margins_max": 2.7483270168304443, "rewards/margins_min": 0.003545474959537387, "rewards/margins_std": 1.234513759613037, "rewards/rejected": -3.0262391567230225, "step": 1660 }, { "epoch": 0.44, "grad_norm": 7.832251643174009, "learning_rate": 3.4614553335304407e-06, "logits/chosen": -2.1380209922790527, "logits/rejected": -2.114243984222412, "logps/chosen": -443.915771484375, "logps/rejected": -506.06463623046875, "loss": 0.4604, "rewards/accuracies": 0.75, "rewards/chosen": -1.8110405206680298, "rewards/margins": 0.9482585787773132, "rewards/margins_max": 2.5312867164611816, "rewards/margins_min": -0.37763699889183044, "rewards/margins_std": 1.3206779956817627, "rewards/rejected": -2.7592990398406982, "step": 1670 }, { "epoch": 0.44, "grad_norm": 19.03124839644903, "learning_rate": 3.4403278249200222e-06, "logits/chosen": -1.9920543432235718, "logits/rejected": -1.9049437046051025, "logps/chosen": -527.3836669921875, "logps/rejected": -553.0143432617188, "loss": 0.5054, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1665520668029785, "rewards/margins": 0.8130547404289246, "rewards/margins_max": 2.256788730621338, "rewards/margins_min": -0.560214638710022, "rewards/margins_std": 1.2929614782333374, "rewards/rejected": -2.9796066284179688, "step": 1680 }, { "epoch": 0.44, "grad_norm": 6.883692569302603, "learning_rate": 3.4191217992068293e-06, "logits/chosen": -1.8879493474960327, "logits/rejected": -1.9312360286712646, "logps/chosen": -440.9810485839844, "logps/rejected": -521.66162109375, "loss": 0.5094, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.004992723464966, "rewards/margins": 0.6570359468460083, "rewards/margins_max": 1.8213649988174438, "rewards/margins_min": -0.5460115075111389, "rewards/margins_std": 1.0328733921051025, "rewards/rejected": -2.6620283126831055, "step": 1690 }, { "epoch": 0.44, "grad_norm": 9.232416710187364, "learning_rate": 3.3978390270879056e-06, "logits/chosen": -2.0690340995788574, "logits/rejected": -2.0502989292144775, "logps/chosen": -407.5555419921875, "logps/rejected": -522.8798217773438, "loss": 0.4832, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9642928838729858, "rewards/margins": 0.6449558734893799, "rewards/margins_max": 2.126739501953125, "rewards/margins_min": -0.9990302324295044, "rewards/margins_std": 1.398895025253296, "rewards/rejected": -2.609248638153076, "step": 1700 }, { "epoch": 0.44, "eval_logits/chosen": -2.1797406673431396, "eval_logits/rejected": -2.143359422683716, "eval_logps/chosen": -440.4306335449219, "eval_logps/rejected": -522.9041748046875, "eval_loss": 0.5181602239608765, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": -1.5594329833984375, "eval_rewards/margins": 1.0481810569763184, "eval_rewards/margins_max": 3.386108160018921, "eval_rewards/margins_min": -0.899835467338562, "eval_rewards/margins_std": 1.4429032802581787, "eval_rewards/rejected": -2.6076138019561768, "eval_runtime": 223.8721, "eval_samples_per_second": 8.934, "eval_steps_per_second": 0.281, "step": 1700 }, { "epoch": 0.45, "grad_norm": 11.294283367792406, "learning_rate": 3.3764812856685995e-06, "logits/chosen": -2.2123730182647705, "logits/rejected": -2.164630651473999, "logps/chosen": -384.6986999511719, "logps/rejected": -443.2933654785156, "loss": 0.5277, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2955601215362549, "rewards/margins": 0.7131011486053467, "rewards/margins_max": 2.364652156829834, "rewards/margins_min": -0.4386306405067444, "rewards/margins_std": 1.2738852500915527, "rewards/rejected": -2.0086612701416016, "step": 1710 }, { "epoch": 0.45, "grad_norm": 10.60686569989391, "learning_rate": 3.3550503583141726e-06, "logits/chosen": -2.1390128135681152, "logits/rejected": -2.119455575942993, "logps/chosen": -393.8223571777344, "logps/rejected": -508.4073181152344, "loss": 0.47, "rewards/accuracies": 0.75, "rewards/chosen": -1.3502625226974487, "rewards/margins": 0.8910351991653442, "rewards/margins_max": 2.761226177215576, "rewards/margins_min": -0.42571598291397095, "rewards/margins_std": 1.4158960580825806, "rewards/rejected": -2.241297960281372, "step": 1720 }, { "epoch": 0.45, "grad_norm": 7.888777533567612, "learning_rate": 3.3335480345008907e-06, "logits/chosen": -2.1611742973327637, "logits/rejected": -2.040703296661377, "logps/chosen": -461.6119689941406, "logps/rejected": -589.0175170898438, "loss": 0.5261, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8201265335083008, "rewards/margins": 1.0541011095046997, "rewards/margins_max": 3.0273349285125732, "rewards/margins_min": -0.41288620233535767, "rewards/margins_std": 1.5136767625808716, "rewards/rejected": -2.874227523803711, "step": 1730 }, { "epoch": 0.46, "grad_norm": 12.677566924493043, "learning_rate": 3.3119761096666055e-06, "logits/chosen": -2.093985080718994, "logits/rejected": -2.0197367668151855, "logps/chosen": -514.1800537109375, "logps/rejected": -546.2353515625, "loss": 0.5236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.912182092666626, "rewards/margins": 1.14272141456604, "rewards/margins_max": 2.5932323932647705, "rewards/margins_min": -0.10396204143762589, "rewards/margins_std": 1.2072653770446777, "rewards/rejected": -3.054903268814087, "step": 1740 }, { "epoch": 0.46, "grad_norm": 7.887009198233357, "learning_rate": 3.290336385060832e-06, "logits/chosen": -2.0448098182678223, "logits/rejected": -2.0342841148376465, "logps/chosen": -438.9112854003906, "logps/rejected": -565.2920532226562, "loss": 0.5043, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6520925760269165, "rewards/margins": 1.3350918292999268, "rewards/margins_max": 2.6565215587615967, "rewards/margins_min": -0.3022043704986572, "rewards/margins_std": 1.3237862586975098, "rewards/rejected": -2.987184524536133, "step": 1750 }, { "epoch": 0.46, "grad_norm": 7.978267715319203, "learning_rate": 3.268630667594348e-06, "logits/chosen": -1.9270389080047607, "logits/rejected": -1.8770440816879272, "logps/chosen": -410.19427490234375, "logps/rejected": -491.31109619140625, "loss": 0.4548, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3573577404022217, "rewards/margins": 1.1264293193817139, "rewards/margins_max": 2.328688859939575, "rewards/margins_min": -0.0720142275094986, "rewards/margins_std": 1.083008050918579, "rewards/rejected": -2.4837870597839355, "step": 1760 }, { "epoch": 0.46, "grad_norm": 13.1018163921434, "learning_rate": 3.2468607696883147e-06, "logits/chosen": -2.0601601600646973, "logits/rejected": -2.050075054168701, "logps/chosen": -473.0166931152344, "logps/rejected": -532.3809204101562, "loss": 0.4933, "rewards/accuracies": 0.75, "rewards/chosen": -1.6705472469329834, "rewards/margins": 0.7042300701141357, "rewards/margins_max": 1.9703298807144165, "rewards/margins_min": -0.5379226803779602, "rewards/margins_std": 1.1098196506500244, "rewards/rejected": -2.37477707862854, "step": 1770 }, { "epoch": 0.47, "grad_norm": 9.199945234052771, "learning_rate": 3.225028509122944e-06, "logits/chosen": -1.9652551412582397, "logits/rejected": -1.8570070266723633, "logps/chosen": -463.224853515625, "logps/rejected": -538.2740478515625, "loss": 0.4965, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8046432733535767, "rewards/margins": 0.988460898399353, "rewards/margins_max": 2.729205846786499, "rewards/margins_min": -0.661435604095459, "rewards/margins_std": 1.5274083614349365, "rewards/rejected": -2.793104410171509, "step": 1780 }, { "epoch": 0.47, "grad_norm": 9.335411203784759, "learning_rate": 3.2031357088857083e-06, "logits/chosen": -2.024796962738037, "logits/rejected": -1.9398425817489624, "logps/chosen": -442.040771484375, "logps/rejected": -475.92041015625, "loss": 0.5018, "rewards/accuracies": 0.75, "rewards/chosen": -1.5755566358566284, "rewards/margins": 0.8600144386291504, "rewards/margins_max": 2.7490806579589844, "rewards/margins_min": -0.5487983226776123, "rewards/margins_std": 1.4523534774780273, "rewards/rejected": -2.4355709552764893, "step": 1790 }, { "epoch": 0.47, "grad_norm": 12.309100151237828, "learning_rate": 3.181184197019127e-06, "logits/chosen": -2.0196642875671387, "logits/rejected": -1.9219319820404053, "logps/chosen": -535.178955078125, "logps/rejected": -567.5675048828125, "loss": 0.5158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8506438732147217, "rewards/margins": 0.7834941148757935, "rewards/margins_max": 2.018667697906494, "rewards/margins_min": -0.4769282341003418, "rewards/margins_std": 1.1380289793014526, "rewards/rejected": -2.634138345718384, "step": 1800 }, { "epoch": 0.47, "eval_logits/chosen": -2.0014665126800537, "eval_logits/rejected": -1.9599779844284058, "eval_logps/chosen": -458.75299072265625, "eval_logps/rejected": -550.3890380859375, "eval_loss": 0.5180644392967224, "eval_rewards/accuracies": 0.7638888955116272, "eval_rewards/chosen": -1.7426563501358032, "eval_rewards/margins": 1.1398049592971802, "eval_rewards/margins_max": 3.5508368015289307, "eval_rewards/margins_min": -0.974053144454956, "eval_rewards/margins_std": 1.517680048942566, "eval_rewards/rejected": -2.8824613094329834, "eval_runtime": 223.8294, "eval_samples_per_second": 8.935, "eval_steps_per_second": 0.281, "step": 1800 }, { "epoch": 0.47, "grad_norm": 11.196757146846972, "learning_rate": 3.159175806468126e-06, "logits/chosen": -1.9812654256820679, "logits/rejected": -1.9948698282241821, "logps/chosen": -476.88232421875, "logps/rejected": -558.1580200195312, "loss": 0.5298, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4383678436279297, "rewards/margins": 1.2237685918807983, "rewards/margins_max": 2.7056071758270264, "rewards/margins_min": -0.15089254081249237, "rewards/margins_std": 1.3079346418380737, "rewards/rejected": -2.6621365547180176, "step": 1810 }, { "epoch": 0.48, "grad_norm": 9.407903684317693, "learning_rate": 3.1371123749269804e-06, "logits/chosen": -1.8965126276016235, "logits/rejected": -1.772735834121704, "logps/chosen": -487.4578552246094, "logps/rejected": -624.8616333007812, "loss": 0.5764, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9612205028533936, "rewards/margins": 1.360779881477356, "rewards/margins_max": 3.153636932373047, "rewards/margins_min": -0.5554906129837036, "rewards/margins_std": 1.6733920574188232, "rewards/rejected": -3.322000503540039, "step": 1820 }, { "epoch": 0.48, "grad_norm": 5.294127165093935, "learning_rate": 3.114995744685877e-06, "logits/chosen": -1.9037764072418213, "logits/rejected": -1.8494529724121094, "logps/chosen": -504.7392578125, "logps/rejected": -753.9464111328125, "loss": 0.541, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8607933521270752, "rewards/margins": 1.5471004247665405, "rewards/margins_max": 3.983142137527466, "rewards/margins_min": -0.65510094165802, "rewards/margins_std": 2.1114072799682617, "rewards/rejected": -3.407893657684326, "step": 1830 }, { "epoch": 0.48, "grad_norm": 9.425881617265055, "learning_rate": 3.0928277624770743e-06, "logits/chosen": -1.8955895900726318, "logits/rejected": -1.8620665073394775, "logps/chosen": -407.10809326171875, "logps/rejected": -497.3876037597656, "loss": 0.5459, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6045745611190796, "rewards/margins": 0.9568144679069519, "rewards/margins_max": 2.3786466121673584, "rewards/margins_min": -0.4260714054107666, "rewards/margins_std": 1.2618368864059448, "rewards/rejected": -2.561389207839966, "step": 1840 }, { "epoch": 0.48, "grad_norm": 6.338861696120137, "learning_rate": 3.070610279320708e-06, "logits/chosen": -1.899151086807251, "logits/rejected": -1.7995870113372803, "logps/chosen": -442.52032470703125, "logps/rejected": -523.7626953125, "loss": 0.4574, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.051168918609619, "rewards/margins": 1.0194826126098633, "rewards/margins_max": 2.612197160720825, "rewards/margins_min": -0.7694332599639893, "rewards/margins_std": 1.4951496124267578, "rewards/rejected": -3.0706515312194824, "step": 1850 }, { "epoch": 0.49, "grad_norm": 6.983079896878698, "learning_rate": 3.0483451503702264e-06, "logits/chosen": -1.841956377029419, "logits/rejected": -1.7780545949935913, "logps/chosen": -428.9749450683594, "logps/rejected": -581.4650268554688, "loss": 0.4709, "rewards/accuracies": 0.875, "rewards/chosen": -1.4378662109375, "rewards/margins": 1.725465178489685, "rewards/margins_max": 3.278308153152466, "rewards/margins_min": 0.23476390540599823, "rewards/margins_std": 1.3492926359176636, "rewards/rejected": -3.1633317470550537, "step": 1860 }, { "epoch": 0.49, "grad_norm": 5.673615327872685, "learning_rate": 3.0260342347574916e-06, "logits/chosen": -1.8059120178222656, "logits/rejected": -1.8321125507354736, "logps/chosen": -420.679443359375, "logps/rejected": -551.1641845703125, "loss": 0.4771, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4513155221939087, "rewards/margins": 1.2248870134353638, "rewards/margins_max": 2.786269187927246, "rewards/margins_min": -0.00843547098338604, "rewards/margins_std": 1.254135012626648, "rewards/rejected": -2.6762025356292725, "step": 1870 }, { "epoch": 0.49, "grad_norm": 9.860112037254295, "learning_rate": 3.0036793954375358e-06, "logits/chosen": -1.8950316905975342, "logits/rejected": -1.825010895729065, "logps/chosen": -402.56878662109375, "logps/rejected": -473.0184631347656, "loss": 0.4706, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2820751667022705, "rewards/margins": 0.8645330667495728, "rewards/margins_max": 2.3429512977600098, "rewards/margins_min": -0.19641318917274475, "rewards/margins_std": 1.116512417793274, "rewards/rejected": -2.1466078758239746, "step": 1880 }, { "epoch": 0.49, "grad_norm": 7.676741849084185, "learning_rate": 2.981282499033009e-06, "logits/chosen": -1.875481367111206, "logits/rejected": -1.8652130365371704, "logps/chosen": -473.49114990234375, "logps/rejected": -604.216064453125, "loss": 0.5082, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7240492105484009, "rewards/margins": 0.8251044154167175, "rewards/margins_max": 2.401297092437744, "rewards/margins_min": -0.9119477272033691, "rewards/margins_std": 1.4936933517456055, "rewards/rejected": -2.5491535663604736, "step": 1890 }, { "epoch": 0.5, "grad_norm": 7.917575609998244, "learning_rate": 2.9588454156783163e-06, "logits/chosen": -1.968605637550354, "logits/rejected": -1.903201699256897, "logps/chosen": -427.133056640625, "logps/rejected": -489.70037841796875, "loss": 0.451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4554038047790527, "rewards/margins": 1.13992178440094, "rewards/margins_max": 2.676328420639038, "rewards/margins_min": -0.40167751908302307, "rewards/margins_std": 1.3907471895217896, "rewards/rejected": -2.595325231552124, "step": 1900 }, { "epoch": 0.5, "eval_logits/chosen": -1.9342164993286133, "eval_logits/rejected": -1.888846516609192, "eval_logps/chosen": -436.0497741699219, "eval_logps/rejected": -519.394775390625, "eval_loss": 0.5089967250823975, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": -1.5156242847442627, "eval_rewards/margins": 1.056895136833191, "eval_rewards/margins_max": 3.379024028778076, "eval_rewards/margins_min": -0.8482393026351929, "eval_rewards/margins_std": 1.417421579360962, "eval_rewards/rejected": -2.572519540786743, "eval_runtime": 223.7477, "eval_samples_per_second": 8.939, "eval_steps_per_second": 0.282, "step": 1900 }, { "epoch": 0.5, "grad_norm": 11.951470473727282, "learning_rate": 2.9363700188634597e-06, "logits/chosen": -1.913816213607788, "logits/rejected": -1.8632875680923462, "logps/chosen": -388.0117492675781, "logps/rejected": -477.66107177734375, "loss": 0.4369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2964433431625366, "rewards/margins": 1.0936293601989746, "rewards/margins_max": 2.3861193656921387, "rewards/margins_min": 0.040119122713804245, "rewards/margins_std": 1.089243769645691, "rewards/rejected": -2.3900725841522217, "step": 1910 }, { "epoch": 0.5, "grad_norm": 7.7702976345562105, "learning_rate": 2.9138581852776053e-06, "logits/chosen": -1.7885897159576416, "logits/rejected": -1.7566359043121338, "logps/chosen": -471.40972900390625, "logps/rejected": -507.6678771972656, "loss": 0.4939, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9903974533081055, "rewards/margins": 1.0101864337921143, "rewards/margins_max": 2.7623047828674316, "rewards/margins_min": -0.7513373494148254, "rewards/margins_std": 1.536746859550476, "rewards/rejected": -3.000584363937378, "step": 1920 }, { "epoch": 0.51, "grad_norm": 15.41919342504355, "learning_rate": 2.8913117946523805e-06, "logits/chosen": -1.9547092914581299, "logits/rejected": -1.8003740310668945, "logps/chosen": -537.2085571289062, "logps/rejected": -540.8640747070312, "loss": 0.5218, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8153603076934814, "rewards/margins": 1.1342813968658447, "rewards/margins_max": 2.81640362739563, "rewards/margins_min": -0.13238458335399628, "rewards/margins_std": 1.3349709510803223, "rewards/rejected": -2.949641704559326, "step": 1930 }, { "epoch": 0.51, "grad_norm": 11.408026402767922, "learning_rate": 2.8687327296049126e-06, "logits/chosen": -1.9501157999038696, "logits/rejected": -1.9387619495391846, "logps/chosen": -437.38824462890625, "logps/rejected": -523.2933959960938, "loss": 0.5305, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.695988655090332, "rewards/margins": 0.9639110565185547, "rewards/margins_max": 1.9832130670547485, "rewards/margins_min": -0.03912997245788574, "rewards/margins_std": 0.9334037899971008, "rewards/rejected": -2.6598994731903076, "step": 1940 }, { "epoch": 0.51, "grad_norm": 5.1635871241342866, "learning_rate": 2.8461228754806376e-06, "logits/chosen": -1.9807565212249756, "logits/rejected": -1.9655866622924805, "logps/chosen": -467.4744567871094, "logps/rejected": -560.3955688476562, "loss": 0.4951, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.735856056213379, "rewards/margins": 1.0864145755767822, "rewards/margins_max": 2.4637255668640137, "rewards/margins_min": -0.2939053177833557, "rewards/margins_std": 1.2100708484649658, "rewards/rejected": -2.822270393371582, "step": 1950 }, { "epoch": 0.51, "grad_norm": 8.964349285110925, "learning_rate": 2.823484120195865e-06, "logits/chosen": -1.9513261318206787, "logits/rejected": -1.9111149311065674, "logps/chosen": -481.4459533691406, "logps/rejected": -565.4024658203125, "loss": 0.5736, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8487815856933594, "rewards/margins": 1.2589893341064453, "rewards/margins_max": 3.2273945808410645, "rewards/margins_min": -0.10945943742990494, "rewards/margins_std": 1.546722173690796, "rewards/rejected": -3.1077709197998047, "step": 1960 }, { "epoch": 0.52, "grad_norm": 9.080345105574354, "learning_rate": 2.8008183540801486e-06, "logits/chosen": -1.9995203018188477, "logits/rejected": -1.9794012308120728, "logps/chosen": -470.147705078125, "logps/rejected": -513.644775390625, "loss": 0.5575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8080466985702515, "rewards/margins": 0.7458864450454712, "rewards/margins_max": 2.308722496032715, "rewards/margins_min": -0.49820470809936523, "rewards/margins_std": 1.2731711864471436, "rewards/rejected": -2.5539333820343018, "step": 1970 }, { "epoch": 0.52, "grad_norm": 6.451639886615069, "learning_rate": 2.7781274697184353e-06, "logits/chosen": -1.8836400508880615, "logits/rejected": -1.8073854446411133, "logps/chosen": -360.3659973144531, "logps/rejected": -460.66436767578125, "loss": 0.542, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4968997240066528, "rewards/margins": 0.9140516519546509, "rewards/margins_max": 2.1549148559570312, "rewards/margins_min": -0.15494570136070251, "rewards/margins_std": 1.0448931455612183, "rewards/rejected": -2.4109511375427246, "step": 1980 }, { "epoch": 0.52, "grad_norm": 9.404705541808578, "learning_rate": 2.7554133617930397e-06, "logits/chosen": -1.8926111459732056, "logits/rejected": -1.7235584259033203, "logps/chosen": -457.1878967285156, "logps/rejected": -538.1091918945312, "loss": 0.5308, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9232317209243774, "rewards/margins": 0.9673582315444946, "rewards/margins_max": 2.3475184440612793, "rewards/margins_min": -0.5622755289077759, "rewards/margins_std": 1.2696657180786133, "rewards/rejected": -2.890589952468872, "step": 1990 }, { "epoch": 0.52, "grad_norm": 9.059722264224373, "learning_rate": 2.7326779269254363e-06, "logits/chosen": -1.877616286277771, "logits/rejected": -1.8389135599136353, "logps/chosen": -400.8014221191406, "logps/rejected": -450.51806640625, "loss": 0.4879, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6471052169799805, "rewards/margins": 0.6019695997238159, "rewards/margins_max": 1.8697569370269775, "rewards/margins_min": -0.4710001051425934, "rewards/margins_std": 1.0697133541107178, "rewards/rejected": -2.249075174331665, "step": 2000 }, { "epoch": 0.52, "eval_logits/chosen": -1.8968966007232666, "eval_logits/rejected": -1.8468141555786133, "eval_logps/chosen": -468.8386535644531, "eval_logps/rejected": -548.391357421875, "eval_loss": 0.500255823135376, "eval_rewards/accuracies": 0.77182537317276, "eval_rewards/chosen": -1.8435132503509521, "eval_rewards/margins": 1.018972396850586, "eval_rewards/margins_max": 3.2173244953155518, "eval_rewards/margins_min": -0.9039703607559204, "eval_rewards/margins_std": 1.3682732582092285, "eval_rewards/rejected": -2.862485647201538, "eval_runtime": 224.3649, "eval_samples_per_second": 8.914, "eval_steps_per_second": 0.281, "step": 2000 }, { "epoch": 0.53, "grad_norm": 7.18041536877666, "learning_rate": 2.7099230635178954e-06, "logits/chosen": -1.7741920948028564, "logits/rejected": -1.799018144607544, "logps/chosen": -451.83197021484375, "logps/rejected": -515.1966552734375, "loss": 0.4627, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.93684983253479, "rewards/margins": 0.9333077669143677, "rewards/margins_max": 2.2292287349700928, "rewards/margins_min": -0.28906288743019104, "rewards/margins_std": 1.1816482543945312, "rewards/rejected": -2.8701577186584473, "step": 2010 }, { "epoch": 0.53, "grad_norm": 10.03603507975487, "learning_rate": 2.6871506715949608e-06, "logits/chosen": -1.9625186920166016, "logits/rejected": -1.8760147094726562, "logps/chosen": -459.5166931152344, "logps/rejected": -500.88970947265625, "loss": 0.5097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5139414072036743, "rewards/margins": 1.1039379835128784, "rewards/margins_max": 2.9816195964813232, "rewards/margins_min": -0.3845113217830658, "rewards/margins_std": 1.4896509647369385, "rewards/rejected": -2.6178793907165527, "step": 2020 }, { "epoch": 0.53, "grad_norm": 9.076779815756824, "learning_rate": 2.6643626526448063e-06, "logits/chosen": -1.8707443475723267, "logits/rejected": -1.852481484413147, "logps/chosen": -496.5193786621094, "logps/rejected": -563.2705078125, "loss": 0.4834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.101254463195801, "rewards/margins": 0.9028240442276001, "rewards/margins_max": 2.9161906242370605, "rewards/margins_min": -0.655781626701355, "rewards/margins_std": 1.5774762630462646, "rewards/rejected": -3.0040783882141113, "step": 2030 }, { "epoch": 0.53, "grad_norm": 14.391559119044508, "learning_rate": 2.6415609094604562e-06, "logits/chosen": -1.5736384391784668, "logits/rejected": -1.6241216659545898, "logps/chosen": -564.4074096679688, "logps/rejected": -594.875, "loss": 0.5001, "rewards/accuracies": 0.75, "rewards/chosen": -2.6478240489959717, "rewards/margins": 1.1799546480178833, "rewards/margins_max": 3.0921006202697754, "rewards/margins_min": -0.3414881229400635, "rewards/margins_std": 1.5059552192687988, "rewards/rejected": -3.8277785778045654, "step": 2040 }, { "epoch": 0.54, "grad_norm": 13.746130936472861, "learning_rate": 2.618747345980904e-06, "logits/chosen": -1.7731149196624756, "logits/rejected": -1.7693065404891968, "logps/chosen": -527.6044921875, "logps/rejected": -641.5843505859375, "loss": 0.5063, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.4656195640563965, "rewards/margins": 1.4169232845306396, "rewards/margins_max": 3.870797634124756, "rewards/margins_min": -0.6093031764030457, "rewards/margins_std": 1.9297654628753662, "rewards/rejected": -3.8825430870056152, "step": 2050 }, { "epoch": 0.54, "grad_norm": 9.339847834325749, "learning_rate": 2.595923867132136e-06, "logits/chosen": -1.8224531412124634, "logits/rejected": -1.7843637466430664, "logps/chosen": -557.9534301757812, "logps/rejected": -583.00537109375, "loss": 0.5349, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4725193977355957, "rewards/margins": 0.7531574368476868, "rewards/margins_max": 2.011129856109619, "rewards/margins_min": -0.27217358350753784, "rewards/margins_std": 1.0149810314178467, "rewards/rejected": -3.2256767749786377, "step": 2060 }, { "epoch": 0.54, "grad_norm": 7.0684567877198825, "learning_rate": 2.5730923786680672e-06, "logits/chosen": -1.8552172183990479, "logits/rejected": -1.847517728805542, "logps/chosen": -435.16552734375, "logps/rejected": -564.4310302734375, "loss": 0.5317, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.056243896484375, "rewards/margins": 0.7900081872940063, "rewards/margins_max": 2.6913135051727295, "rewards/margins_min": -0.8589878082275391, "rewards/margins_std": 1.6270986795425415, "rewards/rejected": -2.846251964569092, "step": 2070 }, { "epoch": 0.54, "grad_norm": 7.954256887775171, "learning_rate": 2.5502547870114137e-06, "logits/chosen": -1.9329948425292969, "logits/rejected": -1.8765723705291748, "logps/chosen": -386.27471923828125, "logps/rejected": -466.9205017089844, "loss": 0.5138, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6281421184539795, "rewards/margins": 0.6732000112533569, "rewards/margins_max": 2.2187628746032715, "rewards/margins_min": -0.7329077124595642, "rewards/margins_std": 1.3421125411987305, "rewards/rejected": -2.301342248916626, "step": 2080 }, { "epoch": 0.55, "grad_norm": 7.552457841378579, "learning_rate": 2.527412999094507e-06, "logits/chosen": -1.8681936264038086, "logits/rejected": -1.856935739517212, "logps/chosen": -418.4267578125, "logps/rejected": -540.3372802734375, "loss": 0.4595, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.489180564880371, "rewards/margins": 0.9251190423965454, "rewards/margins_max": 2.4660072326660156, "rewards/margins_min": -0.545732319355011, "rewards/margins_std": 1.330453872680664, "rewards/rejected": -2.414299488067627, "step": 2090 }, { "epoch": 0.55, "grad_norm": 6.3949941608560374, "learning_rate": 2.504568922200064e-06, "logits/chosen": -1.93172287940979, "logits/rejected": -1.9483925104141235, "logps/chosen": -400.9689636230469, "logps/rejected": -450.297119140625, "loss": 0.4879, "rewards/accuracies": 0.75, "rewards/chosen": -1.5077165365219116, "rewards/margins": 0.8865013122558594, "rewards/margins_max": 2.2574405670166016, "rewards/margins_min": -0.3013097047805786, "rewards/margins_std": 1.1515529155731201, "rewards/rejected": -2.3942177295684814, "step": 2100 }, { "epoch": 0.55, "eval_logits/chosen": -1.947603464126587, "eval_logits/rejected": -1.902673602104187, "eval_logps/chosen": -451.57318115234375, "eval_logps/rejected": -539.3309936523438, "eval_loss": 0.5043761134147644, "eval_rewards/accuracies": 0.7579365372657776, "eval_rewards/chosen": -1.6708588600158691, "eval_rewards/margins": 1.1010230779647827, "eval_rewards/margins_max": 3.5671937465667725, "eval_rewards/margins_min": -0.8762584924697876, "eval_rewards/margins_std": 1.4851690530776978, "eval_rewards/rejected": -2.7718820571899414, "eval_runtime": 223.7781, "eval_samples_per_second": 8.937, "eval_steps_per_second": 0.282, "step": 2100 }, { "epoch": 0.55, "grad_norm": 6.3290177080591095, "learning_rate": 2.4817244638019333e-06, "logits/chosen": -1.897834062576294, "logits/rejected": -1.8672163486480713, "logps/chosen": -437.18115234375, "logps/rejected": -477.0833435058594, "loss": 0.5904, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7033793926239014, "rewards/margins": 0.8508732914924622, "rewards/margins_max": 2.627293825149536, "rewards/margins_min": -0.635917067527771, "rewards/margins_std": 1.4603224992752075, "rewards/rejected": -2.554252862930298, "step": 2110 }, { "epoch": 0.55, "grad_norm": 17.353845246060562, "learning_rate": 2.4588815314058155e-06, "logits/chosen": -1.930930495262146, "logits/rejected": -1.8796344995498657, "logps/chosen": -392.33343505859375, "logps/rejected": -552.1408081054688, "loss": 0.4539, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4954102039337158, "rewards/margins": 1.4431922435760498, "rewards/margins_max": 3.2089970111846924, "rewards/margins_min": 0.1632373183965683, "rewards/margins_std": 1.3763304948806763, "rewards/rejected": -2.9386022090911865, "step": 2120 }, { "epoch": 0.56, "grad_norm": 19.618913423429333, "learning_rate": 2.4360420323899922e-06, "logits/chosen": -1.8711045980453491, "logits/rejected": -1.847334861755371, "logps/chosen": -429.6898498535156, "logps/rejected": -556.58740234375, "loss": 0.5325, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.209238052368164, "rewards/margins": 0.7704706788063049, "rewards/margins_max": 2.3539047241210938, "rewards/margins_min": -0.6908884048461914, "rewards/margins_std": 1.3556920289993286, "rewards/rejected": -2.979708433151245, "step": 2130 }, { "epoch": 0.56, "grad_norm": 8.435669651008004, "learning_rate": 2.4132078738460585e-06, "logits/chosen": -1.793500304222107, "logits/rejected": -1.7801828384399414, "logps/chosen": -518.79638671875, "logps/rejected": -633.902099609375, "loss": 0.4828, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2830471992492676, "rewards/margins": 1.2683002948760986, "rewards/margins_max": 3.003143787384033, "rewards/margins_min": -0.22637836635112762, "rewards/margins_std": 1.4759464263916016, "rewards/rejected": -3.5513477325439453, "step": 2140 }, { "epoch": 0.56, "grad_norm": 11.936372553924988, "learning_rate": 2.3903809624196826e-06, "logits/chosen": -1.8386614322662354, "logits/rejected": -1.7959644794464111, "logps/chosen": -506.428466796875, "logps/rejected": -546.7638549804688, "loss": 0.4843, "rewards/accuracies": 0.75, "rewards/chosen": -2.2123425006866455, "rewards/margins": 1.1225645542144775, "rewards/margins_max": 2.8898348808288574, "rewards/margins_min": -0.45589056611061096, "rewards/margins_std": 1.46990966796875, "rewards/rejected": -3.334906816482544, "step": 2150 }, { "epoch": 0.57, "grad_norm": 7.205554345623547, "learning_rate": 2.3675632041513978e-06, "logits/chosen": -1.672745943069458, "logits/rejected": -1.6749321222305298, "logps/chosen": -486.14947509765625, "logps/rejected": -635.6672973632812, "loss": 0.5555, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.643583059310913, "rewards/margins": 1.3591830730438232, "rewards/margins_max": 3.6960933208465576, "rewards/margins_min": -0.2757329046726227, "rewards/margins_std": 1.776414155960083, "rewards/rejected": -4.0027666091918945, "step": 2160 }, { "epoch": 0.57, "grad_norm": 9.685723123333384, "learning_rate": 2.3447565043174533e-06, "logits/chosen": -1.6676769256591797, "logits/rejected": -1.6730334758758545, "logps/chosen": -480.760498046875, "logps/rejected": -606.898193359375, "loss": 0.4386, "rewards/accuracies": 0.75, "rewards/chosen": -2.2951340675354004, "rewards/margins": 1.3565239906311035, "rewards/margins_max": 3.837153911590576, "rewards/margins_min": -0.23780480027198792, "rewards/margins_std": 1.934841513633728, "rewards/rejected": -3.651658296585083, "step": 2170 }, { "epoch": 0.57, "grad_norm": 13.792441600837032, "learning_rate": 2.321962767270724e-06, "logits/chosen": -1.9358110427856445, "logits/rejected": -1.8609025478363037, "logps/chosen": -590.2156372070312, "logps/rejected": -606.8203125, "loss": 0.5519, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.495270013809204, "rewards/margins": 1.028873085975647, "rewards/margins_max": 2.5963938236236572, "rewards/margins_min": -0.5002802610397339, "rewards/margins_std": 1.3585991859436035, "rewards/rejected": -3.5241432189941406, "step": 2180 }, { "epoch": 0.57, "grad_norm": 9.676078511531834, "learning_rate": 2.299183896281692e-06, "logits/chosen": -1.9163233041763306, "logits/rejected": -1.9151203632354736, "logps/chosen": -583.948486328125, "logps/rejected": -734.7337646484375, "loss": 0.484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.018031597137451, "rewards/margins": 1.0219669342041016, "rewards/margins_max": 3.3963630199432373, "rewards/margins_min": -1.1325465440750122, "rewards/margins_std": 1.9772894382476807, "rewards/rejected": -4.039998531341553, "step": 2190 }, { "epoch": 0.58, "grad_norm": 10.409170479828111, "learning_rate": 2.2764217933795297e-06, "logits/chosen": -1.8802111148834229, "logits/rejected": -1.754559874534607, "logps/chosen": -665.6866455078125, "logps/rejected": -725.1818237304688, "loss": 0.4949, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.011068820953369, "rewards/margins": 1.4384467601776123, "rewards/margins_max": 3.4641470909118652, "rewards/margins_min": -0.14405474066734314, "rewards/margins_std": 1.6066837310791016, "rewards/rejected": -4.4495158195495605, "step": 2200 }, { "epoch": 0.58, "eval_logits/chosen": -1.7793868780136108, "eval_logits/rejected": -1.722078800201416, "eval_logps/chosen": -605.309814453125, "eval_logps/rejected": -706.05126953125, "eval_loss": 0.4963851273059845, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -3.2082247734069824, "eval_rewards/margins": 1.2308599948883057, "eval_rewards/margins_max": 3.8909552097320557, "eval_rewards/margins_min": -1.0364949703216553, "eval_rewards/margins_std": 1.6389572620391846, "eval_rewards/rejected": -4.439084053039551, "eval_runtime": 223.999, "eval_samples_per_second": 8.929, "eval_steps_per_second": 0.281, "step": 2200 }, { "epoch": 0.58, "grad_norm": 8.64390460336023, "learning_rate": 2.2536783591932786e-06, "logits/chosen": -1.7629449367523193, "logits/rejected": -1.7550442218780518, "logps/chosen": -586.06787109375, "logps/rejected": -691.7399291992188, "loss": 0.473, "rewards/accuracies": 0.75, "rewards/chosen": -3.1987500190734863, "rewards/margins": 1.1622952222824097, "rewards/margins_max": 3.3749969005584717, "rewards/margins_min": -0.7685580253601074, "rewards/margins_std": 1.8538711071014404, "rewards/rejected": -4.3610453605651855, "step": 2210 }, { "epoch": 0.58, "grad_norm": 9.512802331759291, "learning_rate": 2.230955492793149e-06, "logits/chosen": -1.793631911277771, "logits/rejected": -1.7697986364364624, "logps/chosen": -665.8543701171875, "logps/rejected": -738.6544189453125, "loss": 0.558, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.440174102783203, "rewards/margins": 1.294777274131775, "rewards/margins_max": 3.398524522781372, "rewards/margins_min": -0.7837820053100586, "rewards/margins_std": 1.8292226791381836, "rewards/rejected": -4.734951496124268, "step": 2220 }, { "epoch": 0.58, "grad_norm": 10.359776001500686, "learning_rate": 2.208255091531947e-06, "logits/chosen": -1.7653499841690063, "logits/rejected": -1.7473958730697632, "logps/chosen": -572.9618530273438, "logps/rejected": -718.3302001953125, "loss": 0.5241, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.129748582839966, "rewards/margins": 1.316284418106079, "rewards/margins_max": 3.7096519470214844, "rewards/margins_min": -0.6207399368286133, "rewards/margins_std": 1.888008713722229, "rewards/rejected": -4.446033000946045, "step": 2230 }, { "epoch": 0.59, "grad_norm": 8.748003050585138, "learning_rate": 2.1855790508866435e-06, "logits/chosen": -1.837742567062378, "logits/rejected": -1.7599071264266968, "logps/chosen": -547.1070556640625, "logps/rejected": -591.6936645507812, "loss": 0.5649, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5880513191223145, "rewards/margins": 0.9630716443061829, "rewards/margins_max": 2.767690658569336, "rewards/margins_min": -0.5868943929672241, "rewards/margins_std": 1.4996631145477295, "rewards/rejected": -3.5511226654052734, "step": 2240 }, { "epoch": 0.59, "grad_norm": 10.77394807641657, "learning_rate": 2.162929264300107e-06, "logits/chosen": -1.8438403606414795, "logits/rejected": -1.8424497842788696, "logps/chosen": -536.0184326171875, "logps/rejected": -598.9551391601562, "loss": 0.5161, "rewards/accuracies": 0.75, "rewards/chosen": -2.484246015548706, "rewards/margins": 1.0834156274795532, "rewards/margins_max": 2.664710760116577, "rewards/margins_min": -0.3716050386428833, "rewards/margins_std": 1.3771653175354004, "rewards/rejected": -3.567661762237549, "step": 2250 }, { "epoch": 0.59, "grad_norm": 9.178289454710672, "learning_rate": 2.1403076230230006e-06, "logits/chosen": -1.8396503925323486, "logits/rejected": -1.7743980884552002, "logps/chosen": -457.23345947265625, "logps/rejected": -532.2266845703125, "loss": 0.4912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.158998489379883, "rewards/margins": 0.8472871780395508, "rewards/margins_max": 2.085463762283325, "rewards/margins_min": -0.24042072892189026, "rewards/margins_std": 1.056386947631836, "rewards/rejected": -3.0062851905822754, "step": 2260 }, { "epoch": 0.59, "grad_norm": 6.430644115976088, "learning_rate": 2.11771601595586e-06, "logits/chosen": -1.867622971534729, "logits/rejected": -1.7660083770751953, "logps/chosen": -497.2547302246094, "logps/rejected": -605.922119140625, "loss": 0.4731, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.338486909866333, "rewards/margins": 1.1574230194091797, "rewards/margins_max": 2.6994240283966064, "rewards/margins_min": -0.5049154758453369, "rewards/margins_std": 1.4566012620925903, "rewards/rejected": -3.4959099292755127, "step": 2270 }, { "epoch": 0.6, "grad_norm": 9.606525873356546, "learning_rate": 2.0951563294913737e-06, "logits/chosen": -1.9024595022201538, "logits/rejected": -1.8611414432525635, "logps/chosen": -520.485107421875, "logps/rejected": -652.41943359375, "loss": 0.4821, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6090407371520996, "rewards/margins": 1.2652584314346313, "rewards/margins_max": 3.0792763233184814, "rewards/margins_min": -0.17694154381752014, "rewards/margins_std": 1.4624518156051636, "rewards/rejected": -3.8742992877960205, "step": 2280 }, { "epoch": 0.6, "grad_norm": 9.817270467827427, "learning_rate": 2.0726304473568693e-06, "logits/chosen": -1.7594677209854126, "logits/rejected": -1.6445410251617432, "logps/chosen": -496.80255126953125, "logps/rejected": -619.5770263671875, "loss": 0.5179, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7014451026916504, "rewards/margins": 1.3004566431045532, "rewards/margins_max": 3.133958101272583, "rewards/margins_min": -0.24251993000507355, "rewards/margins_std": 1.5186634063720703, "rewards/rejected": -4.001902103424072, "step": 2290 }, { "epoch": 0.6, "grad_norm": 4.744509887144706, "learning_rate": 2.050140250457023e-06, "logits/chosen": -1.7752516269683838, "logits/rejected": -1.7905902862548828, "logps/chosen": -521.9744873046875, "logps/rejected": -581.05615234375, "loss": 0.5796, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.660521984100342, "rewards/margins": 0.6906440258026123, "rewards/margins_max": 2.2872939109802246, "rewards/margins_min": -0.6655935049057007, "rewards/margins_std": 1.3252170085906982, "rewards/rejected": -3.3511664867401123, "step": 2300 }, { "epoch": 0.6, "eval_logits/chosen": -1.8422073125839233, "eval_logits/rejected": -1.7895538806915283, "eval_logps/chosen": -554.2051391601562, "eval_logps/rejected": -633.1115112304688, "eval_loss": 0.49897539615631104, "eval_rewards/accuracies": 0.7896825671195984, "eval_rewards/chosen": -2.6971774101257324, "eval_rewards/margins": 1.012508749961853, "eval_rewards/margins_max": 3.21997332572937, "eval_rewards/margins_min": -0.8780794143676758, "eval_rewards/margins_std": 1.3552417755126953, "eval_rewards/rejected": -3.709686517715454, "eval_runtime": 223.8301, "eval_samples_per_second": 8.935, "eval_steps_per_second": 0.281, "step": 2300 }, { "epoch": 0.6, "grad_norm": 10.01630369555954, "learning_rate": 2.0276876167168042e-06, "logits/chosen": -1.9062496423721313, "logits/rejected": -1.7946100234985352, "logps/chosen": -600.8902587890625, "logps/rejected": -609.3153076171875, "loss": 0.5016, "rewards/accuracies": 0.75, "rewards/chosen": -2.7362606525421143, "rewards/margins": 0.909152626991272, "rewards/margins_max": 2.4279086589813232, "rewards/margins_min": -0.40702682733535767, "rewards/margins_std": 1.295883297920227, "rewards/rejected": -3.6454129219055176, "step": 2310 }, { "epoch": 0.61, "grad_norm": 10.122271888444498, "learning_rate": 2.0052744209246682e-06, "logits/chosen": -1.799827218055725, "logits/rejected": -1.7454811334609985, "logps/chosen": -562.8942260742188, "logps/rejected": -630.7413330078125, "loss": 0.4904, "rewards/accuracies": 0.625, "rewards/chosen": -2.797306776046753, "rewards/margins": 0.8095881342887878, "rewards/margins_max": 2.618866443634033, "rewards/margins_min": -0.7540814280509949, "rewards/margins_std": 1.5205354690551758, "rewards/rejected": -3.6068947315216064, "step": 2320 }, { "epoch": 0.61, "grad_norm": 7.397973695615941, "learning_rate": 1.9829025345760127e-06, "logits/chosen": -1.8734874725341797, "logits/rejected": -1.8497989177703857, "logps/chosen": -564.4740600585938, "logps/rejected": -696.8545532226562, "loss": 0.4584, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6428911685943604, "rewards/margins": 1.1481060981750488, "rewards/margins_max": 2.7928614616394043, "rewards/margins_min": -0.4474239945411682, "rewards/margins_std": 1.4305777549743652, "rewards/rejected": -3.790997266769409, "step": 2330 }, { "epoch": 0.61, "grad_norm": 6.031035582530906, "learning_rate": 1.9605738257169115e-06, "logits/chosen": -1.794083833694458, "logits/rejected": -1.7303495407104492, "logps/chosen": -671.5252075195312, "logps/rejected": -743.9724731445312, "loss": 0.4865, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.3779196739196777, "rewards/margins": 0.760689914226532, "rewards/margins_max": 2.4640345573425293, "rewards/margins_min": -0.873823344707489, "rewards/margins_std": 1.454201102256775, "rewards/rejected": -4.138609409332275, "step": 2340 }, { "epoch": 0.62, "grad_norm": 18.12584521197391, "learning_rate": 1.9382901587881275e-06, "logits/chosen": -1.7585132122039795, "logits/rejected": -1.7522485256195068, "logps/chosen": -611.8785400390625, "logps/rejected": -634.1657104492188, "loss": 0.6065, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.008549213409424, "rewards/margins": 0.7759478688240051, "rewards/margins_max": 2.2986416816711426, "rewards/margins_min": -0.4979016184806824, "rewards/margins_std": 1.2732738256454468, "rewards/rejected": -3.784497022628784, "step": 2350 }, { "epoch": 0.62, "grad_norm": 6.8102088365168845, "learning_rate": 1.916053394469437e-06, "logits/chosen": -1.724387764930725, "logits/rejected": -1.6554744243621826, "logps/chosen": -535.9744873046875, "logps/rejected": -548.0862426757812, "loss": 0.4863, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4983162879943848, "rewards/margins": 0.8426491022109985, "rewards/margins_max": 2.3499655723571777, "rewards/margins_min": -0.4870535731315613, "rewards/margins_std": 1.2618153095245361, "rewards/rejected": -3.3409652709960938, "step": 2360 }, { "epoch": 0.62, "grad_norm": 9.24590974019475, "learning_rate": 1.8938653895242604e-06, "logits/chosen": -1.8013556003570557, "logits/rejected": -1.7526042461395264, "logps/chosen": -539.6033935546875, "logps/rejected": -586.5892944335938, "loss": 0.5438, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.828014612197876, "rewards/margins": 0.6742250323295593, "rewards/margins_max": 2.236769199371338, "rewards/margins_min": -0.8859254717826843, "rewards/margins_std": 1.4273836612701416, "rewards/rejected": -3.502239942550659, "step": 2370 }, { "epoch": 0.62, "grad_norm": 8.063134929880619, "learning_rate": 1.8717279966446267e-06, "logits/chosen": -1.7946360111236572, "logits/rejected": -1.7339853048324585, "logps/chosen": -541.4672241210938, "logps/rejected": -583.0847778320312, "loss": 0.5193, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.96260142326355, "rewards/margins": 0.7402364015579224, "rewards/margins_max": 2.3831615447998047, "rewards/margins_min": -0.7004754543304443, "rewards/margins_std": 1.452028751373291, "rewards/rejected": -3.7028377056121826, "step": 2380 }, { "epoch": 0.63, "grad_norm": 8.666529532105649, "learning_rate": 1.8496430642964698e-06, "logits/chosen": -1.810516357421875, "logits/rejected": -1.7885147333145142, "logps/chosen": -553.4658813476562, "logps/rejected": -670.6603393554688, "loss": 0.4714, "rewards/accuracies": 0.75, "rewards/chosen": -2.7073307037353516, "rewards/margins": 1.0841028690338135, "rewards/margins_max": 3.004697322845459, "rewards/margins_min": -0.5229413509368896, "rewards/margins_std": 1.5479271411895752, "rewards/rejected": -3.7914340496063232, "step": 2390 }, { "epoch": 0.63, "grad_norm": 5.674980721461955, "learning_rate": 1.827612436565286e-06, "logits/chosen": -1.6737396717071533, "logits/rejected": -1.5853664875030518, "logps/chosen": -599.1856079101562, "logps/rejected": -733.3807983398438, "loss": 0.5492, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2337844371795654, "rewards/margins": 1.08359694480896, "rewards/margins_max": 2.6933255195617676, "rewards/margins_min": -0.19332584738731384, "rewards/margins_std": 1.31035578250885, "rewards/rejected": -4.317381381988525, "step": 2400 }, { "epoch": 0.63, "eval_logits/chosen": -1.6768298149108887, "eval_logits/rejected": -1.617042899131775, "eval_logps/chosen": -631.1837768554688, "eval_logps/rejected": -712.3121948242188, "eval_loss": 0.49689507484436035, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -3.4669647216796875, "eval_rewards/margins": 1.0347282886505127, "eval_rewards/margins_max": 3.3130176067352295, "eval_rewards/margins_min": -0.9050403237342834, "eval_rewards/margins_std": 1.3961678743362427, "eval_rewards/rejected": -4.501692771911621, "eval_runtime": 223.772, "eval_samples_per_second": 8.938, "eval_steps_per_second": 0.282, "step": 2400 }, { "epoch": 0.63, "grad_norm": 14.17619296643088, "learning_rate": 1.8056379530021492e-06, "logits/chosen": -1.6100637912750244, "logits/rejected": -1.549525260925293, "logps/chosen": -661.2942504882812, "logps/rejected": -691.6043090820312, "loss": 0.4889, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.555372714996338, "rewards/margins": 0.7949284315109253, "rewards/margins_max": 2.798487663269043, "rewards/margins_min": -0.6581093072891235, "rewards/margins_std": 1.5440999269485474, "rewards/rejected": -4.350301265716553, "step": 2410 }, { "epoch": 0.63, "grad_norm": 7.696426295506764, "learning_rate": 1.7837214484701154e-06, "logits/chosen": -1.679632544517517, "logits/rejected": -1.5904133319854736, "logps/chosen": -705.5847778320312, "logps/rejected": -729.8447265625, "loss": 0.4952, "rewards/accuracies": 0.75, "rewards/chosen": -3.468238353729248, "rewards/margins": 0.8539142608642578, "rewards/margins_max": 2.3275365829467773, "rewards/margins_min": -0.7979949712753296, "rewards/margins_std": 1.3870662450790405, "rewards/rejected": -4.322152614593506, "step": 2420 }, { "epoch": 0.64, "grad_norm": 7.816764949134618, "learning_rate": 1.7618647529910043e-06, "logits/chosen": -1.6926149129867554, "logits/rejected": -1.6591085195541382, "logps/chosen": -622.9351806640625, "logps/rejected": -674.9368896484375, "loss": 0.5375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.306126117706299, "rewards/margins": 0.820782482624054, "rewards/margins_max": 2.3384017944335938, "rewards/margins_min": -0.3495040535926819, "rewards/margins_std": 1.222424030303955, "rewards/rejected": -4.126908302307129, "step": 2430 }, { "epoch": 0.64, "grad_norm": 11.965320065812856, "learning_rate": 1.7400696915925996e-06, "logits/chosen": -1.614381194114685, "logits/rejected": -1.5288515090942383, "logps/chosen": -602.1192626953125, "logps/rejected": -655.6165161132812, "loss": 0.4863, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0914385318756104, "rewards/margins": 0.8890897631645203, "rewards/margins_max": 2.5146360397338867, "rewards/margins_min": -0.4105199873447418, "rewards/margins_std": 1.3256951570510864, "rewards/rejected": -3.980527877807617, "step": 2440 }, { "epoch": 0.64, "grad_norm": 8.273082462578092, "learning_rate": 1.718338084156254e-06, "logits/chosen": -1.8074671030044556, "logits/rejected": -1.8247467279434204, "logps/chosen": -612.3919677734375, "logps/rejected": -734.3466796875, "loss": 0.5505, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.167001247406006, "rewards/margins": 0.7180094122886658, "rewards/margins_max": 2.4003820419311523, "rewards/margins_min": -0.7094007730484009, "rewards/margins_std": 1.442274808883667, "rewards/rejected": -3.8850104808807373, "step": 2450 }, { "epoch": 0.64, "grad_norm": 18.731187665426393, "learning_rate": 1.6966717452649372e-06, "logits/chosen": -1.666565179824829, "logits/rejected": -1.6667665243148804, "logps/chosen": -541.8519897460938, "logps/rejected": -657.0438842773438, "loss": 0.4646, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.970785617828369, "rewards/margins": 1.208904504776001, "rewards/margins_max": 3.222243547439575, "rewards/margins_min": -0.5427895188331604, "rewards/margins_std": 1.6539669036865234, "rewards/rejected": -4.179690361022949, "step": 2460 }, { "epoch": 0.65, "grad_norm": 5.877573301960757, "learning_rate": 1.6750724840517103e-06, "logits/chosen": -1.7284990549087524, "logits/rejected": -1.7214981317520142, "logps/chosen": -634.5133056640625, "logps/rejected": -721.4016723632812, "loss": 0.4995, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.934875011444092, "rewards/margins": 1.3128116130828857, "rewards/margins_max": 3.1986725330352783, "rewards/margins_min": -0.1934700310230255, "rewards/margins_std": 1.5272159576416016, "rewards/rejected": -4.247686386108398, "step": 2470 }, { "epoch": 0.65, "grad_norm": 13.384115090029825, "learning_rate": 1.6535421040486686e-06, "logits/chosen": -1.8477451801300049, "logits/rejected": -1.80193293094635, "logps/chosen": -602.8226318359375, "logps/rejected": -631.3554077148438, "loss": 0.4964, "rewards/accuracies": 0.75, "rewards/chosen": -2.9916718006134033, "rewards/margins": 0.8356207013130188, "rewards/margins_max": 2.185746431350708, "rewards/margins_min": -0.6384302973747253, "rewards/margins_std": 1.2795166969299316, "rewards/rejected": -3.8272922039031982, "step": 2480 }, { "epoch": 0.65, "grad_norm": 8.559331822921857, "learning_rate": 1.6320824030363458e-06, "logits/chosen": -1.5903794765472412, "logits/rejected": -1.5187129974365234, "logps/chosen": -610.0673828125, "logps/rejected": -706.4861450195312, "loss": 0.4101, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.7773256301879883, "rewards/margins": 1.624315619468689, "rewards/margins_max": 3.7116127014160156, "rewards/margins_min": -0.19847269356250763, "rewards/margins_std": 1.7820628881454468, "rewards/rejected": -4.401640892028809, "step": 2490 }, { "epoch": 0.65, "grad_norm": 13.187341481200708, "learning_rate": 1.6106951728936028e-06, "logits/chosen": -1.6185038089752197, "logits/rejected": -1.6205253601074219, "logps/chosen": -594.1221313476562, "logps/rejected": -762.8563232421875, "loss": 0.4667, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.3846652507781982, "rewards/margins": 1.3989508152008057, "rewards/margins_max": 3.4546706676483154, "rewards/margins_min": -0.4026867747306824, "rewards/margins_std": 1.6792147159576416, "rewards/rejected": -4.783616542816162, "step": 2500 }, { "epoch": 0.65, "eval_logits/chosen": -1.648980975151062, "eval_logits/rejected": -1.5864522457122803, "eval_logps/chosen": -643.1785278320312, "eval_logps/rejected": -751.5126342773438, "eval_loss": 0.5004372000694275, "eval_rewards/accuracies": 0.7817460298538208, "eval_rewards/chosen": -3.586912155151367, "eval_rewards/margins": 1.3067864179611206, "eval_rewards/margins_max": 4.140169620513916, "eval_rewards/margins_min": -1.0666066408157349, "eval_rewards/margins_std": 1.7417904138565063, "eval_rewards/rejected": -4.893698692321777, "eval_runtime": 223.7376, "eval_samples_per_second": 8.939, "eval_steps_per_second": 0.282, "step": 2500 }, { "epoch": 0.66, "grad_norm": 8.220099160740236, "learning_rate": 1.5893821994479996e-06, "logits/chosen": -1.4632102251052856, "logits/rejected": -1.39264976978302, "logps/chosen": -637.5317993164062, "logps/rejected": -764.3389282226562, "loss": 0.4615, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8055801391601562, "rewards/margins": 1.4307246208190918, "rewards/margins_max": 3.444718837738037, "rewards/margins_min": -0.5087088346481323, "rewards/margins_std": 1.773031234741211, "rewards/rejected": -5.23630428314209, "step": 2510 }, { "epoch": 0.66, "grad_norm": 19.38717091894107, "learning_rate": 1.5681452623266868e-06, "logits/chosen": -1.605120062828064, "logits/rejected": -1.5043137073516846, "logps/chosen": -641.0545043945312, "logps/rejected": -793.5194091796875, "loss": 0.5487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.7998619079589844, "rewards/margins": 1.7778517007827759, "rewards/margins_max": 4.009039878845215, "rewards/margins_min": -0.3757677972316742, "rewards/margins_std": 1.9792906045913696, "rewards/rejected": -5.577713966369629, "step": 2520 }, { "epoch": 0.66, "grad_norm": 6.746881965750226, "learning_rate": 1.5469861348078014e-06, "logits/chosen": -1.689014196395874, "logits/rejected": -1.7171109914779663, "logps/chosen": -610.9246215820312, "logps/rejected": -767.1832275390625, "loss": 0.4454, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.270087718963623, "rewards/margins": 1.442313551902771, "rewards/margins_max": 3.706516742706299, "rewards/margins_min": -0.17674295604228973, "rewards/margins_std": 1.747992753982544, "rewards/rejected": -4.712401390075684, "step": 2530 }, { "epoch": 0.66, "grad_norm": 10.469678421984394, "learning_rate": 1.5259065836724035e-06, "logits/chosen": -1.6529514789581299, "logits/rejected": -1.6404564380645752, "logps/chosen": -584.8947143554688, "logps/rejected": -726.0347900390625, "loss": 0.4837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9987447261810303, "rewards/margins": 1.0250074863433838, "rewards/margins_max": 2.9397072792053223, "rewards/margins_min": -0.7015265226364136, "rewards/margins_std": 1.5828959941864014, "rewards/rejected": -4.023752689361572, "step": 2540 }, { "epoch": 0.67, "grad_norm": 9.601710294326312, "learning_rate": 1.5049083690569456e-06, "logits/chosen": -1.6596221923828125, "logits/rejected": -1.6530039310455322, "logps/chosen": -539.2584838867188, "logps/rejected": -656.8763427734375, "loss": 0.4665, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.646554470062256, "rewards/margins": 1.1216983795166016, "rewards/margins_max": 2.779221534729004, "rewards/margins_min": -0.3609750270843506, "rewards/margins_std": 1.4098312854766846, "rewards/rejected": -3.7682528495788574, "step": 2550 }, { "epoch": 0.67, "grad_norm": 43.1869912234865, "learning_rate": 1.4839932443063057e-06, "logits/chosen": -1.4627584218978882, "logits/rejected": -1.3543694019317627, "logps/chosen": -527.3870849609375, "logps/rejected": -689.0338134765625, "loss": 0.4649, "rewards/accuracies": 0.75, "rewards/chosen": -3.2730720043182373, "rewards/margins": 1.717287302017212, "rewards/margins_max": 4.287243843078613, "rewards/margins_min": -0.4691585600376129, "rewards/margins_std": 2.157824993133545, "rewards/rejected": -4.990359306335449, "step": 2560 }, { "epoch": 0.67, "grad_norm": 15.821391745594891, "learning_rate": 1.4631629558273803e-06, "logits/chosen": -1.745295524597168, "logits/rejected": -1.673452377319336, "logps/chosen": -603.0993041992188, "logps/rejected": -731.6232299804688, "loss": 0.5457, "rewards/accuracies": 0.75, "rewards/chosen": -3.175670623779297, "rewards/margins": 1.362243413925171, "rewards/margins_max": 3.482689619064331, "rewards/margins_min": -0.6397066116333008, "rewards/margins_std": 1.8990733623504639, "rewards/rejected": -4.537913799285889, "step": 2570 }, { "epoch": 0.68, "grad_norm": 14.895758497499223, "learning_rate": 1.4424192429432657e-06, "logits/chosen": -1.778455376625061, "logits/rejected": -1.72928786277771, "logps/chosen": -512.3264770507812, "logps/rejected": -683.111572265625, "loss": 0.4482, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.436824083328247, "rewards/margins": 1.6381092071533203, "rewards/margins_max": 3.8671488761901855, "rewards/margins_min": -0.26642045378685, "rewards/margins_std": 1.8812748193740845, "rewards/rejected": -4.074933052062988, "step": 2580 }, { "epoch": 0.68, "grad_norm": 14.826274149339314, "learning_rate": 1.421763837748016e-06, "logits/chosen": -1.7284568548202515, "logits/rejected": -1.6716560125350952, "logps/chosen": -587.1439819335938, "logps/rejected": -640.3153686523438, "loss": 0.5416, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8739173412323, "rewards/margins": 1.1328890323638916, "rewards/margins_max": 3.5089080333709717, "rewards/margins_min": -0.863398551940918, "rewards/margins_std": 2.02476167678833, "rewards/rejected": -4.006806373596191, "step": 2590 }, { "epoch": 0.68, "grad_norm": 5.3847061170542085, "learning_rate": 1.401198464962021e-06, "logits/chosen": -1.6797735691070557, "logits/rejected": -1.729544997215271, "logps/chosen": -499.35540771484375, "logps/rejected": -591.1624755859375, "loss": 0.5777, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6993186473846436, "rewards/margins": 0.5869218111038208, "rewards/margins_max": 2.535921573638916, "rewards/margins_min": -1.3118431568145752, "rewards/margins_std": 1.682206392288208, "rewards/rejected": -3.286240339279175, "step": 2600 }, { "epoch": 0.68, "eval_logits/chosen": -1.7948791980743408, "eval_logits/rejected": -1.7399200201034546, "eval_logps/chosen": -524.6261596679688, "eval_logps/rejected": -615.5330200195312, "eval_loss": 0.4974254071712494, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": -2.401388168334961, "eval_rewards/margins": 1.1325136423110962, "eval_rewards/margins_max": 3.506291627883911, "eval_rewards/margins_min": -0.9035375714302063, "eval_rewards/margins_std": 1.4860153198242188, "eval_rewards/rejected": -3.5339019298553467, "eval_runtime": 223.7554, "eval_samples_per_second": 8.938, "eval_steps_per_second": 0.282, "step": 2600 }, { "epoch": 0.68, "grad_norm": 12.4989513720308, "learning_rate": 1.3807248417879896e-06, "logits/chosen": -1.7821886539459229, "logits/rejected": -1.6981433629989624, "logps/chosen": -475.63153076171875, "logps/rejected": -483.95367431640625, "loss": 0.5224, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.143188953399658, "rewards/margins": 0.8126838803291321, "rewards/margins_max": 2.5126216411590576, "rewards/margins_min": -0.5582553744316101, "rewards/margins_std": 1.3672950267791748, "rewards/rejected": -2.9558730125427246, "step": 2610 }, { "epoch": 0.69, "grad_norm": 11.596455631059216, "learning_rate": 1.3603446777675665e-06, "logits/chosen": -1.6486494541168213, "logits/rejected": -1.645790457725525, "logps/chosen": -529.7800903320312, "logps/rejected": -628.7935791015625, "loss": 0.5625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.7899270057678223, "rewards/margins": 0.8751605153083801, "rewards/margins_max": 3.164823055267334, "rewards/margins_min": -0.9512839317321777, "rewards/margins_std": 1.8637739419937134, "rewards/rejected": -3.6650872230529785, "step": 2620 }, { "epoch": 0.69, "grad_norm": 9.09089872879825, "learning_rate": 1.3400596746385817e-06, "logits/chosen": -1.7600370645523071, "logits/rejected": -1.7705967426300049, "logps/chosen": -488.9796447753906, "logps/rejected": -557.3919677734375, "loss": 0.52, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.316655158996582, "rewards/margins": 0.7720333337783813, "rewards/margins_max": 2.3772032260894775, "rewards/margins_min": -0.6269299983978271, "rewards/margins_std": 1.315580129623413, "rewards/rejected": -3.088689088821411, "step": 2630 }, { "epoch": 0.69, "grad_norm": 11.188045211378771, "learning_rate": 1.3198715261929587e-06, "logits/chosen": -1.8741172552108765, "logits/rejected": -1.8811876773834229, "logps/chosen": -568.7373046875, "logps/rejected": -598.1157836914062, "loss": 0.4564, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5282630920410156, "rewards/margins": 0.5198124647140503, "rewards/margins_max": 2.035139560699463, "rewards/margins_min": -0.5814923048019409, "rewards/margins_std": 1.1748483180999756, "rewards/rejected": -3.0480754375457764, "step": 2640 }, { "epoch": 0.69, "grad_norm": 8.491104857104046, "learning_rate": 1.2997819181352823e-06, "logits/chosen": -1.7905391454696655, "logits/rejected": -1.666648507118225, "logps/chosen": -482.5135803222656, "logps/rejected": -640.7384033203125, "loss": 0.3936, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.3767693042755127, "rewards/margins": 1.8853709697723389, "rewards/margins_max": 4.066248416900635, "rewards/margins_min": 0.27001017332077026, "rewards/margins_std": 1.7164947986602783, "rewards/rejected": -4.262140274047852, "step": 2650 }, { "epoch": 0.7, "grad_norm": 8.21507221508268, "learning_rate": 1.2797925279420454e-06, "logits/chosen": -1.7390565872192383, "logits/rejected": -1.7045948505401611, "logps/chosen": -638.9325561523438, "logps/rejected": -676.6422119140625, "loss": 0.4989, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.088590145111084, "rewards/margins": 1.1583211421966553, "rewards/margins_max": 3.015636444091797, "rewards/margins_min": -0.4094787538051605, "rewards/margins_std": 1.5519349575042725, "rewards/rejected": -4.246911525726318, "step": 2660 }, { "epoch": 0.7, "grad_norm": 15.890215453025005, "learning_rate": 1.2599050247215764e-06, "logits/chosen": -1.7949869632720947, "logits/rejected": -1.6761634349822998, "logps/chosen": -638.4456787109375, "logps/rejected": -664.9205322265625, "loss": 0.5011, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4901108741760254, "rewards/margins": 0.8229379653930664, "rewards/margins_max": 2.494372606277466, "rewards/margins_min": -0.629460334777832, "rewards/margins_std": 1.347444772720337, "rewards/rejected": -4.31304931640625, "step": 2670 }, { "epoch": 0.7, "grad_norm": 9.92173274381287, "learning_rate": 1.2401210690746705e-06, "logits/chosen": -1.8374378681182861, "logits/rejected": -1.785504937171936, "logps/chosen": -659.3348388671875, "logps/rejected": -838.7967529296875, "loss": 0.5057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1054844856262207, "rewards/margins": 1.5101269483566284, "rewards/margins_max": 3.4319984912872314, "rewards/margins_min": -0.3490764796733856, "rewards/margins_std": 1.7290360927581787, "rewards/rejected": -4.6156110763549805, "step": 2680 }, { "epoch": 0.7, "grad_norm": 8.29047429953969, "learning_rate": 1.2204423129559306e-06, "logits/chosen": -1.5949240922927856, "logits/rejected": -1.5438302755355835, "logps/chosen": -461.20257568359375, "logps/rejected": -651.9647216796875, "loss": 0.5037, "rewards/accuracies": 0.75, "rewards/chosen": -2.6225733757019043, "rewards/margins": 1.5829538106918335, "rewards/margins_max": 3.958186388015747, "rewards/margins_min": -0.25734782218933105, "rewards/margins_std": 1.9294646978378296, "rewards/rejected": -4.205527305603027, "step": 2690 }, { "epoch": 0.71, "grad_norm": 8.94755873817249, "learning_rate": 1.20087039953583e-06, "logits/chosen": -1.7123409509658813, "logits/rejected": -1.640838623046875, "logps/chosen": -537.6624145507812, "logps/rejected": -610.6798095703125, "loss": 0.5021, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.9066245555877686, "rewards/margins": 0.8452857732772827, "rewards/margins_max": 2.763786792755127, "rewards/margins_min": -0.7039943933486938, "rewards/margins_std": 1.590510368347168, "rewards/rejected": -3.7519099712371826, "step": 2700 }, { "epoch": 0.71, "eval_logits/chosen": -1.798783302307129, "eval_logits/rejected": -1.7427352666854858, "eval_logps/chosen": -550.4240112304688, "eval_logps/rejected": -643.9044799804688, "eval_loss": 0.4927297532558441, "eval_rewards/accuracies": 0.7797619104385376, "eval_rewards/chosen": -2.659365653991699, "eval_rewards/margins": 1.1582502126693726, "eval_rewards/margins_max": 3.6118555068969727, "eval_rewards/margins_min": -0.9273214340209961, "eval_rewards/margins_std": 1.511755347251892, "eval_rewards/rejected": -3.8176157474517822, "eval_runtime": 223.7786, "eval_samples_per_second": 8.937, "eval_steps_per_second": 0.282, "step": 2700 }, { "epoch": 0.71, "grad_norm": 9.132039738211471, "learning_rate": 1.181406963063507e-06, "logits/chosen": -1.9388900995254517, "logits/rejected": -1.8724075555801392, "logps/chosen": -528.9666748046875, "logps/rejected": -602.7315673828125, "loss": 0.5005, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4224541187286377, "rewards/margins": 0.8401997685432434, "rewards/margins_max": 2.2993102073669434, "rewards/margins_min": -0.40038639307022095, "rewards/margins_std": 1.1814854145050049, "rewards/rejected": -3.2626540660858154, "step": 2710 }, { "epoch": 0.71, "grad_norm": 10.461577096065108, "learning_rate": 1.1620536287303052e-06, "logits/chosen": -1.8010860681533813, "logits/rejected": -1.7487695217132568, "logps/chosen": -488.7469787597656, "logps/rejected": -605.3674926757812, "loss": 0.4508, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.678387403488159, "rewards/margins": 1.00853431224823, "rewards/margins_max": 2.4045920372009277, "rewards/margins_min": -0.20002785325050354, "rewards/margins_std": 1.1775511503219604, "rewards/rejected": -3.686922073364258, "step": 2720 }, { "epoch": 0.71, "grad_norm": 9.255189831567982, "learning_rate": 1.1428120125340717e-06, "logits/chosen": -1.6700589656829834, "logits/rejected": -1.6555038690567017, "logps/chosen": -580.2583618164062, "logps/rejected": -665.9177856445312, "loss": 0.5136, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.4526352882385254, "rewards/margins": 1.1402947902679443, "rewards/margins_max": 2.8333468437194824, "rewards/margins_min": -0.364024817943573, "rewards/margins_std": 1.4530102014541626, "rewards/rejected": -3.592930316925049, "step": 2730 }, { "epoch": 0.72, "grad_norm": 13.18346932811658, "learning_rate": 1.123683721144223e-06, "logits/chosen": -1.783735990524292, "logits/rejected": -1.790966272354126, "logps/chosen": -510.520263671875, "logps/rejected": -649.2190551757812, "loss": 0.568, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.867238998413086, "rewards/margins": 1.0030138492584229, "rewards/margins_max": 2.987144947052002, "rewards/margins_min": -0.4442862570285797, "rewards/margins_std": 1.552220344543457, "rewards/rejected": -3.870253086090088, "step": 2740 }, { "epoch": 0.72, "grad_norm": 7.430665025073042, "learning_rate": 1.1046703517675848e-06, "logits/chosen": -1.7778904438018799, "logits/rejected": -1.7467219829559326, "logps/chosen": -554.9721069335938, "logps/rejected": -625.0889282226562, "loss": 0.5419, "rewards/accuracies": 0.75, "rewards/chosen": -2.7956855297088623, "rewards/margins": 1.134925365447998, "rewards/margins_max": 2.823403835296631, "rewards/margins_min": -0.4319306015968323, "rewards/margins_std": 1.54230797290802, "rewards/rejected": -3.9306106567382812, "step": 2750 }, { "epoch": 0.72, "grad_norm": 7.398320971561195, "learning_rate": 1.085773492015028e-06, "logits/chosen": -1.830636978149414, "logits/rejected": -1.6999847888946533, "logps/chosen": -631.6650390625, "logps/rejected": -667.5723266601562, "loss": 0.4824, "rewards/accuracies": 0.75, "rewards/chosen": -2.6147685050964355, "rewards/margins": 1.1139171123504639, "rewards/margins_max": 2.9078335762023926, "rewards/margins_min": -0.3242526054382324, "rewards/margins_std": 1.4677479267120361, "rewards/rejected": -3.7286853790283203, "step": 2760 }, { "epoch": 0.72, "grad_norm": 10.688949397447413, "learning_rate": 1.0669947197689034e-06, "logits/chosen": -1.8538596630096436, "logits/rejected": -1.7625200748443604, "logps/chosen": -608.8543701171875, "logps/rejected": -689.6722412109375, "loss": 0.521, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0202476978302, "rewards/margins": 1.1823759078979492, "rewards/margins_max": 3.192807912826538, "rewards/margins_min": -0.29976776242256165, "rewards/margins_std": 1.555111289024353, "rewards/rejected": -4.2026238441467285, "step": 2770 }, { "epoch": 0.73, "grad_norm": 11.09035420628658, "learning_rate": 1.048335603051291e-06, "logits/chosen": -1.671316146850586, "logits/rejected": -1.594208836555481, "logps/chosen": -645.6860961914062, "logps/rejected": -641.4315795898438, "loss": 0.5044, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0239338874816895, "rewards/margins": 1.0173701047897339, "rewards/margins_max": 2.5271613597869873, "rewards/margins_min": -0.36355680227279663, "rewards/margins_std": 1.2949140071868896, "rewards/rejected": -4.041304111480713, "step": 2780 }, { "epoch": 0.73, "grad_norm": 8.6694394102214, "learning_rate": 1.0297976998930665e-06, "logits/chosen": -1.883152723312378, "logits/rejected": -1.7518794536590576, "logps/chosen": -656.6826171875, "logps/rejected": -707.6712036132812, "loss": 0.4583, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9542698860168457, "rewards/margins": 1.2434988021850586, "rewards/margins_max": 3.0474817752838135, "rewards/margins_min": -0.1256110966205597, "rewards/margins_std": 1.3922723531723022, "rewards/rejected": -4.197768688201904, "step": 2790 }, { "epoch": 0.73, "grad_norm": 11.403546072915626, "learning_rate": 1.0113825582038078e-06, "logits/chosen": -1.6805000305175781, "logits/rejected": -1.659209966659546, "logps/chosen": -595.3116455078125, "logps/rejected": -674.968017578125, "loss": 0.5332, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.5431320667266846, "rewards/margins": 0.8787435293197632, "rewards/margins_max": 3.0597167015075684, "rewards/margins_min": -0.6807445287704468, "rewards/margins_std": 1.670896291732788, "rewards/rejected": -4.421875953674316, "step": 2800 }, { "epoch": 0.73, "eval_logits/chosen": -1.7143610715866089, "eval_logits/rejected": -1.6555330753326416, "eval_logps/chosen": -608.6549072265625, "eval_logps/rejected": -705.573486328125, "eval_loss": 0.49046725034713745, "eval_rewards/accuracies": 0.7817460298538208, "eval_rewards/chosen": -3.2416746616363525, "eval_rewards/margins": 1.1926313638687134, "eval_rewards/margins_max": 3.715873956680298, "eval_rewards/margins_min": -0.9638537764549255, "eval_rewards/margins_std": 1.5555708408355713, "eval_rewards/rejected": -4.4343061447143555, "eval_runtime": 223.8319, "eval_samples_per_second": 8.935, "eval_steps_per_second": 0.281, "step": 2800 }, { "epoch": 0.74, "grad_norm": 10.189803348698947, "learning_rate": 9.930917156425477e-07, "logits/chosen": -1.6395725011825562, "logits/rejected": -1.6671879291534424, "logps/chosen": -542.2437133789062, "logps/rejected": -642.0250244140625, "loss": 0.4934, "rewards/accuracies": 0.75, "rewards/chosen": -3.0105113983154297, "rewards/margins": 1.300561785697937, "rewards/margins_max": 3.247196912765503, "rewards/margins_min": -0.4116292893886566, "rewards/margins_std": 1.6492509841918945, "rewards/rejected": -4.311073303222656, "step": 2810 }, { "epoch": 0.74, "grad_norm": 5.688452336992407, "learning_rate": 9.749266994893756e-07, "logits/chosen": -1.577156662940979, "logits/rejected": -1.5383721590042114, "logps/chosen": -548.5317993164062, "logps/rejected": -714.7568969726562, "loss": 0.4303, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.071382999420166, "rewards/margins": 1.3945003747940063, "rewards/margins_max": 2.635091781616211, "rewards/margins_min": -0.2549123167991638, "rewards/margins_std": 1.2969977855682373, "rewards/rejected": -4.465882778167725, "step": 2820 }, { "epoch": 0.74, "grad_norm": 5.465893855170107, "learning_rate": 9.56889026517913e-07, "logits/chosen": -1.645247220993042, "logits/rejected": -1.6058517694473267, "logps/chosen": -619.1065673828125, "logps/rejected": -858.4378662109375, "loss": 0.4473, "rewards/accuracies": 0.75, "rewards/chosen": -3.3419601917266846, "rewards/margins": 0.9978305101394653, "rewards/margins_max": 2.9367451667785645, "rewards/margins_min": -0.5541468858718872, "rewards/margins_std": 1.6104196310043335, "rewards/rejected": -4.3397908210754395, "step": 2830 }, { "epoch": 0.74, "grad_norm": 8.345406699992592, "learning_rate": 9.389802028686617e-07, "logits/chosen": -1.6381679773330688, "logits/rejected": -1.600722074508667, "logps/chosen": -519.6800537109375, "logps/rejected": -603.8936767578125, "loss": 0.438, "rewards/accuracies": 0.75, "rewards/chosen": -3.1065008640289307, "rewards/margins": 1.0836912393569946, "rewards/margins_max": 2.470154047012329, "rewards/margins_min": -0.5804786682128906, "rewards/margins_std": 1.3780806064605713, "rewards/rejected": -4.190192222595215, "step": 2840 }, { "epoch": 0.75, "grad_norm": 7.867479154148856, "learning_rate": 9.212017239232427e-07, "logits/chosen": -1.7828384637832642, "logits/rejected": -1.787161111831665, "logps/chosen": -626.314453125, "logps/rejected": -727.2987060546875, "loss": 0.5008, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.455193281173706, "rewards/margins": 1.296221137046814, "rewards/margins_max": 3.330785036087036, "rewards/margins_min": -0.6085636019706726, "rewards/margins_std": 1.7467467784881592, "rewards/rejected": -4.7514142990112305, "step": 2850 }, { "epoch": 0.75, "grad_norm": 9.387553444213713, "learning_rate": 9.03555074179533e-07, "logits/chosen": -1.8300073146820068, "logits/rejected": -1.7626771926879883, "logps/chosen": -548.6365966796875, "logps/rejected": -631.5753173828125, "loss": 0.4461, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8477401733398438, "rewards/margins": 1.03937566280365, "rewards/margins_max": 2.8734934329986572, "rewards/margins_min": -0.6534374356269836, "rewards/margins_std": 1.5784457921981812, "rewards/rejected": -3.887115955352783, "step": 2860 }, { "epoch": 0.75, "grad_norm": 8.802838162592613, "learning_rate": 8.860417271277067e-07, "logits/chosen": -1.6445767879486084, "logits/rejected": -1.5424996614456177, "logps/chosen": -600.4591674804688, "logps/rejected": -731.9525756835938, "loss": 0.4253, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.1132984161376953, "rewards/margins": 1.5961964130401611, "rewards/margins_max": 3.9080538749694824, "rewards/margins_min": -0.1850893795490265, "rewards/margins_std": 1.8803846836090088, "rewards/rejected": -4.709494590759277, "step": 2870 }, { "epoch": 0.75, "grad_norm": 11.491877465438973, "learning_rate": 8.686631451272029e-07, "logits/chosen": -1.7812732458114624, "logits/rejected": -1.6650358438491821, "logps/chosen": -666.2443237304688, "logps/rejected": -753.0020751953125, "loss": 0.5095, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.642831802368164, "rewards/margins": 1.5096815824508667, "rewards/margins_max": 3.6814639568328857, "rewards/margins_min": -0.10333987325429916, "rewards/margins_std": 1.7144514322280884, "rewards/rejected": -5.152513027191162, "step": 2880 }, { "epoch": 0.76, "grad_norm": 5.417889012064191, "learning_rate": 8.514207792846168e-07, "logits/chosen": -1.683698296546936, "logits/rejected": -1.658581018447876, "logps/chosen": -674.7010498046875, "logps/rejected": -789.7808837890625, "loss": 0.4911, "rewards/accuracies": 0.625, "rewards/chosen": -3.910454511642456, "rewards/margins": 0.9134790301322937, "rewards/margins_max": 2.8000588417053223, "rewards/margins_min": -0.8952654004096985, "rewards/margins_std": 1.6592881679534912, "rewards/rejected": -4.8239336013793945, "step": 2890 }, { "epoch": 0.76, "grad_norm": 15.350641074446512, "learning_rate": 8.343160693325356e-07, "logits/chosen": -1.7998183965682983, "logits/rejected": -1.7519941329956055, "logps/chosen": -666.6056518554688, "logps/rejected": -748.1965942382812, "loss": 0.5514, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.6222548484802246, "rewards/margins": 1.3215632438659668, "rewards/margins_max": 3.638734817504883, "rewards/margins_min": -0.6989208459854126, "rewards/margins_std": 2.00874662399292, "rewards/rejected": -4.943818092346191, "step": 2900 }, { "epoch": 0.76, "eval_logits/chosen": -1.7301685810089111, "eval_logits/rejected": -1.6687203645706177, "eval_logps/chosen": -659.474853515625, "eval_logps/rejected": -766.1926879882812, "eval_loss": 0.49339574575424194, "eval_rewards/accuracies": 0.7797619104385376, "eval_rewards/chosen": -3.74987530708313, "eval_rewards/margins": 1.290623426437378, "eval_rewards/margins_max": 3.972259759902954, "eval_rewards/margins_min": -1.0907313823699951, "eval_rewards/margins_std": 1.6886576414108276, "eval_rewards/rejected": -5.040499210357666, "eval_runtime": 223.8658, "eval_samples_per_second": 8.934, "eval_steps_per_second": 0.281, "step": 2900 }, { "epoch": 0.76, "grad_norm": 13.137661884446526, "learning_rate": 8.173504435093174e-07, "logits/chosen": -1.687639594078064, "logits/rejected": -1.5560787916183472, "logps/chosen": -621.4246215820312, "logps/rejected": -726.8245849609375, "loss": 0.465, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.45202898979187, "rewards/margins": 1.5716876983642578, "rewards/margins_max": 3.5671792030334473, "rewards/margins_min": 0.07272086292505264, "rewards/margins_std": 1.552864670753479, "rewards/rejected": -5.023716449737549, "step": 2910 }, { "epoch": 0.76, "grad_norm": 15.184211280695381, "learning_rate": 8.00525318439836e-07, "logits/chosen": -1.6129789352416992, "logits/rejected": -1.536246657371521, "logps/chosen": -625.9220581054688, "logps/rejected": -669.4012451171875, "loss": 0.5071, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4476122856140137, "rewards/margins": 1.1214309930801392, "rewards/margins_max": 3.2243123054504395, "rewards/margins_min": -0.458599328994751, "rewards/margins_std": 1.6714824438095093, "rewards/rejected": -4.5690436363220215, "step": 2920 }, { "epoch": 0.77, "grad_norm": 15.48676350145504, "learning_rate": 7.838420990171927e-07, "logits/chosen": -1.6580454111099243, "logits/rejected": -1.592472791671753, "logps/chosen": -624.5654907226562, "logps/rejected": -711.8115844726562, "loss": 0.54, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.1499695777893066, "rewards/margins": 1.5291677713394165, "rewards/margins_max": 3.403630495071411, "rewards/margins_min": 0.0019218921661376953, "rewards/margins_std": 1.4931795597076416, "rewards/rejected": -4.679137229919434, "step": 2930 }, { "epoch": 0.77, "grad_norm": 10.290894596519765, "learning_rate": 7.673021782854084e-07, "logits/chosen": -1.7301111221313477, "logits/rejected": -1.7685855627059937, "logps/chosen": -611.1260986328125, "logps/rejected": -686.8121337890625, "loss": 0.5348, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.105149030685425, "rewards/margins": 1.0561141967773438, "rewards/margins_max": 2.2322065830230713, "rewards/margins_min": -0.32790738344192505, "rewards/margins_std": 1.154937505722046, "rewards/rejected": -4.161263465881348, "step": 2940 }, { "epoch": 0.77, "grad_norm": 6.735258040097589, "learning_rate": 7.509069373231039e-07, "logits/chosen": -1.921818494796753, "logits/rejected": -1.8522140979766846, "logps/chosen": -560.2169189453125, "logps/rejected": -694.2904052734375, "loss": 0.419, "rewards/accuracies": 0.875, "rewards/chosen": -2.7602181434631348, "rewards/margins": 1.3098796606063843, "rewards/margins_max": 2.856182813644409, "rewards/margins_min": -0.4029567837715149, "rewards/margins_std": 1.4475106000900269, "rewards/rejected": -4.070097923278809, "step": 2950 }, { "epoch": 0.77, "grad_norm": 7.230332103820887, "learning_rate": 7.346577451281822e-07, "logits/chosen": -1.6510388851165771, "logits/rejected": -1.6492815017700195, "logps/chosen": -606.5361328125, "logps/rejected": -666.9208374023438, "loss": 0.456, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4907431602478027, "rewards/margins": 0.7022908926010132, "rewards/margins_max": 2.4397664070129395, "rewards/margins_min": -0.778422474861145, "rewards/margins_std": 1.414158582687378, "rewards/rejected": -4.1930341720581055, "step": 2960 }, { "epoch": 0.78, "grad_norm": 17.33472808703853, "learning_rate": 7.185559585035138e-07, "logits/chosen": -1.758854627609253, "logits/rejected": -1.7159103155136108, "logps/chosen": -586.5679931640625, "logps/rejected": -629.2230224609375, "loss": 0.5726, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.5208516120910645, "rewards/margins": 0.7190099954605103, "rewards/margins_max": 2.6217122077941895, "rewards/margins_min": -0.4522368907928467, "rewards/margins_std": 1.3814265727996826, "rewards/rejected": -4.239861488342285, "step": 2970 }, { "epoch": 0.78, "grad_norm": 4.842746162991606, "learning_rate": 7.026029219436504e-07, "logits/chosen": -1.598104476928711, "logits/rejected": -1.5373762845993042, "logps/chosen": -616.1405029296875, "logps/rejected": -764.2574462890625, "loss": 0.4176, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.086430788040161, "rewards/margins": 1.4787911176681519, "rewards/margins_max": 3.5550620555877686, "rewards/margins_min": -0.3822387158870697, "rewards/margins_std": 1.7294639348983765, "rewards/rejected": -4.565222263336182, "step": 2980 }, { "epoch": 0.78, "grad_norm": 9.174494874442509, "learning_rate": 6.867999675225523e-07, "logits/chosen": -1.7317301034927368, "logits/rejected": -1.6322736740112305, "logps/chosen": -624.0198364257812, "logps/rejected": -784.1222534179688, "loss": 0.4695, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1624395847320557, "rewards/margins": 1.596806287765503, "rewards/margins_max": 3.049485683441162, "rewards/margins_min": -0.0005687117809429765, "rewards/margins_std": 1.333274006843567, "rewards/rejected": -4.759245872497559, "step": 2990 }, { "epoch": 0.79, "grad_norm": 4.6197017530474165, "learning_rate": 6.711484147823663e-07, "logits/chosen": -1.6648492813110352, "logits/rejected": -1.5978591442108154, "logps/chosen": -534.0223388671875, "logps/rejected": -714.30859375, "loss": 0.4162, "rewards/accuracies": 0.875, "rewards/chosen": -2.933379650115967, "rewards/margins": 1.5473244190216064, "rewards/margins_max": 3.8815484046936035, "rewards/margins_min": 0.12625472247600555, "rewards/margins_std": 1.7195415496826172, "rewards/rejected": -4.480703830718994, "step": 3000 }, { "epoch": 0.79, "eval_logits/chosen": -1.7208360433578491, "eval_logits/rejected": -1.660469651222229, "eval_logps/chosen": -612.6412963867188, "eval_logps/rejected": -707.239501953125, "eval_loss": 0.4917338490486145, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -3.28153920173645, "eval_rewards/margins": 1.1694272756576538, "eval_rewards/margins_max": 3.6486222743988037, "eval_rewards/margins_min": -0.9447228312492371, "eval_rewards/margins_std": 1.5323147773742676, "eval_rewards/rejected": -4.4509663581848145, "eval_runtime": 223.9076, "eval_samples_per_second": 8.932, "eval_steps_per_second": 0.281, "step": 3000 }, { "epoch": 0.79, "grad_norm": 6.657385787413926, "learning_rate": 6.556495706232413e-07, "logits/chosen": -1.694358468055725, "logits/rejected": -1.6283676624298096, "logps/chosen": -639.6585083007812, "logps/rejected": -732.4859619140625, "loss": 0.4398, "rewards/accuracies": 0.75, "rewards/chosen": -3.308572769165039, "rewards/margins": 1.4383642673492432, "rewards/margins_max": 3.6621170043945312, "rewards/margins_min": -0.4612976908683777, "rewards/margins_std": 1.7711632251739502, "rewards/rejected": -4.7469377517700195, "step": 3010 }, { "epoch": 0.79, "grad_norm": 8.440120035917921, "learning_rate": 6.403047291942057e-07, "logits/chosen": -1.5842583179473877, "logits/rejected": -1.5117440223693848, "logps/chosen": -576.341552734375, "logps/rejected": -648.1582641601562, "loss": 0.4661, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3548312187194824, "rewards/margins": 1.196236252784729, "rewards/margins_max": 3.1001060009002686, "rewards/margins_min": -0.5854426622390747, "rewards/margins_std": 1.6197668313980103, "rewards/rejected": -4.55106782913208, "step": 3020 }, { "epoch": 0.79, "grad_norm": 5.954321232954672, "learning_rate": 6.251151717851023e-07, "logits/chosen": -1.6102873086929321, "logits/rejected": -1.612274408340454, "logps/chosen": -581.7318725585938, "logps/rejected": -805.5562744140625, "loss": 0.4888, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.5590877532958984, "rewards/margins": 1.5652964115142822, "rewards/margins_max": 3.545679807662964, "rewards/margins_min": -0.16699019074440002, "rewards/margins_std": 1.6482595205307007, "rewards/rejected": -5.124383926391602, "step": 3030 }, { "epoch": 0.8, "grad_norm": 6.845423942238685, "learning_rate": 6.100821667196041e-07, "logits/chosen": -1.6870015859603882, "logits/rejected": -1.586848497390747, "logps/chosen": -648.005859375, "logps/rejected": -834.7107543945312, "loss": 0.4457, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.454275131225586, "rewards/margins": 1.5839426517486572, "rewards/margins_max": 3.531031847000122, "rewards/margins_min": -0.06879498809576035, "rewards/margins_std": 1.5945451259613037, "rewards/rejected": -5.038217544555664, "step": 3040 }, { "epoch": 0.8, "grad_norm": 11.916635224271966, "learning_rate": 5.952069692493062e-07, "logits/chosen": -1.6910197734832764, "logits/rejected": -1.6241363286972046, "logps/chosen": -661.1782836914062, "logps/rejected": -779.5869140625, "loss": 0.4381, "rewards/accuracies": 0.75, "rewards/chosen": -3.639672040939331, "rewards/margins": 1.4021408557891846, "rewards/margins_max": 2.899613380432129, "rewards/margins_min": -0.19838783144950867, "rewards/margins_std": 1.3848707675933838, "rewards/rejected": -5.041812896728516, "step": 3050 }, { "epoch": 0.8, "grad_norm": 16.04441010767068, "learning_rate": 5.80490821448918e-07, "logits/chosen": -1.7209593057632446, "logits/rejected": -1.6796979904174805, "logps/chosen": -668.4605712890625, "logps/rejected": -747.0391845703125, "loss": 0.5081, "rewards/accuracies": 0.75, "rewards/chosen": -3.617810010910034, "rewards/margins": 1.017822265625, "rewards/margins_max": 3.0672640800476074, "rewards/margins_min": -0.43945759534835815, "rewards/margins_std": 1.6316171884536743, "rewards/rejected": -4.635632038116455, "step": 3060 }, { "epoch": 0.8, "grad_norm": 13.909453085288291, "learning_rate": 5.659349521125459e-07, "logits/chosen": -1.4908415079116821, "logits/rejected": -1.4603463411331177, "logps/chosen": -593.3381958007812, "logps/rejected": -669.8458862304688, "loss": 0.529, "rewards/accuracies": 0.75, "rewards/chosen": -3.4196362495422363, "rewards/margins": 1.0531947612762451, "rewards/margins_max": 2.665644884109497, "rewards/margins_min": -0.25924593210220337, "rewards/margins_std": 1.307367205619812, "rewards/rejected": -4.472830772399902, "step": 3070 }, { "epoch": 0.81, "grad_norm": 8.746682110206802, "learning_rate": 5.5154057665109e-07, "logits/chosen": -1.7000973224639893, "logits/rejected": -1.6720774173736572, "logps/chosen": -591.883056640625, "logps/rejected": -703.2059326171875, "loss": 0.5474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.5170485973358154, "rewards/margins": 0.8415341377258301, "rewards/margins_max": 2.61421537399292, "rewards/margins_min": -0.9159469604492188, "rewards/margins_std": 1.6026989221572876, "rewards/rejected": -4.358582496643066, "step": 3080 }, { "epoch": 0.81, "grad_norm": 11.12265984428719, "learning_rate": 5.373088969907586e-07, "logits/chosen": -1.814846396446228, "logits/rejected": -1.647896409034729, "logps/chosen": -681.1114501953125, "logps/rejected": -733.7249755859375, "loss": 0.541, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.7214760780334473, "rewards/margins": 1.058547854423523, "rewards/margins_max": 2.9261717796325684, "rewards/margins_min": -0.805109977722168, "rewards/margins_std": 1.6681225299835205, "rewards/rejected": -4.78002405166626, "step": 3090 }, { "epoch": 0.81, "grad_norm": 15.28565288373484, "learning_rate": 5.23241101472709e-07, "logits/chosen": -1.778067946434021, "logits/rejected": -1.752916932106018, "logps/chosen": -627.4759521484375, "logps/rejected": -705.8504638671875, "loss": 0.5252, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.3213953971862793, "rewards/margins": 0.8665458559989929, "rewards/margins_max": 2.6683571338653564, "rewards/margins_min": -0.6539695858955383, "rewards/margins_std": 1.5414186716079712, "rewards/rejected": -4.187941074371338, "step": 3100 }, { "epoch": 0.81, "eval_logits/chosen": -1.753575086593628, "eval_logits/rejected": -1.6937494277954102, "eval_logps/chosen": -596.7129516601562, "eval_logps/rejected": -694.2786865234375, "eval_loss": 0.48965802788734436, "eval_rewards/accuracies": 0.7857142686843872, "eval_rewards/chosen": -3.12225604057312, "eval_rewards/margins": 1.1991024017333984, "eval_rewards/margins_max": 3.743117332458496, "eval_rewards/margins_min": -0.9576919674873352, "eval_rewards/margins_std": 1.5631747245788574, "eval_rewards/rejected": -4.3213582038879395, "eval_runtime": 223.8275, "eval_samples_per_second": 8.935, "eval_steps_per_second": 0.281, "step": 3100 }, { "epoch": 0.81, "grad_norm": 19.112760329734556, "learning_rate": 5.09338364753818e-07, "logits/chosen": -1.8033561706542969, "logits/rejected": -1.6905755996704102, "logps/chosen": -672.5972900390625, "logps/rejected": -732.3944091796875, "loss": 0.5562, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.304776668548584, "rewards/margins": 1.062970757484436, "rewards/margins_max": 3.135901927947998, "rewards/margins_min": -0.779354453086853, "rewards/margins_std": 1.7692826986312866, "rewards/rejected": -4.3677473068237305, "step": 3110 }, { "epoch": 0.82, "grad_norm": 12.135003122424402, "learning_rate": 4.956018477086005e-07, "logits/chosen": -1.7134888172149658, "logits/rejected": -1.6333507299423218, "logps/chosen": -591.4527587890625, "logps/rejected": -698.5405883789062, "loss": 0.4751, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.1621880531311035, "rewards/margins": 1.2333950996398926, "rewards/margins_max": 3.0839152336120605, "rewards/margins_min": -0.6085202097892761, "rewards/margins_std": 1.6102014780044556, "rewards/rejected": -4.395583152770996, "step": 3120 }, { "epoch": 0.82, "grad_norm": 7.422829656941266, "learning_rate": 4.820326973322764e-07, "logits/chosen": -1.684401273727417, "logits/rejected": -1.6537752151489258, "logps/chosen": -546.58740234375, "logps/rejected": -657.02880859375, "loss": 0.4587, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.877803087234497, "rewards/margins": 1.375436544418335, "rewards/margins_max": 3.4814834594726562, "rewards/margins_min": -0.2131095677614212, "rewards/margins_std": 1.6577033996582031, "rewards/rejected": -4.253239631652832, "step": 3130 }, { "epoch": 0.82, "grad_norm": 7.124074775453301, "learning_rate": 4.686320466449981e-07, "logits/chosen": -1.6122562885284424, "logits/rejected": -1.6471035480499268, "logps/chosen": -522.0559692382812, "logps/rejected": -655.3521728515625, "loss": 0.5108, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9794363975524902, "rewards/margins": 1.434926152229309, "rewards/margins_max": 3.041318416595459, "rewards/margins_min": -0.07338912039995193, "rewards/margins_std": 1.384115219116211, "rewards/rejected": -4.41436243057251, "step": 3140 }, { "epoch": 0.82, "grad_norm": 10.43225111868576, "learning_rate": 4.554010145972418e-07, "logits/chosen": -1.7233202457427979, "logits/rejected": -1.7614465951919556, "logps/chosen": -618.3638916015625, "logps/rejected": -712.8865966796875, "loss": 0.492, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.2824034690856934, "rewards/margins": 0.9416518211364746, "rewards/margins_max": 3.389979839324951, "rewards/margins_min": -0.9954224824905396, "rewards/margins_std": 1.9700361490249634, "rewards/rejected": -4.224055290222168, "step": 3150 }, { "epoch": 0.83, "grad_norm": 11.422251725148428, "learning_rate": 4.4234070597637455e-07, "logits/chosen": -1.9060640335083008, "logits/rejected": -1.802366852760315, "logps/chosen": -610.2806396484375, "logps/rejected": -606.7326049804688, "loss": 0.4927, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8033134937286377, "rewards/margins": 1.053274154663086, "rewards/margins_max": 2.7026009559631348, "rewards/margins_min": -0.36519497632980347, "rewards/margins_std": 1.4104167222976685, "rewards/rejected": -3.8565876483917236, "step": 3160 }, { "epoch": 0.83, "grad_norm": 6.639087023188116, "learning_rate": 4.2945221131440783e-07, "logits/chosen": -1.7615760564804077, "logits/rejected": -1.7342560291290283, "logps/chosen": -637.0568237304688, "logps/rejected": -693.208984375, "loss": 0.5139, "rewards/accuracies": 0.75, "rewards/chosen": -3.2184958457946777, "rewards/margins": 0.7963525056838989, "rewards/margins_max": 2.4779791831970215, "rewards/margins_min": -0.8524004220962524, "rewards/margins_std": 1.5168458223342896, "rewards/rejected": -4.014848232269287, "step": 3170 }, { "epoch": 0.83, "grad_norm": 18.28568238868656, "learning_rate": 4.167366067969381e-07, "logits/chosen": -1.654608964920044, "logits/rejected": -1.6509491205215454, "logps/chosen": -584.055908203125, "logps/rejected": -645.7567749023438, "loss": 0.5722, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.976588726043701, "rewards/margins": 1.062950611114502, "rewards/margins_max": 3.303220272064209, "rewards/margins_min": -0.5729750394821167, "rewards/margins_std": 1.7696577310562134, "rewards/rejected": -4.039539813995361, "step": 3180 }, { "epoch": 0.83, "grad_norm": 5.798955415515178, "learning_rate": 4.041949541732826e-07, "logits/chosen": -1.7014904022216797, "logits/rejected": -1.5435564517974854, "logps/chosen": -554.59765625, "logps/rejected": -657.4716186523438, "loss": 0.4619, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.075425624847412, "rewards/margins": 1.1840814352035522, "rewards/margins_max": 3.4662883281707764, "rewards/margins_min": -0.48689740896224976, "rewards/margins_std": 1.7387990951538086, "rewards/rejected": -4.259507179260254, "step": 3190 }, { "epoch": 0.84, "grad_norm": 9.537371561328001, "learning_rate": 3.9182830066782614e-07, "logits/chosen": -1.6401255130767822, "logits/rejected": -1.6383945941925049, "logps/chosen": -590.1411743164062, "logps/rejected": -656.6798706054688, "loss": 0.4626, "rewards/accuracies": 0.75, "rewards/chosen": -3.1389565467834473, "rewards/margins": 0.9910252690315247, "rewards/margins_max": 3.1421637535095215, "rewards/margins_min": -0.5510527491569519, "rewards/margins_std": 1.6671063899993896, "rewards/rejected": -4.129981994628906, "step": 3200 }, { "epoch": 0.84, "eval_logits/chosen": -1.7743639945983887, "eval_logits/rejected": -1.7159403562545776, "eval_logps/chosen": -589.9283447265625, "eval_logps/rejected": -681.7123413085938, "eval_loss": 0.4892239272594452, "eval_rewards/accuracies": 0.7797619104385376, "eval_rewards/chosen": -3.054410457611084, "eval_rewards/margins": 1.141284704208374, "eval_rewards/margins_max": 3.58188533782959, "eval_rewards/margins_min": -0.9045903086662292, "eval_rewards/margins_std": 1.489518642425537, "eval_rewards/rejected": -4.195695400238037, "eval_runtime": 223.8046, "eval_samples_per_second": 8.936, "eval_steps_per_second": 0.281, "step": 3200 }, { "epoch": 0.84, "grad_norm": 5.484993663356462, "learning_rate": 3.796376788925771e-07, "logits/chosen": -1.7911245822906494, "logits/rejected": -1.7494964599609375, "logps/chosen": -557.06298828125, "logps/rejected": -693.9409790039062, "loss": 0.4668, "rewards/accuracies": 0.75, "rewards/chosen": -3.009096145629883, "rewards/margins": 1.270904779434204, "rewards/margins_max": 3.3159403800964355, "rewards/margins_min": -0.6124784350395203, "rewards/margins_std": 1.755610704421997, "rewards/rejected": -4.280001163482666, "step": 3210 }, { "epoch": 0.84, "grad_norm": 19.935753437899425, "learning_rate": 3.676241067609465e-07, "logits/chosen": -1.66033935546875, "logits/rejected": -1.59659743309021, "logps/chosen": -577.2107543945312, "logps/rejected": -658.2227783203125, "loss": 0.5128, "rewards/accuracies": 0.625, "rewards/chosen": -3.122068166732788, "rewards/margins": 0.7911438941955566, "rewards/margins_max": 2.4212658405303955, "rewards/margins_min": -0.9452120065689087, "rewards/margins_std": 1.4669249057769775, "rewards/rejected": -3.913212299346924, "step": 3220 }, { "epoch": 0.85, "grad_norm": 10.888038273325744, "learning_rate": 3.5578858740274976e-07, "logits/chosen": -1.7631572484970093, "logits/rejected": -1.7321088314056396, "logps/chosen": -641.0341186523438, "logps/rejected": -730.53271484375, "loss": 0.5406, "rewards/accuracies": 0.625, "rewards/chosen": -3.253035306930542, "rewards/margins": 0.8025244474411011, "rewards/margins_max": 2.720101833343506, "rewards/margins_min": -0.6991795301437378, "rewards/margins_std": 1.536482810974121, "rewards/rejected": -4.0555596351623535, "step": 3230 }, { "epoch": 0.85, "grad_norm": 9.07574337103118, "learning_rate": 3.44132109080447e-07, "logits/chosen": -1.9020382165908813, "logits/rejected": -1.8031761646270752, "logps/chosen": -554.1954345703125, "logps/rejected": -708.1973876953125, "loss": 0.4896, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.9074912071228027, "rewards/margins": 1.1924735307693481, "rewards/margins_max": 2.999037265777588, "rewards/margins_min": -0.2925456166267395, "rewards/margins_std": 1.5077444314956665, "rewards/rejected": -4.0999650955200195, "step": 3240 }, { "epoch": 0.85, "grad_norm": 10.054142674147277, "learning_rate": 3.3265564510662344e-07, "logits/chosen": -1.8203704357147217, "logits/rejected": -1.7585738897323608, "logps/chosen": -604.6641235351562, "logps/rejected": -749.8712158203125, "loss": 0.4901, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.047290086746216, "rewards/margins": 1.5930215120315552, "rewards/margins_max": 3.043708324432373, "rewards/margins_min": -0.16928577423095703, "rewards/margins_std": 1.3991247415542603, "rewards/rejected": -4.640311241149902, "step": 3250 }, { "epoch": 0.85, "grad_norm": 6.853373967704589, "learning_rate": 3.213601537627195e-07, "logits/chosen": -1.7812211513519287, "logits/rejected": -1.761359453201294, "logps/chosen": -540.857177734375, "logps/rejected": -721.9593505859375, "loss": 0.4597, "rewards/accuracies": 0.875, "rewards/chosen": -2.8823277950286865, "rewards/margins": 1.5945441722869873, "rewards/margins_max": 3.5275206565856934, "rewards/margins_min": -0.3245779871940613, "rewards/margins_std": 1.6751461029052734, "rewards/rejected": -4.476872444152832, "step": 3260 }, { "epoch": 0.86, "grad_norm": 12.820563630614709, "learning_rate": 3.1024657821901063e-07, "logits/chosen": -1.7429959774017334, "logits/rejected": -1.6920688152313232, "logps/chosen": -556.5772094726562, "logps/rejected": -666.9478759765625, "loss": 0.4799, "rewards/accuracies": 0.75, "rewards/chosen": -3.0472941398620605, "rewards/margins": 1.047997236251831, "rewards/margins_max": 2.6931302547454834, "rewards/margins_min": -0.5101019144058228, "rewards/margins_std": 1.4066200256347656, "rewards/rejected": -4.0952911376953125, "step": 3270 }, { "epoch": 0.86, "grad_norm": 14.75068481924547, "learning_rate": 2.9931584645585654e-07, "logits/chosen": -1.7551742792129517, "logits/rejected": -1.6262340545654297, "logps/chosen": -541.9832153320312, "logps/rejected": -629.2506103515625, "loss": 0.5318, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.2883732318878174, "rewards/margins": 1.1573963165283203, "rewards/margins_max": 3.3195719718933105, "rewards/margins_min": -0.8909147381782532, "rewards/margins_std": 1.8491665124893188, "rewards/rejected": -4.445769309997559, "step": 3280 }, { "epoch": 0.86, "grad_norm": 6.8185644192708255, "learning_rate": 2.885688711862136e-07, "logits/chosen": -1.729627251625061, "logits/rejected": -1.6980613470077515, "logps/chosen": -574.3662109375, "logps/rejected": -742.139892578125, "loss": 0.4292, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.7674355506896973, "rewards/margins": 1.7680041790008545, "rewards/margins_max": 3.544017791748047, "rewards/margins_min": 0.13425391912460327, "rewards/margins_std": 1.513333797454834, "rewards/rejected": -4.535439491271973, "step": 3290 }, { "epoch": 0.86, "grad_norm": 6.612253847209845, "learning_rate": 2.7800654977942486e-07, "logits/chosen": -1.8881981372833252, "logits/rejected": -1.8337571620941162, "logps/chosen": -637.9619140625, "logps/rejected": -734.8035888671875, "loss": 0.5186, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.936044692993164, "rewards/margins": 1.0136942863464355, "rewards/margins_max": 2.3992760181427, "rewards/margins_min": -0.30427104234695435, "rewards/margins_std": 1.2224770784378052, "rewards/rejected": -3.9497389793395996, "step": 3300 }, { "epoch": 0.86, "eval_logits/chosen": -1.7796199321746826, "eval_logits/rejected": -1.720741629600525, "eval_logps/chosen": -581.3629150390625, "eval_logps/rejected": -673.4175415039062, "eval_loss": 0.4896455705165863, "eval_rewards/accuracies": 0.773809552192688, "eval_rewards/chosen": -2.9687557220458984, "eval_rewards/margins": 1.1439913511276245, "eval_rewards/margins_max": 3.5867068767547607, "eval_rewards/margins_min": -0.906122088432312, "eval_rewards/margins_std": 1.4962996244430542, "eval_rewards/rejected": -4.112747669219971, "eval_runtime": 223.7601, "eval_samples_per_second": 8.938, "eval_steps_per_second": 0.282, "step": 3300 }, { "epoch": 0.87, "grad_norm": 24.128310218577393, "learning_rate": 2.6762976418628797e-07, "logits/chosen": -1.7186781167984009, "logits/rejected": -1.6149390935897827, "logps/chosen": -504.49346923828125, "logps/rejected": -677.2528076171875, "loss": 0.4727, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.8557281494140625, "rewards/margins": 1.7091875076293945, "rewards/margins_max": 4.201811790466309, "rewards/margins_min": -0.009330714121460915, "rewards/margins_std": 1.9150793552398682, "rewards/rejected": -4.564915657043457, "step": 3310 }, { "epoch": 0.87, "grad_norm": 6.78434492936536, "learning_rate": 2.5743938086541354e-07, "logits/chosen": -1.516067624092102, "logits/rejected": -1.5032888650894165, "logps/chosen": -622.5819091796875, "logps/rejected": -668.0444946289062, "loss": 0.5001, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.17722487449646, "rewards/margins": 1.0904510021209717, "rewards/margins_max": 2.6921486854553223, "rewards/margins_min": -0.3705216944217682, "rewards/margins_std": 1.356029987335205, "rewards/rejected": -4.267675876617432, "step": 3320 }, { "epoch": 0.87, "grad_norm": 9.70318161215369, "learning_rate": 2.4743625071087574e-07, "logits/chosen": -1.7863948345184326, "logits/rejected": -1.8066688776016235, "logps/chosen": -595.3849487304688, "logps/rejected": -696.0786743164062, "loss": 0.4996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.1375272274017334, "rewards/margins": 0.7508841753005981, "rewards/margins_max": 2.1748650074005127, "rewards/margins_min": -0.7836846113204956, "rewards/margins_std": 1.3332493305206299, "rewards/rejected": -3.8884117603302, "step": 3330 }, { "epoch": 0.87, "grad_norm": 8.582000135071041, "learning_rate": 2.3762120898116498e-07, "logits/chosen": -1.781226396560669, "logits/rejected": -1.8100659847259521, "logps/chosen": -563.4840087890625, "logps/rejected": -723.8385620117188, "loss": 0.4219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9263954162597656, "rewards/margins": 1.0940673351287842, "rewards/margins_max": 2.865462303161621, "rewards/margins_min": -0.38367947936058044, "rewards/margins_std": 1.4910519123077393, "rewards/rejected": -4.020462989807129, "step": 3340 }, { "epoch": 0.88, "grad_norm": 7.257878725141884, "learning_rate": 2.2799507522944048e-07, "logits/chosen": -1.8165762424468994, "logits/rejected": -1.671443223953247, "logps/chosen": -626.157470703125, "logps/rejected": -725.3187255859375, "loss": 0.516, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.318394422531128, "rewards/margins": 1.3628634214401245, "rewards/margins_max": 3.243999481201172, "rewards/margins_min": -0.18884804844856262, "rewards/margins_std": 1.5075294971466064, "rewards/rejected": -4.681258201599121, "step": 3350 }, { "epoch": 0.88, "grad_norm": 9.643527343711234, "learning_rate": 2.1855865323510056e-07, "logits/chosen": -1.837426781654358, "logits/rejected": -1.7517645359039307, "logps/chosen": -624.3062133789062, "logps/rejected": -779.9801635742188, "loss": 0.519, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.4275622367858887, "rewards/margins": 1.0831588506698608, "rewards/margins_max": 3.481642961502075, "rewards/margins_min": -0.4151192605495453, "rewards/margins_std": 1.7579052448272705, "rewards/rejected": -4.510721206665039, "step": 3360 }, { "epoch": 0.88, "grad_norm": 9.207571513105142, "learning_rate": 2.0931273093666575e-07, "logits/chosen": -1.7617747783660889, "logits/rejected": -1.756715178489685, "logps/chosen": -560.1388549804688, "logps/rejected": -662.7755126953125, "loss": 0.5631, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.8922855854034424, "rewards/margins": 0.9579329490661621, "rewards/margins_max": 3.0555317401885986, "rewards/margins_min": -0.6086864471435547, "rewards/margins_std": 1.6971238851547241, "rewards/rejected": -3.8502182960510254, "step": 3370 }, { "epoch": 0.88, "grad_norm": 7.491561720314219, "learning_rate": 2.002580803659873e-07, "logits/chosen": -1.8423497676849365, "logits/rejected": -1.7013689279556274, "logps/chosen": -588.772216796875, "logps/rejected": -621.0728149414062, "loss": 0.5456, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.0553536415100098, "rewards/margins": 0.7642387747764587, "rewards/margins_max": 2.62900447845459, "rewards/margins_min": -0.8943256139755249, "rewards/margins_std": 1.5821831226348877, "rewards/rejected": -3.8195927143096924, "step": 3380 }, { "epoch": 0.89, "grad_norm": 8.271889517453967, "learning_rate": 1.913954575837826e-07, "logits/chosen": -1.751521110534668, "logits/rejected": -1.6828060150146484, "logps/chosen": -570.3870849609375, "logps/rejected": -725.5081176757812, "loss": 0.5182, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1388165950775146, "rewards/margins": 1.4453825950622559, "rewards/margins_max": 4.307318210601807, "rewards/margins_min": -0.7347943186759949, "rewards/margins_std": 2.2231533527374268, "rewards/rejected": -4.58419942855835, "step": 3390 }, { "epoch": 0.89, "grad_norm": 8.002943451120714, "learning_rate": 1.827256026165028e-07, "logits/chosen": -1.7623207569122314, "logits/rejected": -1.7001641988754272, "logps/chosen": -555.3650512695312, "logps/rejected": -692.3477783203125, "loss": 0.4699, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.806239366531372, "rewards/margins": 1.272976040840149, "rewards/margins_max": 3.452536106109619, "rewards/margins_min": -0.3631371259689331, "rewards/margins_std": 1.7274888753890991, "rewards/rejected": -4.079215049743652, "step": 3400 }, { "epoch": 0.89, "eval_logits/chosen": -1.7916128635406494, "eval_logits/rejected": -1.7331897020339966, "eval_logps/chosen": -571.2765502929688, "eval_logps/rejected": -662.9918212890625, "eval_loss": 0.4892271161079407, "eval_rewards/accuracies": 0.7757936716079712, "eval_rewards/chosen": -2.867892265319824, "eval_rewards/margins": 1.1405978202819824, "eval_rewards/margins_max": 3.584003210067749, "eval_rewards/margins_min": -0.8920046091079712, "eval_rewards/margins_std": 1.48953378200531, "eval_rewards/rejected": -4.008490085601807, "eval_runtime": 223.7758, "eval_samples_per_second": 8.938, "eval_steps_per_second": 0.282, "step": 3400 }, { "epoch": 0.89, "grad_norm": 8.773257925449636, "learning_rate": 1.7424923939454274e-07, "logits/chosen": -1.543839931488037, "logits/rejected": -1.5583148002624512, "logps/chosen": -543.2760009765625, "logps/rejected": -680.4772338867188, "loss": 0.4967, "rewards/accuracies": 0.625, "rewards/chosen": -3.0283379554748535, "rewards/margins": 1.0932867527008057, "rewards/margins_max": 2.9585251808166504, "rewards/margins_min": -0.5249409675598145, "rewards/margins_std": 1.5251989364624023, "rewards/rejected": -4.12162446975708, "step": 3410 }, { "epoch": 0.9, "grad_norm": 9.20469913718285, "learning_rate": 1.6596707569179304e-07, "logits/chosen": -1.8508853912353516, "logits/rejected": -1.7132911682128906, "logps/chosen": -634.1796875, "logps/rejected": -733.7144775390625, "loss": 0.452, "rewards/accuracies": 0.875, "rewards/chosen": -2.7069268226623535, "rewards/margins": 1.7542641162872314, "rewards/margins_max": 3.632814407348633, "rewards/margins_min": 0.010856258682906628, "rewards/margins_std": 1.5949105024337769, "rewards/rejected": -4.461190700531006, "step": 3420 }, { "epoch": 0.9, "grad_norm": 13.685772512455967, "learning_rate": 1.578798030665385e-07, "logits/chosen": -1.835354208946228, "logits/rejected": -1.742457628250122, "logps/chosen": -605.5134887695312, "logps/rejected": -701.7230224609375, "loss": 0.4871, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.7119264602661133, "rewards/margins": 1.4306535720825195, "rewards/margins_max": 2.9628920555114746, "rewards/margins_min": 0.0812857374548912, "rewards/margins_std": 1.2812762260437012, "rewards/rejected": -4.142580032348633, "step": 3430 }, { "epoch": 0.9, "grad_norm": 9.021051159020875, "learning_rate": 1.499880968037165e-07, "logits/chosen": -1.745772123336792, "logits/rejected": -1.77035653591156, "logps/chosen": -569.849853515625, "logps/rejected": -633.4874267578125, "loss": 0.5818, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.871950626373291, "rewards/margins": 0.5923939943313599, "rewards/margins_max": 2.1505866050720215, "rewards/margins_min": -1.1100825071334839, "rewards/margins_std": 1.4263689517974854, "rewards/rejected": -3.4643447399139404, "step": 3440 }, { "epoch": 0.9, "grad_norm": 9.95611096301224, "learning_rate": 1.4229261585852805e-07, "logits/chosen": -1.8185625076293945, "logits/rejected": -1.7791833877563477, "logps/chosen": -557.3032836914062, "logps/rejected": -679.876953125, "loss": 0.4806, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8279640674591064, "rewards/margins": 1.41463303565979, "rewards/margins_max": 3.019104480743408, "rewards/margins_min": 0.22974753379821777, "rewards/margins_std": 1.2331206798553467, "rewards/rejected": -4.242597579956055, "step": 3450 }, { "epoch": 0.91, "grad_norm": 8.161884802261707, "learning_rate": 1.3479400280141886e-07, "logits/chosen": -1.785893201828003, "logits/rejected": -1.7111040353775024, "logps/chosen": -601.8311157226562, "logps/rejected": -625.7269287109375, "loss": 0.418, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.843427896499634, "rewards/margins": 1.2628462314605713, "rewards/margins_max": 2.9218990802764893, "rewards/margins_min": -0.4208103120326996, "rewards/margins_std": 1.4482828378677368, "rewards/rejected": -4.106273651123047, "step": 3460 }, { "epoch": 0.91, "grad_norm": 11.129083805401784, "learning_rate": 1.2749288376442044e-07, "logits/chosen": -1.7949390411376953, "logits/rejected": -1.7122758626937866, "logps/chosen": -613.6131591796875, "logps/rejected": -720.678466796875, "loss": 0.5349, "rewards/accuracies": 0.75, "rewards/chosen": -3.3643181324005127, "rewards/margins": 0.9708747863769531, "rewards/margins_max": 2.270894765853882, "rewards/margins_min": -0.38272160291671753, "rewards/margins_std": 1.263656735420227, "rewards/rejected": -4.335193157196045, "step": 3470 }, { "epoch": 0.91, "grad_norm": 6.923912144677099, "learning_rate": 1.203898683888713e-07, "logits/chosen": -1.7012691497802734, "logits/rejected": -1.550444483757019, "logps/chosen": -522.5025634765625, "logps/rejected": -664.4444580078125, "loss": 0.4578, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.962705373764038, "rewards/margins": 1.3390944004058838, "rewards/margins_max": 3.005403995513916, "rewards/margins_min": -0.32086730003356934, "rewards/margins_std": 1.5143851041793823, "rewards/rejected": -4.301799774169922, "step": 3480 }, { "epoch": 0.91, "grad_norm": 16.434457707709264, "learning_rate": 1.1348554977451132e-07, "logits/chosen": -1.7400627136230469, "logits/rejected": -1.7638914585113525, "logps/chosen": -554.029541015625, "logps/rejected": -603.7113037109375, "loss": 0.5148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9988150596618652, "rewards/margins": 0.8229666948318481, "rewards/margins_max": 2.091127872467041, "rewards/margins_min": -0.41064515709877014, "rewards/margins_std": 1.090587854385376, "rewards/rejected": -3.821781873703003, "step": 3490 }, { "epoch": 0.92, "grad_norm": 13.327130082663498, "learning_rate": 1.0678050442995802e-07, "logits/chosen": -1.868284821510315, "logits/rejected": -1.7972882986068726, "logps/chosen": -633.5184326171875, "logps/rejected": -707.0253295898438, "loss": 0.4332, "rewards/accuracies": 0.875, "rewards/chosen": -2.695908784866333, "rewards/margins": 1.5813772678375244, "rewards/margins_max": 3.4854698181152344, "rewards/margins_min": 0.1540924310684204, "rewards/margins_std": 1.520946741104126, "rewards/rejected": -4.277285575866699, "step": 3500 }, { "epoch": 0.92, "eval_logits/chosen": -1.7990823984146118, "eval_logits/rejected": -1.74032461643219, "eval_logps/chosen": -569.8724975585938, "eval_logps/rejected": -664.3639526367188, "eval_loss": 0.48903173208236694, "eval_rewards/accuracies": 0.7817460298538208, "eval_rewards/chosen": -2.853851318359375, "eval_rewards/margins": 1.1683602333068848, "eval_rewards/margins_max": 3.66827392578125, "eval_rewards/margins_min": -0.9166216254234314, "eval_rewards/margins_std": 1.5238233804702759, "eval_rewards/rejected": -4.022211074829102, "eval_runtime": 223.8887, "eval_samples_per_second": 8.933, "eval_steps_per_second": 0.281, "step": 3500 }, { "epoch": 0.92, "grad_norm": 10.449342508028803, "learning_rate": 1.0027529222456755e-07, "logits/chosen": -1.7780840396881104, "logits/rejected": -1.707383394241333, "logps/chosen": -579.6513061523438, "logps/rejected": -647.8109130859375, "loss": 0.4961, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.890334367752075, "rewards/margins": 0.9414389729499817, "rewards/margins_max": 2.4241271018981934, "rewards/margins_min": -0.381816565990448, "rewards/margins_std": 1.2437373399734497, "rewards/rejected": -3.831773281097412, "step": 3510 }, { "epoch": 0.92, "grad_norm": 10.352665772574367, "learning_rate": 9.397045634168766e-08, "logits/chosen": -1.7622478008270264, "logits/rejected": -1.7830806970596313, "logps/chosen": -597.2891845703125, "logps/rejected": -700.4714965820312, "loss": 0.5479, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.3346240520477295, "rewards/margins": 1.1042240858078003, "rewards/margins_max": 3.8513972759246826, "rewards/margins_min": -0.898116409778595, "rewards/margins_std": 2.0874717235565186, "rewards/rejected": -4.43884801864624, "step": 3520 }, { "epoch": 0.92, "grad_norm": 8.190245655739954, "learning_rate": 8.78665232332998e-08, "logits/chosen": -1.8807893991470337, "logits/rejected": -1.7697107791900635, "logps/chosen": -542.9649658203125, "logps/rejected": -627.5797729492188, "loss": 0.4922, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.613901376724243, "rewards/margins": 1.2189257144927979, "rewards/margins_max": 3.273794651031494, "rewards/margins_min": -0.4261334538459778, "rewards/margins_std": 1.6996138095855713, "rewards/rejected": -3.832827091217041, "step": 3530 }, { "epoch": 0.93, "grad_norm": 5.82592273656473, "learning_rate": 8.196400257606208e-08, "logits/chosen": -1.7385711669921875, "logits/rejected": -1.607500672340393, "logps/chosen": -567.1375732421875, "logps/rejected": -628.9720458984375, "loss": 0.5338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8584647178649902, "rewards/margins": 1.0373350381851196, "rewards/margins_max": 2.482917308807373, "rewards/margins_min": -0.21703216433525085, "rewards/margins_std": 1.2015788555145264, "rewards/rejected": -3.8957996368408203, "step": 3540 }, { "epoch": 0.93, "grad_norm": 8.582929978260264, "learning_rate": 7.626338722875076e-08, "logits/chosen": -1.7206695079803467, "logits/rejected": -1.689666986465454, "logps/chosen": -540.1199340820312, "logps/rejected": -649.2257080078125, "loss": 0.4989, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.769836187362671, "rewards/margins": 1.1024612188339233, "rewards/margins_max": 2.315967559814453, "rewards/margins_min": 0.03381739929318428, "rewards/margins_std": 1.083656668663025, "rewards/rejected": -3.8722972869873047, "step": 3550 }, { "epoch": 0.93, "grad_norm": 13.111247677998769, "learning_rate": 7.076515319110688e-08, "logits/chosen": -1.6614410877227783, "logits/rejected": -1.5550636053085327, "logps/chosen": -573.9310913085938, "logps/rejected": -729.1341552734375, "loss": 0.4396, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.936641216278076, "rewards/margins": 1.8196824789047241, "rewards/margins_max": 4.208040714263916, "rewards/margins_min": -0.0002771258295979351, "rewards/margins_std": 1.916425347328186, "rewards/rejected": -4.75632381439209, "step": 3560 }, { "epoch": 0.93, "grad_norm": 7.899428489389873, "learning_rate": 6.54697595640899e-08, "logits/chosen": -1.7465639114379883, "logits/rejected": -1.7264324426651, "logps/chosen": -625.2103271484375, "logps/rejected": -660.3693237304688, "loss": 0.4912, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0198028087615967, "rewards/margins": 0.9401667714118958, "rewards/margins_max": 2.665494441986084, "rewards/margins_min": -0.3367575705051422, "rewards/margins_std": 1.3359119892120361, "rewards/rejected": -3.9599692821502686, "step": 3570 }, { "epoch": 0.94, "grad_norm": 12.078188166878807, "learning_rate": 6.037764851154426e-08, "logits/chosen": -1.881382942199707, "logits/rejected": -1.7682117223739624, "logps/chosen": -602.8654174804688, "logps/rejected": -702.8894653320312, "loss": 0.5503, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.947944164276123, "rewards/margins": 1.3986036777496338, "rewards/margins_max": 3.8292274475097656, "rewards/margins_min": -0.677720844745636, "rewards/margins_std": 2.000092029571533, "rewards/rejected": -4.346548080444336, "step": 3580 }, { "epoch": 0.94, "grad_norm": 7.623866262729966, "learning_rate": 5.548924522327748e-08, "logits/chosen": -1.6796060800552368, "logits/rejected": -1.6805486679077148, "logps/chosen": -497.21533203125, "logps/rejected": -598.9744873046875, "loss": 0.4124, "rewards/accuracies": 0.75, "rewards/chosen": -2.8857789039611816, "rewards/margins": 1.0334358215332031, "rewards/margins_max": 2.586656332015991, "rewards/margins_min": -0.34856879711151123, "rewards/margins_std": 1.3107221126556396, "rewards/rejected": -3.9192147254943848, "step": 3590 }, { "epoch": 0.94, "grad_norm": 11.40810264277996, "learning_rate": 5.0804957879556915e-08, "logits/chosen": -1.6065425872802734, "logits/rejected": -1.5580356121063232, "logps/chosen": -554.305419921875, "logps/rejected": -708.8319091796875, "loss": 0.5292, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.969703197479248, "rewards/margins": 1.3115341663360596, "rewards/margins_max": 3.591161012649536, "rewards/margins_min": -0.9266679883003235, "rewards/margins_std": 2.119549036026001, "rewards/rejected": -4.281237602233887, "step": 3600 }, { "epoch": 0.94, "eval_logits/chosen": -1.791983723640442, "eval_logits/rejected": -1.7327070236206055, "eval_logps/chosen": -576.9282836914062, "eval_logps/rejected": -672.2607421875, "eval_loss": 0.4888196289539337, "eval_rewards/accuracies": 0.7757936716079712, "eval_rewards/chosen": -2.9244093894958496, "eval_rewards/margins": 1.176769495010376, "eval_rewards/margins_max": 3.694626569747925, "eval_rewards/margins_min": -0.9284708499908447, "eval_rewards/margins_std": 1.535569667816162, "eval_rewards/rejected": -4.1011786460876465, "eval_runtime": 223.8251, "eval_samples_per_second": 8.936, "eval_steps_per_second": 0.281, "step": 3600 }, { "epoch": 0.94, "grad_norm": 13.084672280253741, "learning_rate": 4.632517761702815e-08, "logits/chosen": -1.6286007165908813, "logits/rejected": -1.6486108303070068, "logps/chosen": -508.33050537109375, "logps/rejected": -584.3656005859375, "loss": 0.4874, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6960370540618896, "rewards/margins": 0.8862608075141907, "rewards/margins_max": 2.169489860534668, "rewards/margins_min": -0.5621198415756226, "rewards/margins_std": 1.2759290933609009, "rewards/rejected": -3.5822978019714355, "step": 3610 }, { "epoch": 0.95, "grad_norm": 9.382140428548066, "learning_rate": 4.205027849605359e-08, "logits/chosen": -1.694944977760315, "logits/rejected": -1.623186469078064, "logps/chosen": -514.87353515625, "logps/rejected": -630.6694946289062, "loss": 0.5199, "rewards/accuracies": 0.75, "rewards/chosen": -2.794494867324829, "rewards/margins": 1.0709021091461182, "rewards/margins_max": 2.9564125537872314, "rewards/margins_min": -0.5829165577888489, "rewards/margins_std": 1.6146777868270874, "rewards/rejected": -3.865396499633789, "step": 3620 }, { "epoch": 0.95, "grad_norm": 9.541908583149374, "learning_rate": 3.798061746947995e-08, "logits/chosen": -1.6331926584243774, "logits/rejected": -1.5650359392166138, "logps/chosen": -586.0941162109375, "logps/rejected": -751.4549560546875, "loss": 0.5037, "rewards/accuracies": 0.75, "rewards/chosen": -3.374192714691162, "rewards/margins": 1.0789440870285034, "rewards/margins_max": 3.2678732872009277, "rewards/margins_min": -0.6941253542900085, "rewards/margins_std": 1.7678693532943726, "rewards/rejected": -4.453136920928955, "step": 3630 }, { "epoch": 0.95, "grad_norm": 10.524249458282346, "learning_rate": 3.411653435283158e-08, "logits/chosen": -1.7818365097045898, "logits/rejected": -1.727113127708435, "logps/chosen": -548.9638061523438, "logps/rejected": -586.8460693359375, "loss": 0.6011, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8288981914520264, "rewards/margins": 0.6869390606880188, "rewards/margins_max": 2.5096938610076904, "rewards/margins_min": -1.031667947769165, "rewards/margins_std": 1.6567041873931885, "rewards/rejected": -3.515836715698242, "step": 3640 }, { "epoch": 0.96, "grad_norm": 15.49412819176515, "learning_rate": 3.04583517959367e-08, "logits/chosen": -1.7403959035873413, "logits/rejected": -1.7668066024780273, "logps/chosen": -562.3262329101562, "logps/rejected": -675.0917358398438, "loss": 0.5439, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.7953028678894043, "rewards/margins": 0.6667811274528503, "rewards/margins_max": 2.6268081665039062, "rewards/margins_min": -0.9130493998527527, "rewards/margins_std": 1.567184567451477, "rewards/rejected": -3.4620842933654785, "step": 3650 }, { "epoch": 0.96, "grad_norm": 20.670871101851876, "learning_rate": 2.7006375255985984e-08, "logits/chosen": -1.955435037612915, "logits/rejected": -1.9204410314559937, "logps/chosen": -552.5899658203125, "logps/rejected": -596.6300048828125, "loss": 0.5383, "rewards/accuracies": 0.75, "rewards/chosen": -2.7342145442962646, "rewards/margins": 0.5816947221755981, "rewards/margins_max": 1.4977777004241943, "rewards/margins_min": -0.6119095087051392, "rewards/margins_std": 0.9219714999198914, "rewards/rejected": -3.3159091472625732, "step": 3660 }, { "epoch": 0.96, "grad_norm": 9.498604119796202, "learning_rate": 2.3760892972027328e-08, "logits/chosen": -1.7586755752563477, "logits/rejected": -1.7237780094146729, "logps/chosen": -582.6248779296875, "logps/rejected": -704.2200927734375, "loss": 0.5043, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8726859092712402, "rewards/margins": 1.117175817489624, "rewards/margins_max": 3.0784335136413574, "rewards/margins_min": -0.34074777364730835, "rewards/margins_std": 1.5470244884490967, "rewards/rejected": -3.989861249923706, "step": 3670 }, { "epoch": 0.96, "grad_norm": 10.573748315634472, "learning_rate": 2.072217594089765e-08, "logits/chosen": -1.7246586084365845, "logits/rejected": -1.6840505599975586, "logps/chosen": -577.3232421875, "logps/rejected": -693.5184326171875, "loss": 0.4901, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8268446922302246, "rewards/margins": 1.3486244678497314, "rewards/margins_max": 2.9450156688690186, "rewards/margins_min": -0.2968614995479584, "rewards/margins_std": 1.410911202430725, "rewards/rejected": -4.175469398498535, "step": 3680 }, { "epoch": 0.97, "grad_norm": 9.293412892654635, "learning_rate": 1.789047789459375e-08, "logits/chosen": -1.7801170349121094, "logits/rejected": -1.7285743951797485, "logps/chosen": -620.8328857421875, "logps/rejected": -644.5830078125, "loss": 0.4577, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.898940324783325, "rewards/margins": 1.0569028854370117, "rewards/margins_max": 2.875457763671875, "rewards/margins_min": -0.8081210851669312, "rewards/margins_std": 1.5787400007247925, "rewards/rejected": -3.955843448638916, "step": 3690 }, { "epoch": 0.97, "grad_norm": 8.662122806852608, "learning_rate": 1.5266035279088708e-08, "logits/chosen": -1.9422146081924438, "logits/rejected": -1.88083016872406, "logps/chosen": -591.3344116210938, "logps/rejected": -657.3201904296875, "loss": 0.5462, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6033148765563965, "rewards/margins": 1.1323473453521729, "rewards/margins_max": 2.659496307373047, "rewards/margins_min": -0.3541935384273529, "rewards/margins_std": 1.4090238809585571, "rewards/rejected": -3.7356624603271484, "step": 3700 }, { "epoch": 0.97, "eval_logits/chosen": -1.798087477684021, "eval_logits/rejected": -1.739272117614746, "eval_logps/chosen": -573.77587890625, "eval_logps/rejected": -668.7319946289062, "eval_loss": 0.4888877272605896, "eval_rewards/accuracies": 0.7757936716079712, "eval_rewards/chosen": -2.892885446548462, "eval_rewards/margins": 1.1730064153671265, "eval_rewards/margins_max": 3.681614637374878, "eval_rewards/margins_min": -0.9250013828277588, "eval_rewards/margins_std": 1.5308994054794312, "eval_rewards/rejected": -4.065891742706299, "eval_runtime": 224.1592, "eval_samples_per_second": 8.922, "eval_steps_per_second": 0.281, "step": 3700 }, { "epoch": 0.97, "grad_norm": 10.187315769519007, "learning_rate": 1.2849067234584623e-08, "logits/chosen": -1.6541836261749268, "logits/rejected": -1.6423285007476807, "logps/chosen": -562.486328125, "logps/rejected": -644.6295166015625, "loss": 0.4818, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.590837001800537, "rewards/margins": 1.0915751457214355, "rewards/margins_max": 2.5394041538238525, "rewards/margins_min": -0.0458584800362587, "rewards/margins_std": 1.152280330657959, "rewards/rejected": -3.6824119091033936, "step": 3710 }, { "epoch": 0.97, "grad_norm": 10.90555113381376, "learning_rate": 1.0639775577218625e-08, "logits/chosen": -1.8055120706558228, "logits/rejected": -1.7897742986679077, "logps/chosen": -553.2310791015625, "logps/rejected": -635.522216796875, "loss": 0.4983, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.527087688446045, "rewards/margins": 1.170078992843628, "rewards/margins_max": 3.445094347000122, "rewards/margins_min": -0.39945468306541443, "rewards/margins_std": 1.805127501487732, "rewards/rejected": -3.697166919708252, "step": 3720 }, { "epoch": 0.98, "grad_norm": 11.92290271414052, "learning_rate": 8.638344782207486e-09, "logits/chosen": -1.9485435485839844, "logits/rejected": -1.8312652111053467, "logps/chosen": -623.838623046875, "logps/rejected": -668.1798095703125, "loss": 0.5769, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8405277729034424, "rewards/margins": 1.2211335897445679, "rewards/margins_max": 2.975924015045166, "rewards/margins_min": -0.769484281539917, "rewards/margins_std": 1.7036542892456055, "rewards/rejected": -4.061661243438721, "step": 3730 }, { "epoch": 0.98, "grad_norm": 5.436474933685416, "learning_rate": 6.84494196844715e-09, "logits/chosen": -1.8510373830795288, "logits/rejected": -1.7533117532730103, "logps/chosen": -599.0010375976562, "logps/rejected": -741.609375, "loss": 0.4236, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.64265775680542, "rewards/margins": 1.4838783740997314, "rewards/margins_max": 3.2085087299346924, "rewards/margins_min": 0.04103611782193184, "rewards/margins_std": 1.4578731060028076, "rewards/rejected": -4.1265363693237305, "step": 3740 }, { "epoch": 0.98, "grad_norm": 6.067214234062591, "learning_rate": 5.259716884556121e-09, "logits/chosen": -1.732984185218811, "logits/rejected": -1.7672417163848877, "logps/chosen": -609.24951171875, "logps/rejected": -637.6512451171875, "loss": 0.4833, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.1531975269317627, "rewards/margins": 0.5745351314544678, "rewards/margins_max": 1.994765043258667, "rewards/margins_min": -0.4752461314201355, "rewards/margins_std": 1.1169458627700806, "rewards/rejected": -3.7277324199676514, "step": 3750 }, { "epoch": 0.98, "grad_norm": 4.128995405405694, "learning_rate": 3.882801896372967e-09, "logits/chosen": -1.7863538265228271, "logits/rejected": -1.8078734874725342, "logps/chosen": -514.0599365234375, "logps/rejected": -657.2833251953125, "loss": 0.4701, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.792879581451416, "rewards/margins": 1.402199625968933, "rewards/margins_max": 3.359687089920044, "rewards/margins_min": -0.5423997640609741, "rewards/margins_std": 1.8150278329849243, "rewards/rejected": -4.1950788497924805, "step": 3760 }, { "epoch": 0.99, "grad_norm": 7.92636216066682, "learning_rate": 2.7143119759026614e-09, "logits/chosen": -1.776789665222168, "logits/rejected": -1.7026447057724, "logps/chosen": -562.1932373046875, "logps/rejected": -743.6436767578125, "loss": 0.4404, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.546682119369507, "rewards/margins": 1.5486358404159546, "rewards/margins_max": 3.0837626457214355, "rewards/margins_min": -0.13449308276176453, "rewards/margins_std": 1.4089680910110474, "rewards/rejected": -4.09531831741333, "step": 3770 }, { "epoch": 0.99, "grad_norm": 9.416074571027378, "learning_rate": 1.754344691717591e-09, "logits/chosen": -1.7687597274780273, "logits/rejected": -1.6748729944229126, "logps/chosen": -559.87060546875, "logps/rejected": -682.3499755859375, "loss": 0.4873, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.637023687362671, "rewards/margins": 1.4059607982635498, "rewards/margins_max": 4.058658599853516, "rewards/margins_min": -0.2294035255908966, "rewards/margins_std": 1.8974485397338867, "rewards/rejected": -4.042984962463379, "step": 3780 }, { "epoch": 0.99, "grad_norm": 15.781454013223382, "learning_rate": 1.0029802008096335e-09, "logits/chosen": -1.7541720867156982, "logits/rejected": -1.7384878396987915, "logps/chosen": -561.750244140625, "logps/rejected": -604.4483032226562, "loss": 0.5544, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.141162395477295, "rewards/margins": 0.7217429280281067, "rewards/margins_max": 2.694789409637451, "rewards/margins_min": -1.0435711145401, "rewards/margins_std": 1.680803656578064, "rewards/rejected": -3.862905502319336, "step": 3790 }, { "epoch": 0.99, "grad_norm": 14.072748002011725, "learning_rate": 4.602812418974534e-10, "logits/chosen": -1.5986223220825195, "logits/rejected": -1.5537407398223877, "logps/chosen": -539.7682495117188, "logps/rejected": -601.5771484375, "loss": 0.4859, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.683328628540039, "rewards/margins": 1.3443286418914795, "rewards/margins_max": 3.07804799079895, "rewards/margins_min": -0.17747122049331665, "rewards/margins_std": 1.4419562816619873, "rewards/rejected": -4.0276570320129395, "step": 3800 }, { "epoch": 0.99, "eval_logits/chosen": -1.7996927499771118, "eval_logits/rejected": -1.740780234336853, "eval_logps/chosen": -574.4192504882812, "eval_logps/rejected": -669.5308227539062, "eval_loss": 0.48893651366233826, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -2.8993194103240967, "eval_rewards/margins": 1.174560785293579, "eval_rewards/margins_max": 3.685622453689575, "eval_rewards/margins_min": -0.9284926652908325, "eval_rewards/margins_std": 1.5334159135818481, "eval_rewards/rejected": -4.073880195617676, "eval_runtime": 223.854, "eval_samples_per_second": 8.934, "eval_steps_per_second": 0.281, "step": 3800 }, { "epoch": 1.0, "grad_norm": 13.248175729369308, "learning_rate": 1.2629313018819312e-10, "logits/chosen": -1.8308868408203125, "logits/rejected": -1.776044487953186, "logps/chosen": -621.7887573242188, "logps/rejected": -651.1958618164062, "loss": 0.5429, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.088937759399414, "rewards/margins": 0.713749885559082, "rewards/margins_max": 2.1206881999969482, "rewards/margins_min": -0.6197047829627991, "rewards/margins_std": 1.2205471992492676, "rewards/rejected": -3.8026881217956543, "step": 3810 }, { "epoch": 1.0, "grad_norm": 14.146203794202506, "learning_rate": 1.0437535929996855e-12, "logits/chosen": -1.7493774890899658, "logits/rejected": -1.7270513772964478, "logps/chosen": -666.0904541015625, "logps/rejected": -615.6712036132812, "loss": 0.5047, "rewards/accuracies": 0.625, "rewards/chosen": -3.3193583488464355, "rewards/margins": 0.6649240851402283, "rewards/margins_max": 1.9415212869644165, "rewards/margins_min": -0.6283342242240906, "rewards/margins_std": 1.110640048980713, "rewards/rejected": -3.9842822551727295, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.5262645053336271, "train_runtime": 24722.998, "train_samples_per_second": 2.473, "train_steps_per_second": 0.155 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }