diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999297541394882, + "eval_steps": 400, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002676032781401572, + "grad_norm": 6.702973036038713, + "learning_rate": 8.9126559714795e-09, + "logits/chosen": -0.06859037280082703, + "logits/rejected": 0.14135734736919403, + "logps/chosen": -1.716321349143982, + "logps/rejected": -1.8896639347076416, + "loss": 0.8499, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.716321349143982, + "rewards/margins": 0.17334221303462982, + "rewards/rejected": -1.8896639347076416, + "sft_loss": 1.4685341119766235, + "step": 5 + }, + { + "epoch": 0.005352065562803144, + "grad_norm": 9.644141530369794, + "learning_rate": 1.7825311942959e-08, + "logits/chosen": -0.006559779401868582, + "logits/rejected": 0.11489315330982208, + "logps/chosen": -1.802130937576294, + "logps/rejected": -1.8450310230255127, + "loss": 0.9404, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.802130937576294, + "rewards/margins": 0.04290003329515457, + "rewards/rejected": -1.8450310230255127, + "sft_loss": 1.5081868171691895, + "step": 10 + }, + { + "epoch": 0.008028098344204716, + "grad_norm": 10.051611837677612, + "learning_rate": 2.67379679144385e-08, + "logits/chosen": -0.03827698528766632, + "logits/rejected": 0.061770737171173096, + "logps/chosen": -1.6346393823623657, + "logps/rejected": -1.7642667293548584, + "loss": 0.9395, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6346393823623657, + "rewards/margins": 0.12962760031223297, + "rewards/rejected": -1.7642667293548584, + "sft_loss": 1.500527024269104, + "step": 15 + }, + { + "epoch": 0.010704131125606288, + "grad_norm": 6.647704350122245, + "learning_rate": 3.5650623885918e-08, + "logits/chosen": -0.039084356278181076, + "logits/rejected": 0.04886917397379875, + "logps/chosen": -1.7247798442840576, + "logps/rejected": -1.8060953617095947, + "loss": 0.9581, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.7247798442840576, + "rewards/margins": 0.08131532371044159, + "rewards/rejected": -1.8060953617095947, + "sft_loss": 1.5003466606140137, + "step": 20 + }, + { + "epoch": 0.013380163907007862, + "grad_norm": 14.90062696498156, + "learning_rate": 4.45632798573975e-08, + "logits/chosen": -0.048385441303253174, + "logits/rejected": 0.040979672223329544, + "logps/chosen": -1.8698208332061768, + "logps/rejected": -1.7785001993179321, + "loss": 1.0964, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.8698208332061768, + "rewards/margins": -0.09132039546966553, + "rewards/rejected": -1.7785001993179321, + "sft_loss": 1.5456753969192505, + "step": 25 + }, + { + "epoch": 0.016056196688409432, + "grad_norm": 8.344668876349031, + "learning_rate": 5.3475935828877e-08, + "logits/chosen": -0.08016878366470337, + "logits/rejected": 0.016008157283067703, + "logps/chosen": -1.908936858177185, + "logps/rejected": -1.8325151205062866, + "loss": 0.9981, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.908936858177185, + "rewards/margins": -0.07642142474651337, + "rewards/rejected": -1.8325151205062866, + "sft_loss": 1.6472301483154297, + "step": 30 + }, + { + "epoch": 0.018732229469811006, + "grad_norm": 8.961052501074231, + "learning_rate": 6.23885918003565e-08, + "logits/chosen": -0.05137736722826958, + "logits/rejected": 0.11413037776947021, + "logps/chosen": -1.8463821411132812, + "logps/rejected": -1.9967739582061768, + "loss": 0.9789, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.8463821411132812, + "rewards/margins": 0.15039141476154327, + "rewards/rejected": -1.9967739582061768, + "sft_loss": 1.5614362955093384, + "step": 35 + }, + { + "epoch": 0.021408262251212576, + "grad_norm": 8.308376321072556, + "learning_rate": 7.1301247771836e-08, + "logits/chosen": 0.028514528647065163, + "logits/rejected": 0.20667286217212677, + "logps/chosen": -1.8810441493988037, + "logps/rejected": -1.7430187463760376, + "loss": 1.0385, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.8810441493988037, + "rewards/margins": -0.138025164604187, + "rewards/rejected": -1.7430187463760376, + "sft_loss": 1.5189718008041382, + "step": 40 + }, + { + "epoch": 0.02408429503261415, + "grad_norm": 13.986161027627224, + "learning_rate": 8.021390374331551e-08, + "logits/chosen": 0.020828912034630775, + "logits/rejected": 0.22010770440101624, + "logps/chosen": -1.836972951889038, + "logps/rejected": -1.870734453201294, + "loss": 0.9983, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.836972951889038, + "rewards/margins": 0.03376161307096481, + "rewards/rejected": -1.870734453201294, + "sft_loss": 1.5358479022979736, + "step": 45 + }, + { + "epoch": 0.026760327814015723, + "grad_norm": 11.407268722651331, + "learning_rate": 8.9126559714795e-08, + "logits/chosen": -0.05352424457669258, + "logits/rejected": 0.09972499310970306, + "logps/chosen": -1.8980283737182617, + "logps/rejected": -1.7786309719085693, + "loss": 1.0466, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8980283737182617, + "rewards/margins": -0.11939746141433716, + "rewards/rejected": -1.7786309719085693, + "sft_loss": 1.5829687118530273, + "step": 50 + }, + { + "epoch": 0.029436360595417294, + "grad_norm": 7.521526699655726, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -0.11135170608758926, + "logits/rejected": 0.11175274848937988, + "logps/chosen": -1.8340307474136353, + "logps/rejected": -1.8674598932266235, + "loss": 1.0073, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.8340307474136353, + "rewards/margins": 0.033429183065891266, + "rewards/rejected": -1.8674598932266235, + "sft_loss": 1.5837171077728271, + "step": 55 + }, + { + "epoch": 0.032112393376818864, + "grad_norm": 7.480466581705964, + "learning_rate": 1.06951871657754e-07, + "logits/chosen": -0.08121562004089355, + "logits/rejected": 0.11202778667211533, + "logps/chosen": -1.787698745727539, + "logps/rejected": -1.892869234085083, + "loss": 0.9183, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.787698745727539, + "rewards/margins": 0.10517048835754395, + "rewards/rejected": -1.892869234085083, + "sft_loss": 1.543967366218567, + "step": 60 + }, + { + "epoch": 0.03478842615822044, + "grad_norm": 6.295629203227723, + "learning_rate": 1.158645276292335e-07, + "logits/chosen": -0.024999063462018967, + "logits/rejected": 0.12647823989391327, + "logps/chosen": -1.6358072757720947, + "logps/rejected": -1.7663062810897827, + "loss": 0.8902, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.6358072757720947, + "rewards/margins": 0.13049918413162231, + "rewards/rejected": -1.7663062810897827, + "sft_loss": 1.474139928817749, + "step": 65 + }, + { + "epoch": 0.03746445893962201, + "grad_norm": 11.701055217688808, + "learning_rate": 1.24777183600713e-07, + "logits/chosen": -0.08039408177137375, + "logits/rejected": 0.07166466116905212, + "logps/chosen": -1.7646785974502563, + "logps/rejected": -1.8103328943252563, + "loss": 1.0079, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -1.7646785974502563, + "rewards/margins": 0.04565427824854851, + "rewards/rejected": -1.8103328943252563, + "sft_loss": 1.6289875507354736, + "step": 70 + }, + { + "epoch": 0.04014049172102358, + "grad_norm": 13.253269721585566, + "learning_rate": 1.3368983957219251e-07, + "logits/chosen": -0.0566411130130291, + "logits/rejected": 0.12927421927452087, + "logps/chosen": -1.7753467559814453, + "logps/rejected": -2.0347797870635986, + "loss": 0.8738, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.7753467559814453, + "rewards/margins": 0.2594330310821533, + "rewards/rejected": -2.0347797870635986, + "sft_loss": 1.5650640726089478, + "step": 75 + }, + { + "epoch": 0.04281652450242515, + "grad_norm": 8.386802086569567, + "learning_rate": 1.42602495543672e-07, + "logits/chosen": -0.005293454043567181, + "logits/rejected": 0.10028276592493057, + "logps/chosen": -1.7161533832550049, + "logps/rejected": -1.748827576637268, + "loss": 0.9613, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.7161533832550049, + "rewards/margins": 0.03267427533864975, + "rewards/rejected": -1.748827576637268, + "sft_loss": 1.5250723361968994, + "step": 80 + }, + { + "epoch": 0.04549255728382673, + "grad_norm": 6.061474731949857, + "learning_rate": 1.5151515151515152e-07, + "logits/chosen": -0.14668717980384827, + "logits/rejected": 0.10416042804718018, + "logps/chosen": -1.7873996496200562, + "logps/rejected": -1.9631637334823608, + "loss": 0.9281, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.7873996496200562, + "rewards/margins": 0.1757640838623047, + "rewards/rejected": -1.9631637334823608, + "sft_loss": 1.4945319890975952, + "step": 85 + }, + { + "epoch": 0.0481685900652283, + "grad_norm": 14.741981588462846, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": 0.08980236947536469, + "logits/rejected": 0.05404208227992058, + "logps/chosen": -1.7418218851089478, + "logps/rejected": -1.771594762802124, + "loss": 1.0002, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.7418218851089478, + "rewards/margins": 0.02977297641336918, + "rewards/rejected": -1.771594762802124, + "sft_loss": 1.4563801288604736, + "step": 90 + }, + { + "epoch": 0.05084462284662987, + "grad_norm": 5.61298553003334, + "learning_rate": 1.693404634581105e-07, + "logits/chosen": -0.07583034038543701, + "logits/rejected": 0.07639636844396591, + "logps/chosen": -1.788304090499878, + "logps/rejected": -1.9007819890975952, + "loss": 0.9449, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.788304090499878, + "rewards/margins": 0.11247781664133072, + "rewards/rejected": -1.9007819890975952, + "sft_loss": 1.5202165842056274, + "step": 95 + }, + { + "epoch": 0.05352065562803145, + "grad_norm": 5.958519674378306, + "learning_rate": 1.7825311942959e-07, + "logits/chosen": -0.02949271723628044, + "logits/rejected": 0.037863839417696, + "logps/chosen": -1.6825199127197266, + "logps/rejected": -1.7904939651489258, + "loss": 0.9087, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.6825199127197266, + "rewards/margins": 0.10797406733036041, + "rewards/rejected": -1.7904939651489258, + "sft_loss": 1.4878851175308228, + "step": 100 + }, + { + "epoch": 0.05619668840943302, + "grad_norm": 8.883510402309446, + "learning_rate": 1.8716577540106952e-07, + "logits/chosen": 0.054995059967041016, + "logits/rejected": 0.08293595165014267, + "logps/chosen": -1.6364829540252686, + "logps/rejected": -1.803472876548767, + "loss": 0.8893, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.6364829540252686, + "rewards/margins": 0.16699011623859406, + "rewards/rejected": -1.803472876548767, + "sft_loss": 1.4342707395553589, + "step": 105 + }, + { + "epoch": 0.05887272119083459, + "grad_norm": 6.887732422302257, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": 0.014778594486415386, + "logits/rejected": 0.11244583129882812, + "logps/chosen": -1.669877290725708, + "logps/rejected": -1.7259242534637451, + "loss": 0.9626, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.669877290725708, + "rewards/margins": 0.0560469925403595, + "rewards/rejected": -1.7259242534637451, + "sft_loss": 1.4641985893249512, + "step": 110 + }, + { + "epoch": 0.06154875397223616, + "grad_norm": 10.053029624630458, + "learning_rate": 2.049910873440285e-07, + "logits/chosen": 0.01813269779086113, + "logits/rejected": 0.22458188235759735, + "logps/chosen": -1.657580018043518, + "logps/rejected": -1.9396737813949585, + "loss": 0.834, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.657580018043518, + "rewards/margins": 0.28209394216537476, + "rewards/rejected": -1.9396737813949585, + "sft_loss": 1.563565969467163, + "step": 115 + }, + { + "epoch": 0.06422478675363773, + "grad_norm": 6.716771226271919, + "learning_rate": 2.13903743315508e-07, + "logits/chosen": -0.0731058418750763, + "logits/rejected": 0.10547780990600586, + "logps/chosen": -1.7298002243041992, + "logps/rejected": -1.8506269454956055, + "loss": 0.9118, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.7298002243041992, + "rewards/margins": 0.12082656472921371, + "rewards/rejected": -1.8506269454956055, + "sft_loss": 1.5554392337799072, + "step": 120 + }, + { + "epoch": 0.0669008195350393, + "grad_norm": 7.206509063947677, + "learning_rate": 2.2281639928698751e-07, + "logits/chosen": -0.0829402357339859, + "logits/rejected": 0.049699828028678894, + "logps/chosen": -1.6518990993499756, + "logps/rejected": -1.6049795150756836, + "loss": 0.992, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6518990993499756, + "rewards/margins": -0.04691971093416214, + "rewards/rejected": -1.6049795150756836, + "sft_loss": 1.5169137716293335, + "step": 125 + }, + { + "epoch": 0.06957685231644088, + "grad_norm": 9.398500000703004, + "learning_rate": 2.31729055258467e-07, + "logits/chosen": 0.039764903485774994, + "logits/rejected": 0.1792357712984085, + "logps/chosen": -1.6893419027328491, + "logps/rejected": -1.8123197555541992, + "loss": 0.8713, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.6893419027328491, + "rewards/margins": 0.12297798693180084, + "rewards/rejected": -1.8123197555541992, + "sft_loss": 1.577155590057373, + "step": 130 + }, + { + "epoch": 0.07225288509784245, + "grad_norm": 16.324023931321047, + "learning_rate": 2.406417112299465e-07, + "logits/chosen": -0.047886431217193604, + "logits/rejected": 0.07469049841165543, + "logps/chosen": -1.7392584085464478, + "logps/rejected": -1.7592836618423462, + "loss": 0.9732, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.7392584085464478, + "rewards/margins": 0.020025230944156647, + "rewards/rejected": -1.7592836618423462, + "sft_loss": 1.5168216228485107, + "step": 135 + }, + { + "epoch": 0.07492891787924402, + "grad_norm": 11.546083545636698, + "learning_rate": 2.49554367201426e-07, + "logits/chosen": -0.038810618221759796, + "logits/rejected": 0.1334814727306366, + "logps/chosen": -1.6959807872772217, + "logps/rejected": -1.8378652334213257, + "loss": 0.8841, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.6959807872772217, + "rewards/margins": 0.141884446144104, + "rewards/rejected": -1.8378652334213257, + "sft_loss": 1.5605138540267944, + "step": 140 + }, + { + "epoch": 0.0776049506606456, + "grad_norm": 10.449440333498194, + "learning_rate": 2.5846702317290554e-07, + "logits/chosen": -0.02952158823609352, + "logits/rejected": 0.12635311484336853, + "logps/chosen": -1.611771583557129, + "logps/rejected": -1.7252804040908813, + "loss": 0.898, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.611771583557129, + "rewards/margins": 0.11350883543491364, + "rewards/rejected": -1.7252804040908813, + "sft_loss": 1.5070559978485107, + "step": 145 + }, + { + "epoch": 0.08028098344204716, + "grad_norm": 11.254343815302727, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -0.047827668488025665, + "logits/rejected": 0.12055377662181854, + "logps/chosen": -1.565514326095581, + "logps/rejected": -1.564286470413208, + "loss": 0.9705, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.565514326095581, + "rewards/margins": -0.0012281477684155107, + "rewards/rejected": -1.564286470413208, + "sft_loss": 1.3673644065856934, + "step": 150 + }, + { + "epoch": 0.08295701622344874, + "grad_norm": 9.262506789237806, + "learning_rate": 2.762923351158645e-07, + "logits/chosen": -0.07001235336065292, + "logits/rejected": -0.018733523786067963, + "logps/chosen": -1.5835145711898804, + "logps/rejected": -1.6756055355072021, + "loss": 0.9092, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5835145711898804, + "rewards/margins": 0.09209098666906357, + "rewards/rejected": -1.6756055355072021, + "sft_loss": 1.4590342044830322, + "step": 155 + }, + { + "epoch": 0.0856330490048503, + "grad_norm": 8.130608494491169, + "learning_rate": 2.85204991087344e-07, + "logits/chosen": -0.14981473982334137, + "logits/rejected": -0.00554328877478838, + "logps/chosen": -1.7048254013061523, + "logps/rejected": -1.6821569204330444, + "loss": 0.9968, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.7048254013061523, + "rewards/margins": -0.02266838401556015, + "rewards/rejected": -1.6821569204330444, + "sft_loss": 1.5138128995895386, + "step": 160 + }, + { + "epoch": 0.08830908178625188, + "grad_norm": 8.446134985236093, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -0.0554620735347271, + "logits/rejected": 0.11860129982233047, + "logps/chosen": -1.5411068201065063, + "logps/rejected": -1.6820226907730103, + "loss": 0.9089, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5411068201065063, + "rewards/margins": 0.14091593027114868, + "rewards/rejected": -1.6820226907730103, + "sft_loss": 1.4017772674560547, + "step": 165 + }, + { + "epoch": 0.09098511456765346, + "grad_norm": 13.961121834810543, + "learning_rate": 3.0303030303030305e-07, + "logits/chosen": -0.10742922127246857, + "logits/rejected": -0.052524250000715256, + "logps/chosen": -1.6910076141357422, + "logps/rejected": -1.7397321462631226, + "loss": 0.9506, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6910076141357422, + "rewards/margins": 0.04872459918260574, + "rewards/rejected": -1.7397321462631226, + "sft_loss": 1.5169093608856201, + "step": 170 + }, + { + "epoch": 0.09366114734905502, + "grad_norm": 10.42747131693837, + "learning_rate": 3.1194295900178254e-07, + "logits/chosen": 0.04428332671523094, + "logits/rejected": 0.042808979749679565, + "logps/chosen": -1.5634596347808838, + "logps/rejected": -1.6661045551300049, + "loss": 0.9397, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.5634596347808838, + "rewards/margins": 0.10264496505260468, + "rewards/rejected": -1.6661045551300049, + "sft_loss": 1.4734010696411133, + "step": 175 + }, + { + "epoch": 0.0963371801304566, + "grad_norm": 8.78321914975296, + "learning_rate": 3.2085561497326203e-07, + "logits/chosen": 0.012711775489151478, + "logits/rejected": 0.014043694362044334, + "logps/chosen": -1.5782628059387207, + "logps/rejected": -1.7380142211914062, + "loss": 0.9172, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.5782628059387207, + "rewards/margins": 0.15975116193294525, + "rewards/rejected": -1.7380142211914062, + "sft_loss": 1.4693598747253418, + "step": 180 + }, + { + "epoch": 0.09901321291185818, + "grad_norm": 8.723622382486115, + "learning_rate": 3.297682709447415e-07, + "logits/chosen": -0.15454931557178497, + "logits/rejected": -0.06420670449733734, + "logps/chosen": -1.5361636877059937, + "logps/rejected": -1.6049896478652954, + "loss": 0.9592, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.5361636877059937, + "rewards/margins": 0.06882590800523758, + "rewards/rejected": -1.6049896478652954, + "sft_loss": 1.4361121654510498, + "step": 185 + }, + { + "epoch": 0.10168924569325974, + "grad_norm": 9.379186782623458, + "learning_rate": 3.38680926916221e-07, + "logits/chosen": -0.06259158998727798, + "logits/rejected": 0.0626489445567131, + "logps/chosen": -1.5418685674667358, + "logps/rejected": -1.6475862264633179, + "loss": 0.8949, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.5418685674667358, + "rewards/margins": 0.1057177409529686, + "rewards/rejected": -1.6475862264633179, + "sft_loss": 1.4272186756134033, + "step": 190 + }, + { + "epoch": 0.10436527847466132, + "grad_norm": 6.312300436261769, + "learning_rate": 3.475935828877005e-07, + "logits/chosen": 0.009148378856480122, + "logits/rejected": 0.16754285991191864, + "logps/chosen": -1.3938941955566406, + "logps/rejected": -1.554271936416626, + "loss": 0.8745, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3938941955566406, + "rewards/margins": 0.16037783026695251, + "rewards/rejected": -1.554271936416626, + "sft_loss": 1.3546264171600342, + "step": 195 + }, + { + "epoch": 0.1070413112560629, + "grad_norm": 14.617155597119094, + "learning_rate": 3.5650623885918e-07, + "logits/chosen": -0.08748116344213486, + "logits/rejected": 0.05105700343847275, + "logps/chosen": -1.512632131576538, + "logps/rejected": -1.541290283203125, + "loss": 0.944, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.512632131576538, + "rewards/margins": 0.028658073395490646, + "rewards/rejected": -1.541290283203125, + "sft_loss": 1.4626529216766357, + "step": 200 + }, + { + "epoch": 0.10971734403746446, + "grad_norm": 14.83212977546729, + "learning_rate": 3.654188948306595e-07, + "logits/chosen": -0.060840390622615814, + "logits/rejected": 0.08631278574466705, + "logps/chosen": -1.433002233505249, + "logps/rejected": -1.4933949708938599, + "loss": 0.9357, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.433002233505249, + "rewards/margins": 0.060392655432224274, + "rewards/rejected": -1.4933949708938599, + "sft_loss": 1.3502676486968994, + "step": 205 + }, + { + "epoch": 0.11239337681886603, + "grad_norm": 12.073571859501529, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -0.16160300374031067, + "logits/rejected": 0.02457263693213463, + "logps/chosen": -1.4971989393234253, + "logps/rejected": -1.664298415184021, + "loss": 0.8794, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4971989393234253, + "rewards/margins": 0.16709943115711212, + "rewards/rejected": -1.664298415184021, + "sft_loss": 1.4122134447097778, + "step": 210 + }, + { + "epoch": 0.1150694096002676, + "grad_norm": 8.053391971103682, + "learning_rate": 3.8324420677361853e-07, + "logits/chosen": -0.20213842391967773, + "logits/rejected": 0.04158937931060791, + "logps/chosen": -1.5002050399780273, + "logps/rejected": -1.5914572477340698, + "loss": 0.875, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5002050399780273, + "rewards/margins": 0.09125222265720367, + "rewards/rejected": -1.5914572477340698, + "sft_loss": 1.4370949268341064, + "step": 215 + }, + { + "epoch": 0.11774544238166917, + "grad_norm": 17.92140582684194, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": 0.04896925762295723, + "logits/rejected": 0.15036346018314362, + "logps/chosen": -1.4815274477005005, + "logps/rejected": -1.6879806518554688, + "loss": 0.8464, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4815274477005005, + "rewards/margins": 0.2064533233642578, + "rewards/rejected": -1.6879806518554688, + "sft_loss": 1.4312843084335327, + "step": 220 + }, + { + "epoch": 0.12042147516307075, + "grad_norm": 6.819571995837769, + "learning_rate": 4.010695187165775e-07, + "logits/chosen": -0.10190838575363159, + "logits/rejected": 0.0719117596745491, + "logps/chosen": -1.4662240743637085, + "logps/rejected": -1.6161525249481201, + "loss": 0.8542, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4662240743637085, + "rewards/margins": 0.1499285250902176, + "rewards/rejected": -1.6161525249481201, + "sft_loss": 1.3879258632659912, + "step": 225 + }, + { + "epoch": 0.12309750794447231, + "grad_norm": 6.60282038631885, + "learning_rate": 4.09982174688057e-07, + "logits/chosen": -0.038417570292949677, + "logits/rejected": 0.04065591096878052, + "logps/chosen": -1.5287741422653198, + "logps/rejected": -1.698678970336914, + "loss": 0.882, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.5287741422653198, + "rewards/margins": 0.16990481317043304, + "rewards/rejected": -1.698678970336914, + "sft_loss": 1.3734486103057861, + "step": 230 + }, + { + "epoch": 0.1257735407258739, + "grad_norm": 11.553765477929723, + "learning_rate": 4.188948306595365e-07, + "logits/chosen": 0.01179618202149868, + "logits/rejected": 0.15246328711509705, + "logps/chosen": -1.4749536514282227, + "logps/rejected": -1.6516231298446655, + "loss": 0.8382, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4749536514282227, + "rewards/margins": 0.17666950821876526, + "rewards/rejected": -1.6516231298446655, + "sft_loss": 1.3738255500793457, + "step": 235 + }, + { + "epoch": 0.12844957350727546, + "grad_norm": 7.820091999170089, + "learning_rate": 4.27807486631016e-07, + "logits/chosen": -0.04290889948606491, + "logits/rejected": 0.08651427924633026, + "logps/chosen": -1.4832552671432495, + "logps/rejected": -1.678340196609497, + "loss": 0.8709, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4832552671432495, + "rewards/margins": 0.19508466124534607, + "rewards/rejected": -1.678340196609497, + "sft_loss": 1.446438193321228, + "step": 240 + }, + { + "epoch": 0.13112560628867703, + "grad_norm": 7.853590240194409, + "learning_rate": 4.3672014260249554e-07, + "logits/chosen": 0.032018642872571945, + "logits/rejected": 0.1550910770893097, + "logps/chosen": -1.581587791442871, + "logps/rejected": -1.6717331409454346, + "loss": 0.8975, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.581587791442871, + "rewards/margins": 0.09014533460140228, + "rewards/rejected": -1.6717331409454346, + "sft_loss": 1.5218582153320312, + "step": 245 + }, + { + "epoch": 0.1338016390700786, + "grad_norm": 13.935759547676863, + "learning_rate": 4.4563279857397503e-07, + "logits/chosen": -0.06505148857831955, + "logits/rejected": 0.10135525465011597, + "logps/chosen": -1.5776017904281616, + "logps/rejected": -1.6428560018539429, + "loss": 0.9479, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5776017904281616, + "rewards/margins": 0.06525401026010513, + "rewards/rejected": -1.6428560018539429, + "sft_loss": 1.4219996929168701, + "step": 250 + }, + { + "epoch": 0.1364776718514802, + "grad_norm": 8.872910049052326, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": -0.025789733976125717, + "logits/rejected": 0.12136568874120712, + "logps/chosen": -1.4498792886734009, + "logps/rejected": -1.6182514429092407, + "loss": 0.8624, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4498792886734009, + "rewards/margins": 0.16837215423583984, + "rewards/rejected": -1.6182514429092407, + "sft_loss": 1.334236741065979, + "step": 255 + }, + { + "epoch": 0.13915370463288176, + "grad_norm": 7.407385018535111, + "learning_rate": 4.63458110516934e-07, + "logits/chosen": -0.2350751906633377, + "logits/rejected": -0.12682147324085236, + "logps/chosen": -1.6020698547363281, + "logps/rejected": -1.7310168743133545, + "loss": 0.8466, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.6020698547363281, + "rewards/margins": 0.12894682586193085, + "rewards/rejected": -1.7310168743133545, + "sft_loss": 1.4834848642349243, + "step": 260 + }, + { + "epoch": 0.1418297374142833, + "grad_norm": 15.909087871185044, + "learning_rate": 4.723707664884135e-07, + "logits/chosen": -0.0795278325676918, + "logits/rejected": 0.006105656735599041, + "logps/chosen": -1.6134687662124634, + "logps/rejected": -1.7488367557525635, + "loss": 0.8955, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.6134687662124634, + "rewards/margins": 0.13536801934242249, + "rewards/rejected": -1.7488367557525635, + "sft_loss": 1.5514605045318604, + "step": 265 + }, + { + "epoch": 0.1445057701956849, + "grad_norm": 6.423043314074773, + "learning_rate": 4.81283422459893e-07, + "logits/chosen": -0.0875733345746994, + "logits/rejected": 0.049231600016355515, + "logps/chosen": -1.5094425678253174, + "logps/rejected": -1.632433295249939, + "loss": 0.8701, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.5094425678253174, + "rewards/margins": 0.12299074977636337, + "rewards/rejected": -1.632433295249939, + "sft_loss": 1.4312238693237305, + "step": 270 + }, + { + "epoch": 0.14718180297708647, + "grad_norm": 8.545207155313289, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -0.03895934298634529, + "logits/rejected": 0.05629078298807144, + "logps/chosen": -1.4638268947601318, + "logps/rejected": -1.689480185508728, + "loss": 0.8625, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4638268947601318, + "rewards/margins": 0.2256532907485962, + "rewards/rejected": -1.689480185508728, + "sft_loss": 1.37359619140625, + "step": 275 + }, + { + "epoch": 0.14985783575848804, + "grad_norm": 15.708485896926108, + "learning_rate": 4.99108734402852e-07, + "logits/chosen": -0.10516306012868881, + "logits/rejected": 0.05365392565727234, + "logps/chosen": -1.5601650476455688, + "logps/rejected": -1.680772066116333, + "loss": 0.8802, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5601650476455688, + "rewards/margins": 0.12060710042715073, + "rewards/rejected": -1.680772066116333, + "sft_loss": 1.4628015756607056, + "step": 280 + }, + { + "epoch": 0.15253386853988962, + "grad_norm": 7.9296611950593885, + "learning_rate": 5.080213903743315e-07, + "logits/chosen": -0.059190742671489716, + "logits/rejected": 0.08300630003213882, + "logps/chosen": -1.5395736694335938, + "logps/rejected": -1.6552746295928955, + "loss": 0.9105, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.5395736694335938, + "rewards/margins": 0.11570099741220474, + "rewards/rejected": -1.6552746295928955, + "sft_loss": 1.4917641878128052, + "step": 285 + }, + { + "epoch": 0.1552099013212912, + "grad_norm": 8.512531880461733, + "learning_rate": 5.169340463458111e-07, + "logits/chosen": -0.14478448033332825, + "logits/rejected": 0.14671705663204193, + "logps/chosen": -1.5264160633087158, + "logps/rejected": -1.7059307098388672, + "loss": 0.8277, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5264160633087158, + "rewards/margins": 0.1795145720243454, + "rewards/rejected": -1.7059307098388672, + "sft_loss": 1.465397834777832, + "step": 290 + }, + { + "epoch": 0.15788593410269275, + "grad_norm": 11.31996235651409, + "learning_rate": 5.258467023172905e-07, + "logits/chosen": -0.0691608339548111, + "logits/rejected": -0.006187940947711468, + "logps/chosen": -1.4758808612823486, + "logps/rejected": -1.6043269634246826, + "loss": 0.8655, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4758808612823486, + "rewards/margins": 0.1284460872411728, + "rewards/rejected": -1.6043269634246826, + "sft_loss": 1.3975608348846436, + "step": 295 + }, + { + "epoch": 0.16056196688409433, + "grad_norm": 9.416282928218743, + "learning_rate": 5.347593582887701e-07, + "logits/chosen": -0.07168702781200409, + "logits/rejected": 0.10054339468479156, + "logps/chosen": -1.5283973217010498, + "logps/rejected": -1.6419986486434937, + "loss": 0.8884, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.5283973217010498, + "rewards/margins": 0.11360123008489609, + "rewards/rejected": -1.6419986486434937, + "sft_loss": 1.483135461807251, + "step": 300 + }, + { + "epoch": 0.1632379996654959, + "grad_norm": 7.35648695914466, + "learning_rate": 5.436720142602496e-07, + "logits/chosen": -0.009979024529457092, + "logits/rejected": 0.06142239645123482, + "logps/chosen": -1.6317169666290283, + "logps/rejected": -1.6297073364257812, + "loss": 0.9396, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.6317169666290283, + "rewards/margins": -0.0020095347426831722, + "rewards/rejected": -1.6297073364257812, + "sft_loss": 1.5488479137420654, + "step": 305 + }, + { + "epoch": 0.16591403244689748, + "grad_norm": 10.439877376996783, + "learning_rate": 5.52584670231729e-07, + "logits/chosen": -0.18711742758750916, + "logits/rejected": -0.08916506916284561, + "logps/chosen": -1.6041101217269897, + "logps/rejected": -1.719856858253479, + "loss": 0.8965, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.6041101217269897, + "rewards/margins": 0.11574681848287582, + "rewards/rejected": -1.719856858253479, + "sft_loss": 1.5262537002563477, + "step": 310 + }, + { + "epoch": 0.16859006522829906, + "grad_norm": 12.40055960906028, + "learning_rate": 5.614973262032086e-07, + "logits/chosen": -0.015852855518460274, + "logits/rejected": 0.14568349719047546, + "logps/chosen": -1.6016238927841187, + "logps/rejected": -1.7875289916992188, + "loss": 0.8617, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.6016238927841187, + "rewards/margins": 0.18590494990348816, + "rewards/rejected": -1.7875289916992188, + "sft_loss": 1.5366556644439697, + "step": 315 + }, + { + "epoch": 0.1712660980097006, + "grad_norm": 6.813830943157419, + "learning_rate": 5.70409982174688e-07, + "logits/chosen": -0.05537568777799606, + "logits/rejected": 0.07943960279226303, + "logps/chosen": -1.5311832427978516, + "logps/rejected": -1.5911250114440918, + "loss": 0.9035, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.5311832427978516, + "rewards/margins": 0.05994180589914322, + "rewards/rejected": -1.5911250114440918, + "sft_loss": 1.47402822971344, + "step": 320 + }, + { + "epoch": 0.17394213079110218, + "grad_norm": 10.004743649510763, + "learning_rate": 5.793226381461676e-07, + "logits/chosen": -0.11411911249160767, + "logits/rejected": 0.006765827536582947, + "logps/chosen": -1.557644248008728, + "logps/rejected": -1.8630332946777344, + "loss": 0.8201, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.557644248008728, + "rewards/margins": 0.3053889870643616, + "rewards/rejected": -1.8630332946777344, + "sft_loss": 1.547539234161377, + "step": 325 + }, + { + "epoch": 0.17661816357250376, + "grad_norm": 12.547787288359798, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": -0.013900229707360268, + "logits/rejected": 0.14141085743904114, + "logps/chosen": -1.5468895435333252, + "logps/rejected": -1.8233592510223389, + "loss": 0.8041, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5468895435333252, + "rewards/margins": 0.2764698565006256, + "rewards/rejected": -1.8233592510223389, + "sft_loss": 1.4977320432662964, + "step": 330 + }, + { + "epoch": 0.17929419635390534, + "grad_norm": 19.340150760011156, + "learning_rate": 5.971479500891266e-07, + "logits/chosen": 0.04217713326215744, + "logits/rejected": 0.15277662873268127, + "logps/chosen": -1.6085857152938843, + "logps/rejected": -1.6779794692993164, + "loss": 0.8962, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6085857152938843, + "rewards/margins": 0.06939395517110825, + "rewards/rejected": -1.6779794692993164, + "sft_loss": 1.4822218418121338, + "step": 335 + }, + { + "epoch": 0.18197022913530692, + "grad_norm": 14.640742158325542, + "learning_rate": 6.060606060606061e-07, + "logits/chosen": -0.02979375422000885, + "logits/rejected": 0.12206296622753143, + "logps/chosen": -1.6841856241226196, + "logps/rejected": -1.7910856008529663, + "loss": 0.9172, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6841856241226196, + "rewards/margins": 0.10689990222454071, + "rewards/rejected": -1.7910856008529663, + "sft_loss": 1.5484795570373535, + "step": 340 + }, + { + "epoch": 0.1846462619167085, + "grad_norm": 12.93329168947757, + "learning_rate": 6.149732620320855e-07, + "logits/chosen": 0.05653851106762886, + "logits/rejected": 0.08597005903720856, + "logps/chosen": -1.5816501379013062, + "logps/rejected": -1.7690460681915283, + "loss": 0.865, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5816501379013062, + "rewards/margins": 0.1873955875635147, + "rewards/rejected": -1.7690460681915283, + "sft_loss": 1.5128852128982544, + "step": 345 + }, + { + "epoch": 0.18732229469811004, + "grad_norm": 11.403163971079575, + "learning_rate": 6.238859180035651e-07, + "logits/chosen": -0.023158203810453415, + "logits/rejected": 0.070060595870018, + "logps/chosen": -1.537097692489624, + "logps/rejected": -1.6681251525878906, + "loss": 0.8959, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.537097692489624, + "rewards/margins": 0.13102751970291138, + "rewards/rejected": -1.6681251525878906, + "sft_loss": 1.4899991750717163, + "step": 350 + }, + { + "epoch": 0.18999832747951162, + "grad_norm": 10.269830532082047, + "learning_rate": 6.327985739750445e-07, + "logits/chosen": -0.09361349791288376, + "logits/rejected": 0.12785111367702484, + "logps/chosen": -1.6137135028839111, + "logps/rejected": -1.6899824142456055, + "loss": 0.9026, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.6137135028839111, + "rewards/margins": 0.07626868039369583, + "rewards/rejected": -1.6899824142456055, + "sft_loss": 1.5542749166488647, + "step": 355 + }, + { + "epoch": 0.1926743602609132, + "grad_norm": 9.251468349092507, + "learning_rate": 6.417112299465241e-07, + "logits/chosen": -0.06010516732931137, + "logits/rejected": 0.021687399595975876, + "logps/chosen": -1.5847867727279663, + "logps/rejected": -1.755059003829956, + "loss": 0.8823, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.5847867727279663, + "rewards/margins": 0.17027229070663452, + "rewards/rejected": -1.755059003829956, + "sft_loss": 1.4727002382278442, + "step": 360 + }, + { + "epoch": 0.19535039304231477, + "grad_norm": 14.888840674530762, + "learning_rate": 6.506238859180035e-07, + "logits/chosen": 0.00939253717660904, + "logits/rejected": 0.09567335247993469, + "logps/chosen": -1.546452283859253, + "logps/rejected": -1.636813759803772, + "loss": 0.9025, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.546452283859253, + "rewards/margins": 0.09036144614219666, + "rewards/rejected": -1.636813759803772, + "sft_loss": 1.4358354806900024, + "step": 365 + }, + { + "epoch": 0.19802642582371635, + "grad_norm": 12.156898971625248, + "learning_rate": 6.59536541889483e-07, + "logits/chosen": -0.028807152062654495, + "logits/rejected": 0.061083655804395676, + "logps/chosen": -1.530022382736206, + "logps/rejected": -1.6157872676849365, + "loss": 0.9193, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.530022382736206, + "rewards/margins": 0.08576497435569763, + "rewards/rejected": -1.6157872676849365, + "sft_loss": 1.4403166770935059, + "step": 370 + }, + { + "epoch": 0.2007024586051179, + "grad_norm": 11.627248013410282, + "learning_rate": 6.684491978609626e-07, + "logits/chosen": -0.05842500180006027, + "logits/rejected": 0.09691628813743591, + "logps/chosen": -1.5447757244110107, + "logps/rejected": -1.7703924179077148, + "loss": 0.8379, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5447757244110107, + "rewards/margins": 0.22561664879322052, + "rewards/rejected": -1.7703924179077148, + "sft_loss": 1.4894663095474243, + "step": 375 + }, + { + "epoch": 0.20337849138651948, + "grad_norm": 10.537843750135279, + "learning_rate": 6.77361853832442e-07, + "logits/chosen": -0.03339865058660507, + "logits/rejected": 0.054922886192798615, + "logps/chosen": -1.6277233362197876, + "logps/rejected": -1.8356781005859375, + "loss": 0.8116, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6277233362197876, + "rewards/margins": 0.20795480906963348, + "rewards/rejected": -1.8356781005859375, + "sft_loss": 1.512076497077942, + "step": 380 + }, + { + "epoch": 0.20605452416792105, + "grad_norm": 5.665583910535334, + "learning_rate": 6.862745098039216e-07, + "logits/chosen": 0.006865252740681171, + "logits/rejected": 0.08682769536972046, + "logps/chosen": -1.678415060043335, + "logps/rejected": -1.740708351135254, + "loss": 0.8908, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.678415060043335, + "rewards/margins": 0.06229352951049805, + "rewards/rejected": -1.740708351135254, + "sft_loss": 1.5830042362213135, + "step": 385 + }, + { + "epoch": 0.20873055694932263, + "grad_norm": 10.41529627121867, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": 0.038954682648181915, + "logits/rejected": 0.19942323863506317, + "logps/chosen": -1.7041614055633545, + "logps/rejected": -1.8604240417480469, + "loss": 0.8541, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.7041614055633545, + "rewards/margins": 0.15626260638237, + "rewards/rejected": -1.8604240417480469, + "sft_loss": 1.5955750942230225, + "step": 390 + }, + { + "epoch": 0.2114065897307242, + "grad_norm": 10.077546406750734, + "learning_rate": 7.040998217468806e-07, + "logits/chosen": -0.07024117559194565, + "logits/rejected": 0.08584646135568619, + "logps/chosen": -1.6505581140518188, + "logps/rejected": -1.7517807483673096, + "loss": 0.8694, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.6505581140518188, + "rewards/margins": 0.1012226939201355, + "rewards/rejected": -1.7517807483673096, + "sft_loss": 1.5330431461334229, + "step": 395 + }, + { + "epoch": 0.2140826225121258, + "grad_norm": 8.621579993141502, + "learning_rate": 7.1301247771836e-07, + "logits/chosen": 0.07951502501964569, + "logits/rejected": 0.17376390099525452, + "logps/chosen": -1.7101367712020874, + "logps/rejected": -1.8504247665405273, + "loss": 0.839, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.7101367712020874, + "rewards/margins": 0.14028772711753845, + "rewards/rejected": -1.8504247665405273, + "sft_loss": 1.5114778280258179, + "step": 400 + }, + { + "epoch": 0.2140826225121258, + "eval_logits/chosen": 0.28865498304367065, + "eval_logits/rejected": 0.37876713275909424, + "eval_logps/chosen": -1.7061017751693726, + "eval_logps/rejected": -1.9017139673233032, + "eval_loss": 0.8444245457649231, + "eval_rewards/accuracies": 0.5563797950744629, + "eval_rewards/chosen": -1.7061017751693726, + "eval_rewards/margins": 0.1956121325492859, + "eval_rewards/rejected": -1.9017139673233032, + "eval_runtime": 43.5944, + "eval_samples_per_second": 30.853, + "eval_sft_loss": 1.5361721515655518, + "eval_steps_per_second": 7.73, + "step": 400 + }, + { + "epoch": 0.21675865529352734, + "grad_norm": 8.801414813330842, + "learning_rate": 7.219251336898395e-07, + "logits/chosen": 0.012902075424790382, + "logits/rejected": 0.10784679651260376, + "logps/chosen": -1.7358152866363525, + "logps/rejected": -1.8969757556915283, + "loss": 0.8908, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.7358152866363525, + "rewards/margins": 0.1611604392528534, + "rewards/rejected": -1.8969757556915283, + "sft_loss": 1.5918419361114502, + "step": 405 + }, + { + "epoch": 0.2194346880749289, + "grad_norm": 12.045545052797944, + "learning_rate": 7.30837789661319e-07, + "logits/chosen": 0.05082521587610245, + "logits/rejected": 0.18324458599090576, + "logps/chosen": -1.6618913412094116, + "logps/rejected": -1.844369888305664, + "loss": 0.8491, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.6618913412094116, + "rewards/margins": 0.18247857689857483, + "rewards/rejected": -1.844369888305664, + "sft_loss": 1.5552400350570679, + "step": 410 + }, + { + "epoch": 0.2221107208563305, + "grad_norm": 8.576746510646535, + "learning_rate": 7.397504456327985e-07, + "logits/chosen": 0.03030974604189396, + "logits/rejected": 0.07934688031673431, + "logps/chosen": -1.669297218322754, + "logps/rejected": -1.8434665203094482, + "loss": 0.8484, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.669297218322754, + "rewards/margins": 0.17416930198669434, + "rewards/rejected": -1.8434665203094482, + "sft_loss": 1.5341603755950928, + "step": 415 + }, + { + "epoch": 0.22478675363773207, + "grad_norm": 10.63547091961648, + "learning_rate": 7.486631016042781e-07, + "logits/chosen": 0.008567921817302704, + "logits/rejected": 0.2103685885667801, + "logps/chosen": -1.561725378036499, + "logps/rejected": -1.7165727615356445, + "loss": 0.8708, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.561725378036499, + "rewards/margins": 0.15484726428985596, + "rewards/rejected": -1.7165727615356445, + "sft_loss": 1.4999353885650635, + "step": 420 + }, + { + "epoch": 0.22746278641913364, + "grad_norm": 9.932013113637575, + "learning_rate": 7.575757575757575e-07, + "logits/chosen": -0.008445126004517078, + "logits/rejected": 0.1929190754890442, + "logps/chosen": -1.6287310123443604, + "logps/rejected": -1.8927959203720093, + "loss": 0.7954, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6287310123443604, + "rewards/margins": 0.26406508684158325, + "rewards/rejected": -1.8927959203720093, + "sft_loss": 1.5982462167739868, + "step": 425 + }, + { + "epoch": 0.2301388192005352, + "grad_norm": 8.876616130983669, + "learning_rate": 7.664884135472371e-07, + "logits/chosen": -0.04979996010661125, + "logits/rejected": 0.1489320695400238, + "logps/chosen": -1.6180919408798218, + "logps/rejected": -1.9462039470672607, + "loss": 0.8028, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6180919408798218, + "rewards/margins": 0.3281119763851166, + "rewards/rejected": -1.9462039470672607, + "sft_loss": 1.580506682395935, + "step": 430 + }, + { + "epoch": 0.23281485198193677, + "grad_norm": 20.45419933146445, + "learning_rate": 7.754010695187165e-07, + "logits/chosen": 0.02379441447556019, + "logits/rejected": 0.11689828336238861, + "logps/chosen": -1.5695067644119263, + "logps/rejected": -1.7231642007827759, + "loss": 0.8429, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5695067644119263, + "rewards/margins": 0.15365752577781677, + "rewards/rejected": -1.7231642007827759, + "sft_loss": 1.517533540725708, + "step": 435 + }, + { + "epoch": 0.23549088476333835, + "grad_norm": 13.65531705237304, + "learning_rate": 7.84313725490196e-07, + "logits/chosen": 0.013252335600554943, + "logits/rejected": 0.10951529443264008, + "logps/chosen": -1.6240532398223877, + "logps/rejected": -1.825933814048767, + "loss": 0.8266, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6240532398223877, + "rewards/margins": 0.201880544424057, + "rewards/rejected": -1.825933814048767, + "sft_loss": 1.5613758563995361, + "step": 440 + }, + { + "epoch": 0.23816691754473993, + "grad_norm": 12.24154907085509, + "learning_rate": 7.932263814616755e-07, + "logits/chosen": -0.023964716121554375, + "logits/rejected": 0.09082510322332382, + "logps/chosen": -1.6749906539916992, + "logps/rejected": -1.9429988861083984, + "loss": 0.8244, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6749906539916992, + "rewards/margins": 0.26800835132598877, + "rewards/rejected": -1.9429988861083984, + "sft_loss": 1.583179235458374, + "step": 445 + }, + { + "epoch": 0.2408429503261415, + "grad_norm": 14.76664890654527, + "learning_rate": 8.02139037433155e-07, + "logits/chosen": 0.030245855450630188, + "logits/rejected": 0.1601337343454361, + "logps/chosen": -1.7110360860824585, + "logps/rejected": -1.9090163707733154, + "loss": 0.8058, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.7110360860824585, + "rewards/margins": 0.1979803740978241, + "rewards/rejected": -1.9090163707733154, + "sft_loss": 1.5668268203735352, + "step": 450 + }, + { + "epoch": 0.24351898310754308, + "grad_norm": 19.790468019674638, + "learning_rate": 8.110516934046346e-07, + "logits/chosen": -0.006326199974864721, + "logits/rejected": 0.07478408515453339, + "logps/chosen": -1.6535755395889282, + "logps/rejected": -1.9522926807403564, + "loss": 0.7848, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6535755395889282, + "rewards/margins": 0.2987171709537506, + "rewards/rejected": -1.9522926807403564, + "sft_loss": 1.537989616394043, + "step": 455 + }, + { + "epoch": 0.24619501588894463, + "grad_norm": 9.616950707647714, + "learning_rate": 8.19964349376114e-07, + "logits/chosen": -0.11517757177352905, + "logits/rejected": 0.003994188271462917, + "logps/chosen": -1.746042013168335, + "logps/rejected": -1.8898332118988037, + "loss": 0.8426, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.746042013168335, + "rewards/margins": 0.1437911093235016, + "rewards/rejected": -1.8898332118988037, + "sft_loss": 1.6806637048721313, + "step": 460 + }, + { + "epoch": 0.2488710486703462, + "grad_norm": 11.900667307712698, + "learning_rate": 8.288770053475936e-07, + "logits/chosen": 0.14643609523773193, + "logits/rejected": 0.16485807299613953, + "logps/chosen": -1.7409279346466064, + "logps/rejected": -1.9578319787979126, + "loss": 0.8387, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.7409279346466064, + "rewards/margins": 0.21690388023853302, + "rewards/rejected": -1.9578319787979126, + "sft_loss": 1.6416947841644287, + "step": 465 + }, + { + "epoch": 0.2515470814517478, + "grad_norm": 8.830181983289117, + "learning_rate": 8.37789661319073e-07, + "logits/chosen": 0.1644625961780548, + "logits/rejected": 0.12652386724948883, + "logps/chosen": -1.6838014125823975, + "logps/rejected": -1.9070326089859009, + "loss": 0.8349, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.6838014125823975, + "rewards/margins": 0.22323103249073029, + "rewards/rejected": -1.9070326089859009, + "sft_loss": 1.596195101737976, + "step": 470 + }, + { + "epoch": 0.25422311423314936, + "grad_norm": 7.995168104534913, + "learning_rate": 8.467023172905525e-07, + "logits/chosen": -0.024302974343299866, + "logits/rejected": 0.11604616791009903, + "logps/chosen": -1.6668624877929688, + "logps/rejected": -2.0832455158233643, + "loss": 0.7536, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6668624877929688, + "rewards/margins": 0.4163830280303955, + "rewards/rejected": -2.0832455158233643, + "sft_loss": 1.625457763671875, + "step": 475 + }, + { + "epoch": 0.2568991470145509, + "grad_norm": 13.35967258064493, + "learning_rate": 8.55614973262032e-07, + "logits/chosen": 0.0026796311140060425, + "logits/rejected": 0.19870570302009583, + "logps/chosen": -1.6937164068222046, + "logps/rejected": -1.8854515552520752, + "loss": 0.822, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.6937164068222046, + "rewards/margins": 0.19173488020896912, + "rewards/rejected": -1.8854515552520752, + "sft_loss": 1.6137354373931885, + "step": 480 + }, + { + "epoch": 0.2595751797959525, + "grad_norm": 14.777387782967626, + "learning_rate": 8.645276292335115e-07, + "logits/chosen": 0.02986701764166355, + "logits/rejected": 0.07667236030101776, + "logps/chosen": -1.900377631187439, + "logps/rejected": -2.0468382835388184, + "loss": 0.8557, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.900377631187439, + "rewards/margins": 0.14646077156066895, + "rewards/rejected": -2.0468382835388184, + "sft_loss": 1.723910927772522, + "step": 485 + }, + { + "epoch": 0.26225121257735406, + "grad_norm": 9.826571934223194, + "learning_rate": 8.734402852049911e-07, + "logits/chosen": 0.05959752947092056, + "logits/rejected": 0.12927241623401642, + "logps/chosen": -1.8736746311187744, + "logps/rejected": -2.0128977298736572, + "loss": 0.8551, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.8736746311187744, + "rewards/margins": 0.13922320306301117, + "rewards/rejected": -2.0128977298736572, + "sft_loss": 1.7088829278945923, + "step": 490 + }, + { + "epoch": 0.26492724535875567, + "grad_norm": 12.412657506346838, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": 0.02171451412141323, + "logits/rejected": 0.05109437555074692, + "logps/chosen": -1.885629653930664, + "logps/rejected": -2.019071578979492, + "loss": 0.8437, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.885629653930664, + "rewards/margins": 0.13344189524650574, + "rewards/rejected": -2.019071578979492, + "sft_loss": 1.7931492328643799, + "step": 495 + }, + { + "epoch": 0.2676032781401572, + "grad_norm": 9.482682538601056, + "learning_rate": 8.912655971479501e-07, + "logits/chosen": 0.008008052594959736, + "logits/rejected": 0.10441839694976807, + "logps/chosen": -1.82148015499115, + "logps/rejected": -2.05816912651062, + "loss": 0.809, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.82148015499115, + "rewards/margins": 0.23668909072875977, + "rewards/rejected": -2.05816912651062, + "sft_loss": 1.700110673904419, + "step": 500 + }, + { + "epoch": 0.27027931092155877, + "grad_norm": 12.48443598930354, + "learning_rate": 9.001782531194295e-07, + "logits/chosen": 0.012660378590226173, + "logits/rejected": 0.16065539419651031, + "logps/chosen": -1.8914144039154053, + "logps/rejected": -2.045323610305786, + "loss": 0.8174, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.8914144039154053, + "rewards/margins": 0.1539088636636734, + "rewards/rejected": -2.045323610305786, + "sft_loss": 1.7708075046539307, + "step": 505 + }, + { + "epoch": 0.2729553437029604, + "grad_norm": 9.036453223871478, + "learning_rate": 9.09090909090909e-07, + "logits/chosen": 0.1636802852153778, + "logits/rejected": 0.21730521321296692, + "logps/chosen": -1.868528127670288, + "logps/rejected": -2.1642730236053467, + "loss": 0.7701, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.868528127670288, + "rewards/margins": 0.2957451343536377, + "rewards/rejected": -2.1642730236053467, + "sft_loss": 1.730780839920044, + "step": 510 + }, + { + "epoch": 0.2756313764843619, + "grad_norm": 10.891419449579097, + "learning_rate": 9.180035650623885e-07, + "logits/chosen": 0.14418451488018036, + "logits/rejected": 0.25062134861946106, + "logps/chosen": -1.7637317180633545, + "logps/rejected": -2.0156896114349365, + "loss": 0.7843, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.7637317180633545, + "rewards/margins": 0.2519580125808716, + "rewards/rejected": -2.0156896114349365, + "sft_loss": 1.6884260177612305, + "step": 515 + }, + { + "epoch": 0.27830740926576353, + "grad_norm": 9.775456886285932, + "learning_rate": 9.26916221033868e-07, + "logits/chosen": 0.039710883051157, + "logits/rejected": 0.17652074992656708, + "logps/chosen": -1.8773629665374756, + "logps/rejected": -2.135971784591675, + "loss": 0.7854, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.8773629665374756, + "rewards/margins": 0.25860869884490967, + "rewards/rejected": -2.135971784591675, + "sft_loss": 1.8785078525543213, + "step": 520 + }, + { + "epoch": 0.2809834420471651, + "grad_norm": 17.399966824971457, + "learning_rate": 9.358288770053476e-07, + "logits/chosen": 0.20017535984516144, + "logits/rejected": 0.2788700759410858, + "logps/chosen": -1.9711978435516357, + "logps/rejected": -2.333888530731201, + "loss": 0.7413, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.9711978435516357, + "rewards/margins": 0.3626905083656311, + "rewards/rejected": -2.333888530731201, + "sft_loss": 1.9129832983016968, + "step": 525 + }, + { + "epoch": 0.2836594748285666, + "grad_norm": 16.108980962918235, + "learning_rate": 9.44741532976827e-07, + "logits/chosen": 0.18143267929553986, + "logits/rejected": 0.2672853469848633, + "logps/chosen": -1.9471544027328491, + "logps/rejected": -2.2187883853912354, + "loss": 0.7997, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.9471544027328491, + "rewards/margins": 0.2716338634490967, + "rewards/rejected": -2.2187883853912354, + "sft_loss": 1.801098108291626, + "step": 530 + }, + { + "epoch": 0.28633550760996823, + "grad_norm": 14.859693867200447, + "learning_rate": 9.536541889483066e-07, + "logits/chosen": -0.002810171339660883, + "logits/rejected": 0.24968405067920685, + "logps/chosen": -1.9894745349884033, + "logps/rejected": -2.2667486667633057, + "loss": 0.737, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.9894745349884033, + "rewards/margins": 0.27727416157722473, + "rewards/rejected": -2.2667486667633057, + "sft_loss": 1.834472894668579, + "step": 535 + }, + { + "epoch": 0.2890115403913698, + "grad_norm": 13.266783239205989, + "learning_rate": 9.62566844919786e-07, + "logits/chosen": 0.11695842444896698, + "logits/rejected": 0.1895637810230255, + "logps/chosen": -2.2287280559539795, + "logps/rejected": -2.49006986618042, + "loss": 0.781, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.2287280559539795, + "rewards/margins": 0.2613416314125061, + "rewards/rejected": -2.49006986618042, + "sft_loss": 2.1219325065612793, + "step": 540 + }, + { + "epoch": 0.2916875731727714, + "grad_norm": 10.108061461389307, + "learning_rate": 9.714795008912655e-07, + "logits/chosen": 0.00664617121219635, + "logits/rejected": 0.19402989745140076, + "logps/chosen": -2.17495059967041, + "logps/rejected": -2.560760974884033, + "loss": 0.6977, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.17495059967041, + "rewards/margins": 0.38581031560897827, + "rewards/rejected": -2.560760974884033, + "sft_loss": 2.0547683238983154, + "step": 545 + }, + { + "epoch": 0.29436360595417294, + "grad_norm": 11.156872123357896, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": 0.14837445318698883, + "logits/rejected": 0.21960575878620148, + "logps/chosen": -2.2893216609954834, + "logps/rejected": -2.6165287494659424, + "loss": 0.7158, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.2893216609954834, + "rewards/margins": 0.32720714807510376, + "rewards/rejected": -2.6165287494659424, + "sft_loss": 2.1706032752990723, + "step": 550 + }, + { + "epoch": 0.2970396387355745, + "grad_norm": 21.327228435759405, + "learning_rate": 9.893048128342244e-07, + "logits/chosen": 0.0808831974864006, + "logits/rejected": 0.19863127171993256, + "logps/chosen": -2.471397876739502, + "logps/rejected": -2.6428585052490234, + "loss": 0.7911, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.471397876739502, + "rewards/margins": 0.171460822224617, + "rewards/rejected": -2.6428585052490234, + "sft_loss": 2.363668441772461, + "step": 555 + }, + { + "epoch": 0.2997156715169761, + "grad_norm": 15.817975460786185, + "learning_rate": 9.98217468805704e-07, + "logits/chosen": 0.17479531466960907, + "logits/rejected": 0.19065909087657928, + "logps/chosen": -2.38142728805542, + "logps/rejected": -2.700078248977661, + "loss": 0.7215, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.38142728805542, + "rewards/margins": 0.31865087151527405, + "rewards/rejected": -2.700078248977661, + "sft_loss": 2.4128060340881348, + "step": 560 + }, + { + "epoch": 0.30239170429837764, + "grad_norm": 11.167026754067493, + "learning_rate": 9.999984476788462e-07, + "logits/chosen": 0.15115851163864136, + "logits/rejected": 0.2050451785326004, + "logps/chosen": -2.5232253074645996, + "logps/rejected": -2.9136128425598145, + "loss": 0.6758, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.5232253074645996, + "rewards/margins": 0.39038750529289246, + "rewards/rejected": -2.9136128425598145, + "sft_loss": 2.4833340644836426, + "step": 565 + }, + { + "epoch": 0.30506773707977924, + "grad_norm": 29.251751625509854, + "learning_rate": 9.999921413906797e-07, + "logits/chosen": 0.06527841836214066, + "logits/rejected": 0.26166507601737976, + "logps/chosen": -2.654595375061035, + "logps/rejected": -3.0485172271728516, + "loss": 0.6882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.654595375061035, + "rewards/margins": 0.39392179250717163, + "rewards/rejected": -3.0485172271728516, + "sft_loss": 2.623623847961426, + "step": 570 + }, + { + "epoch": 0.3077437698611808, + "grad_norm": 14.777746420060607, + "learning_rate": 9.999809841765644e-07, + "logits/chosen": 0.10745315253734589, + "logits/rejected": 0.16344238817691803, + "logps/chosen": -2.7796332836151123, + "logps/rejected": -3.1552608013153076, + "loss": 0.7132, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.7796332836151123, + "rewards/margins": 0.37562793493270874, + "rewards/rejected": -3.1552608013153076, + "sft_loss": 2.741098403930664, + "step": 575 + }, + { + "epoch": 0.3104198026425824, + "grad_norm": 12.739144398411966, + "learning_rate": 9.999649761447477e-07, + "logits/chosen": 0.09595675766468048, + "logits/rejected": 0.24499674141407013, + "logps/chosen": -2.9956271648406982, + "logps/rejected": -3.428938627243042, + "loss": 0.6871, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.9956271648406982, + "rewards/margins": 0.43331179022789, + "rewards/rejected": -3.428938627243042, + "sft_loss": 2.928342819213867, + "step": 580 + }, + { + "epoch": 0.31309583542398395, + "grad_norm": 17.255767939413705, + "learning_rate": 9.999441174505398e-07, + "logits/chosen": 0.07325030118227005, + "logits/rejected": 0.1624840795993805, + "logps/chosen": -3.4379382133483887, + "logps/rejected": -3.6763598918914795, + "loss": 0.79, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.4379382133483887, + "rewards/margins": 0.23842184245586395, + "rewards/rejected": -3.6763598918914795, + "sft_loss": 3.286072254180908, + "step": 585 + }, + { + "epoch": 0.3157718682053855, + "grad_norm": 30.110227167602943, + "learning_rate": 9.999184082963116e-07, + "logits/chosen": 0.10350818932056427, + "logits/rejected": 0.2149147093296051, + "logps/chosen": -3.244036912918091, + "logps/rejected": -3.4989047050476074, + "loss": 0.7593, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.244036912918091, + "rewards/margins": 0.25486817955970764, + "rewards/rejected": -3.4989047050476074, + "sft_loss": 3.1437976360321045, + "step": 590 + }, + { + "epoch": 0.3184479009867871, + "grad_norm": 18.945077633365436, + "learning_rate": 9.998878489314937e-07, + "logits/chosen": 0.1482018530368805, + "logits/rejected": 0.2577352523803711, + "logps/chosen": -2.873711585998535, + "logps/rejected": -3.2073111534118652, + "loss": 0.7128, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.873711585998535, + "rewards/margins": 0.3336000144481659, + "rewards/rejected": -3.2073111534118652, + "sft_loss": 2.8837180137634277, + "step": 595 + }, + { + "epoch": 0.32112393376818865, + "grad_norm": 9.958916715868252, + "learning_rate": 9.99852439652573e-07, + "logits/chosen": 0.12357846647500992, + "logits/rejected": 0.2520459294319153, + "logps/chosen": -2.982365131378174, + "logps/rejected": -3.3198161125183105, + "loss": 0.6921, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.982365131378174, + "rewards/margins": 0.33745133876800537, + "rewards/rejected": -3.3198161125183105, + "sft_loss": 2.9125170707702637, + "step": 600 + }, + { + "epoch": 0.32379996654959026, + "grad_norm": 16.38786490932435, + "learning_rate": 9.998121808030904e-07, + "logits/chosen": 0.08862508088350296, + "logits/rejected": 0.17105832695960999, + "logps/chosen": -3.2011170387268066, + "logps/rejected": -3.5023789405822754, + "loss": 0.753, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.2011170387268066, + "rewards/margins": 0.3012619912624359, + "rewards/rejected": -3.5023789405822754, + "sft_loss": 3.0842275619506836, + "step": 605 + }, + { + "epoch": 0.3264759993309918, + "grad_norm": 34.784158500457494, + "learning_rate": 9.997670727736379e-07, + "logits/chosen": 0.14439894258975983, + "logits/rejected": 0.30359843373298645, + "logps/chosen": -3.0514113903045654, + "logps/rejected": -3.3972702026367188, + "loss": 0.7366, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.0514113903045654, + "rewards/margins": 0.3458591103553772, + "rewards/rejected": -3.3972702026367188, + "sft_loss": 2.9700963497161865, + "step": 610 + }, + { + "epoch": 0.32915203211239336, + "grad_norm": 11.02894563208569, + "learning_rate": 9.99717116001853e-07, + "logits/chosen": 0.07742391526699066, + "logits/rejected": 0.1711132973432541, + "logps/chosen": -3.041868209838867, + "logps/rejected": -3.5872230529785156, + "loss": 0.6433, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.041868209838867, + "rewards/margins": 0.5453550219535828, + "rewards/rejected": -3.5872230529785156, + "sft_loss": 2.9712107181549072, + "step": 615 + }, + { + "epoch": 0.33182806489379496, + "grad_norm": 13.435575037564124, + "learning_rate": 9.996623109724173e-07, + "logits/chosen": 0.17571452260017395, + "logits/rejected": 0.22912061214447021, + "logps/chosen": -3.235530376434326, + "logps/rejected": -3.6874687671661377, + "loss": 0.6783, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.235530376434326, + "rewards/margins": 0.4519377648830414, + "rewards/rejected": -3.6874687671661377, + "sft_loss": 3.3161187171936035, + "step": 620 + }, + { + "epoch": 0.3345040976751965, + "grad_norm": 11.963965881026747, + "learning_rate": 9.996026582170488e-07, + "logits/chosen": 0.16318285465240479, + "logits/rejected": 0.28264936804771423, + "logps/chosen": -3.1633148193359375, + "logps/rejected": -3.6431198120117188, + "loss": 0.6719, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.1633148193359375, + "rewards/margins": 0.4798053205013275, + "rewards/rejected": -3.6431198120117188, + "sft_loss": 3.2683768272399902, + "step": 625 + }, + { + "epoch": 0.3371801304565981, + "grad_norm": 18.982814005299204, + "learning_rate": 9.995381583144996e-07, + "logits/chosen": 0.1366763710975647, + "logits/rejected": 0.2427402287721634, + "logps/chosen": -3.2480270862579346, + "logps/rejected": -3.759871244430542, + "loss": 0.6599, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.2480270862579346, + "rewards/margins": 0.5118443369865417, + "rewards/rejected": -3.759871244430542, + "sft_loss": 3.296314239501953, + "step": 630 + }, + { + "epoch": 0.33985616323799966, + "grad_norm": 14.820255923961053, + "learning_rate": 9.994688118905471e-07, + "logits/chosen": 0.12495915591716766, + "logits/rejected": 0.33405324816703796, + "logps/chosen": -3.371495008468628, + "logps/rejected": -3.818183183670044, + "loss": 0.6938, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.371495008468628, + "rewards/margins": 0.44668784737586975, + "rewards/rejected": -3.818183183670044, + "sft_loss": 3.383021831512451, + "step": 635 + }, + { + "epoch": 0.3425321960194012, + "grad_norm": 17.167666064407857, + "learning_rate": 9.993946196179912e-07, + "logits/chosen": 0.02807859145104885, + "logits/rejected": 0.20716509222984314, + "logps/chosen": -3.329270601272583, + "logps/rejected": -3.7738585472106934, + "loss": 0.6796, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.329270601272583, + "rewards/margins": 0.4445876479148865, + "rewards/rejected": -3.7738585472106934, + "sft_loss": 3.4169044494628906, + "step": 640 + }, + { + "epoch": 0.3452082288008028, + "grad_norm": 12.591955459885122, + "learning_rate": 9.993155822166455e-07, + "logits/chosen": 0.0873405784368515, + "logits/rejected": 0.1538153886795044, + "logps/chosen": -3.218961715698242, + "logps/rejected": -3.6624724864959717, + "loss": 0.6938, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.218961715698242, + "rewards/margins": 0.44351091980934143, + "rewards/rejected": -3.6624724864959717, + "sft_loss": 3.1909327507019043, + "step": 645 + }, + { + "epoch": 0.34788426158220437, + "grad_norm": 17.348198575764926, + "learning_rate": 9.992317004533313e-07, + "logits/chosen": 0.12633930146694183, + "logits/rejected": 0.23183250427246094, + "logps/chosen": -3.4124233722686768, + "logps/rejected": -3.8890254497528076, + "loss": 0.6747, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.4124233722686768, + "rewards/margins": 0.476602166891098, + "rewards/rejected": -3.8890254497528076, + "sft_loss": 3.488661289215088, + "step": 650 + }, + { + "epoch": 0.350560294363606, + "grad_norm": 19.495944078784344, + "learning_rate": 9.991429751418696e-07, + "logits/chosen": 0.15044409036636353, + "logits/rejected": 0.1703573614358902, + "logps/chosen": -3.2523608207702637, + "logps/rejected": -3.721022844314575, + "loss": 0.7206, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.2523608207702637, + "rewards/margins": 0.468661367893219, + "rewards/rejected": -3.721022844314575, + "sft_loss": 3.3398749828338623, + "step": 655 + }, + { + "epoch": 0.3532363271450075, + "grad_norm": 12.739827332278542, + "learning_rate": 9.99049407143074e-07, + "logits/chosen": 0.10364080965518951, + "logits/rejected": 0.20852389931678772, + "logps/chosen": -3.156210422515869, + "logps/rejected": -3.4665565490722656, + "loss": 0.7243, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.156210422515869, + "rewards/margins": 0.3103458881378174, + "rewards/rejected": -3.4665565490722656, + "sft_loss": 3.1437571048736572, + "step": 660 + }, + { + "epoch": 0.35591235992640907, + "grad_norm": 9.131578117600116, + "learning_rate": 9.989509973647416e-07, + "logits/chosen": 0.09303895384073257, + "logits/rejected": 0.2246369570493698, + "logps/chosen": -3.0079312324523926, + "logps/rejected": -3.3630542755126953, + "loss": 0.7239, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.0079312324523926, + "rewards/margins": 0.35512271523475647, + "rewards/rejected": -3.3630542755126953, + "sft_loss": 3.066347599029541, + "step": 665 + }, + { + "epoch": 0.3585883927078107, + "grad_norm": 12.124152976286277, + "learning_rate": 9.988477467616445e-07, + "logits/chosen": 0.08354990184307098, + "logits/rejected": 0.24085800349712372, + "logps/chosen": -2.965276002883911, + "logps/rejected": -3.3744754791259766, + "loss": 0.6552, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.965276002883911, + "rewards/margins": 0.40919971466064453, + "rewards/rejected": -3.3744754791259766, + "sft_loss": 3.086196184158325, + "step": 670 + }, + { + "epoch": 0.3612644254892122, + "grad_norm": 14.467133514240121, + "learning_rate": 9.987396563355205e-07, + "logits/chosen": 0.09642884135246277, + "logits/rejected": 0.16047967970371246, + "logps/chosen": -2.9351863861083984, + "logps/rejected": -3.4026923179626465, + "loss": 0.6612, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9351863861083984, + "rewards/margins": 0.46750617027282715, + "rewards/rejected": -3.4026923179626465, + "sft_loss": 2.9935317039489746, + "step": 675 + }, + { + "epoch": 0.36394045827061383, + "grad_norm": 12.228933979475576, + "learning_rate": 9.986267271350631e-07, + "logits/chosen": 0.1422261893749237, + "logits/rejected": 0.28388795256614685, + "logps/chosen": -3.123826503753662, + "logps/rejected": -3.486147403717041, + "loss": 0.7477, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.123826503753662, + "rewards/margins": 0.3623208999633789, + "rewards/rejected": -3.486147403717041, + "sft_loss": 3.068068027496338, + "step": 680 + }, + { + "epoch": 0.3666164910520154, + "grad_norm": 22.57125542560206, + "learning_rate": 9.985089602559123e-07, + "logits/chosen": 0.13599984347820282, + "logits/rejected": 0.2769896388053894, + "logps/chosen": -3.2404942512512207, + "logps/rejected": -3.6959774494171143, + "loss": 0.6779, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.2404942512512207, + "rewards/margins": 0.45548272132873535, + "rewards/rejected": -3.6959774494171143, + "sft_loss": 3.2109158039093018, + "step": 685 + }, + { + "epoch": 0.369292523833417, + "grad_norm": 18.452385824632163, + "learning_rate": 9.983863568406428e-07, + "logits/chosen": 0.16817834973335266, + "logits/rejected": 0.19235409796237946, + "logps/chosen": -3.385051727294922, + "logps/rejected": -3.7719883918762207, + "loss": 0.7072, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.385051727294922, + "rewards/margins": 0.38693660497665405, + "rewards/rejected": -3.7719883918762207, + "sft_loss": 3.4286434650421143, + "step": 690 + }, + { + "epoch": 0.37196855661481854, + "grad_norm": 11.372268524897146, + "learning_rate": 9.982589180787532e-07, + "logits/chosen": 0.08407465368509293, + "logits/rejected": 0.15698882937431335, + "logps/chosen": -3.1884450912475586, + "logps/rejected": -3.640734910964966, + "loss": 0.656, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.1884450912475586, + "rewards/margins": 0.4522898197174072, + "rewards/rejected": -3.640734910964966, + "sft_loss": 3.3043384552001953, + "step": 695 + }, + { + "epoch": 0.3746445893962201, + "grad_norm": 21.43770166832855, + "learning_rate": 9.981266452066553e-07, + "logits/chosen": 0.021909546107053757, + "logits/rejected": 0.1328798085451126, + "logps/chosen": -3.5039210319519043, + "logps/rejected": -3.8362011909484863, + "loss": 0.6862, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.5039210319519043, + "rewards/margins": 0.3322799503803253, + "rewards/rejected": -3.8362011909484863, + "sft_loss": 3.4569506645202637, + "step": 700 + }, + { + "epoch": 0.3773206221776217, + "grad_norm": 14.27115720520455, + "learning_rate": 9.979895395076608e-07, + "logits/chosen": 0.07725761830806732, + "logits/rejected": 0.2512449622154236, + "logps/chosen": -3.555420398712158, + "logps/rejected": -4.093667507171631, + "loss": 0.6392, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.555420398712158, + "rewards/margins": 0.5382481813430786, + "rewards/rejected": -4.093667507171631, + "sft_loss": 3.611607789993286, + "step": 705 + }, + { + "epoch": 0.37999665495902324, + "grad_norm": 18.789545823364513, + "learning_rate": 9.9784760231197e-07, + "logits/chosen": 0.10939677059650421, + "logits/rejected": 0.20701321959495544, + "logps/chosen": -3.553346633911133, + "logps/rejected": -4.05139684677124, + "loss": 0.6367, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.553346633911133, + "rewards/margins": 0.4980502128601074, + "rewards/rejected": -4.05139684677124, + "sft_loss": 3.549403429031372, + "step": 710 + }, + { + "epoch": 0.38267268774042484, + "grad_norm": 20.283066722727373, + "learning_rate": 9.97700834996658e-07, + "logits/chosen": 0.0789613127708435, + "logits/rejected": 0.2348795384168625, + "logps/chosen": -3.755260467529297, + "logps/rejected": -4.174482822418213, + "loss": 0.6705, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.755260467529297, + "rewards/margins": 0.41922205686569214, + "rewards/rejected": -4.174482822418213, + "sft_loss": 3.7192587852478027, + "step": 715 + }, + { + "epoch": 0.3853487205218264, + "grad_norm": 17.71024407775548, + "learning_rate": 9.97549238985662e-07, + "logits/chosen": 0.1549479067325592, + "logits/rejected": 0.32185259461402893, + "logps/chosen": -3.6697444915771484, + "logps/rejected": -4.121180057525635, + "loss": 0.6921, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.6697444915771484, + "rewards/margins": 0.45143526792526245, + "rewards/rejected": -4.121180057525635, + "sft_loss": 3.7580153942108154, + "step": 720 + }, + { + "epoch": 0.38802475330322794, + "grad_norm": 27.931643171716484, + "learning_rate": 9.973928157497674e-07, + "logits/chosen": 0.07944132387638092, + "logits/rejected": 0.20254309475421906, + "logps/chosen": -3.4026761054992676, + "logps/rejected": -3.8807315826416016, + "loss": 0.6469, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.4026761054992676, + "rewards/margins": 0.4780558943748474, + "rewards/rejected": -3.8807315826416016, + "sft_loss": 3.532989978790283, + "step": 725 + }, + { + "epoch": 0.39070078608462955, + "grad_norm": 14.385729673838856, + "learning_rate": 9.972315668065927e-07, + "logits/chosen": 0.07879441976547241, + "logits/rejected": 0.19542153179645538, + "logps/chosen": -3.5237643718719482, + "logps/rejected": -3.905362606048584, + "loss": 0.7068, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.5237643718719482, + "rewards/margins": 0.38159847259521484, + "rewards/rejected": -3.905362606048584, + "sft_loss": 3.5806221961975098, + "step": 730 + }, + { + "epoch": 0.3933768188660311, + "grad_norm": 13.641550923885305, + "learning_rate": 9.97065493720576e-07, + "logits/chosen": 0.0845077782869339, + "logits/rejected": 0.1788623034954071, + "logps/chosen": -3.2255218029022217, + "logps/rejected": -3.5273890495300293, + "loss": 0.7208, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.2255218029022217, + "rewards/margins": 0.3018674850463867, + "rewards/rejected": -3.5273890495300293, + "sft_loss": 3.248542308807373, + "step": 735 + }, + { + "epoch": 0.3960528516474327, + "grad_norm": 26.105825510166056, + "learning_rate": 9.968945981029594e-07, + "logits/chosen": 0.12168808281421661, + "logits/rejected": 0.2733016908168793, + "logps/chosen": -3.3159337043762207, + "logps/rejected": -3.775067090988159, + "loss": 0.6729, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.3159337043762207, + "rewards/margins": 0.4591336250305176, + "rewards/rejected": -3.775067090988159, + "sft_loss": 3.2262046337127686, + "step": 740 + }, + { + "epoch": 0.39872888442883425, + "grad_norm": 11.117385711267735, + "learning_rate": 9.967188816117726e-07, + "logits/chosen": 0.17880137264728546, + "logits/rejected": 0.2560071051120758, + "logps/chosen": -3.331556797027588, + "logps/rejected": -3.8179938793182373, + "loss": 0.686, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.331556797027588, + "rewards/margins": 0.4864373803138733, + "rewards/rejected": -3.8179938793182373, + "sft_loss": 3.3403239250183105, + "step": 745 + }, + { + "epoch": 0.4014049172102358, + "grad_norm": 12.031202800814468, + "learning_rate": 9.965383459518179e-07, + "logits/chosen": 0.08572041988372803, + "logits/rejected": 0.23330307006835938, + "logps/chosen": -3.3558125495910645, + "logps/rejected": -3.828129291534424, + "loss": 0.6601, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.3558125495910645, + "rewards/margins": 0.4723171293735504, + "rewards/rejected": -3.828129291534424, + "sft_loss": 3.398761749267578, + "step": 750 + }, + { + "epoch": 0.4040809499916374, + "grad_norm": 13.472832754646447, + "learning_rate": 9.963529928746533e-07, + "logits/chosen": 0.15691408514976501, + "logits/rejected": 0.2690011262893677, + "logps/chosen": -3.389195203781128, + "logps/rejected": -3.7743306159973145, + "loss": 0.7096, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.389195203781128, + "rewards/margins": 0.38513538241386414, + "rewards/rejected": -3.7743306159973145, + "sft_loss": 3.376270294189453, + "step": 755 + }, + { + "epoch": 0.40675698277303896, + "grad_norm": 11.454275325872517, + "learning_rate": 9.961628241785746e-07, + "logits/chosen": 0.07620684802532196, + "logits/rejected": 0.15791398286819458, + "logps/chosen": -3.3654396533966064, + "logps/rejected": -3.7810680866241455, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.3654396533966064, + "rewards/margins": 0.4156281352043152, + "rewards/rejected": -3.7810680866241455, + "sft_loss": 3.4098827838897705, + "step": 760 + }, + { + "epoch": 0.40943301555444056, + "grad_norm": 11.654710079787488, + "learning_rate": 9.959678417085998e-07, + "logits/chosen": 0.0927480012178421, + "logits/rejected": 0.17140790820121765, + "logps/chosen": -3.2945895195007324, + "logps/rejected": -3.6890273094177246, + "loss": 0.6735, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.2945895195007324, + "rewards/margins": 0.3944377601146698, + "rewards/rejected": -3.6890273094177246, + "sft_loss": 3.3298301696777344, + "step": 765 + }, + { + "epoch": 0.4121090483358421, + "grad_norm": 13.564142616226638, + "learning_rate": 9.957680473564493e-07, + "logits/chosen": 0.20415227115154266, + "logits/rejected": 0.3103726804256439, + "logps/chosen": -3.2019429206848145, + "logps/rejected": -3.724005937576294, + "loss": 0.6451, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.2019429206848145, + "rewards/margins": 0.5220627784729004, + "rewards/rejected": -3.724005937576294, + "sft_loss": 3.1217987537384033, + "step": 770 + }, + { + "epoch": 0.41478508111724366, + "grad_norm": 8.788907244047499, + "learning_rate": 9.95563443060529e-07, + "logits/chosen": 0.05300183221697807, + "logits/rejected": 0.19994166493415833, + "logps/chosen": -3.236523389816284, + "logps/rejected": -3.5783581733703613, + "loss": 0.7294, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.236523389816284, + "rewards/margins": 0.3418344557285309, + "rewards/rejected": -3.5783581733703613, + "sft_loss": 3.2097830772399902, + "step": 775 + }, + { + "epoch": 0.41746111389864526, + "grad_norm": 13.11357065397142, + "learning_rate": 9.95354030805911e-07, + "logits/chosen": 0.0014542639255523682, + "logits/rejected": 0.12298593670129776, + "logps/chosen": -3.181215286254883, + "logps/rejected": -3.5314762592315674, + "loss": 0.6809, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.181215286254883, + "rewards/margins": 0.3502606749534607, + "rewards/rejected": -3.5314762592315674, + "sft_loss": 3.2443747520446777, + "step": 780 + }, + { + "epoch": 0.4201371466800468, + "grad_norm": 11.935322142601875, + "learning_rate": 9.951398126243133e-07, + "logits/chosen": 0.12124574184417725, + "logits/rejected": 0.22774991393089294, + "logps/chosen": -3.2135891914367676, + "logps/rejected": -3.64837384223938, + "loss": 0.6878, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.2135891914367676, + "rewards/margins": 0.43478527665138245, + "rewards/rejected": -3.64837384223938, + "sft_loss": 3.2128987312316895, + "step": 785 + }, + { + "epoch": 0.4228131794614484, + "grad_norm": 11.666360539461333, + "learning_rate": 9.94920790594082e-07, + "logits/chosen": 0.07049150764942169, + "logits/rejected": 0.1756301373243332, + "logps/chosen": -3.114420175552368, + "logps/rejected": -3.6367697715759277, + "loss": 0.628, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.114420175552368, + "rewards/margins": 0.5223496556282043, + "rewards/rejected": -3.6367697715759277, + "sft_loss": 3.0744128227233887, + "step": 790 + }, + { + "epoch": 0.42548921224284997, + "grad_norm": 11.228129348682602, + "learning_rate": 9.946969668401696e-07, + "logits/chosen": 0.05934765189886093, + "logits/rejected": 0.21110644936561584, + "logps/chosen": -3.3124725818634033, + "logps/rejected": -3.8809173107147217, + "loss": 0.642, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.3124725818634033, + "rewards/margins": 0.5684444904327393, + "rewards/rejected": -3.8809173107147217, + "sft_loss": 3.352839708328247, + "step": 795 + }, + { + "epoch": 0.4281652450242516, + "grad_norm": 12.42314167315067, + "learning_rate": 9.944683435341155e-07, + "logits/chosen": 0.04263642802834511, + "logits/rejected": 0.10475891828536987, + "logps/chosen": -3.370643138885498, + "logps/rejected": -3.90073823928833, + "loss": 0.6224, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.370643138885498, + "rewards/margins": 0.5300950407981873, + "rewards/rejected": -3.90073823928833, + "sft_loss": 3.352079391479492, + "step": 800 + }, + { + "epoch": 0.4281652450242516, + "eval_logits/chosen": 0.30916711688041687, + "eval_logits/rejected": 0.3857182264328003, + "eval_logps/chosen": -3.476654529571533, + "eval_logps/rejected": -4.0245490074157715, + "eval_loss": 0.628761887550354, + "eval_rewards/accuracies": 0.6869435906410217, + "eval_rewards/chosen": -3.476654529571533, + "eval_rewards/margins": 0.5478941798210144, + "eval_rewards/rejected": -4.0245490074157715, + "eval_runtime": 42.9456, + "eval_samples_per_second": 31.319, + "eval_sft_loss": 3.4512197971343994, + "eval_steps_per_second": 7.847, + "step": 800 + }, + { + "epoch": 0.4308412778056531, + "grad_norm": 15.285719140308519, + "learning_rate": 9.942349228940236e-07, + "logits/chosen": 0.029286552220582962, + "logits/rejected": 0.15155065059661865, + "logps/chosen": -3.489750623703003, + "logps/rejected": -4.151576042175293, + "loss": 0.5868, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.489750623703003, + "rewards/margins": 0.6618257164955139, + "rewards/rejected": -4.151576042175293, + "sft_loss": 3.473094940185547, + "step": 805 + }, + { + "epoch": 0.43351731058705467, + "grad_norm": 16.26548162705179, + "learning_rate": 9.939967071845424e-07, + "logits/chosen": 0.09252439439296722, + "logits/rejected": 0.16039565205574036, + "logps/chosen": -3.74686861038208, + "logps/rejected": -4.148434638977051, + "loss": 0.6825, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.74686861038208, + "rewards/margins": 0.40156635642051697, + "rewards/rejected": -4.148434638977051, + "sft_loss": 3.8165481090545654, + "step": 810 + }, + { + "epoch": 0.4361933433684563, + "grad_norm": 12.521564977020468, + "learning_rate": 9.937536987168413e-07, + "logits/chosen": 0.10305075347423553, + "logits/rejected": 0.20441201329231262, + "logps/chosen": -3.5569655895233154, + "logps/rejected": -4.217929840087891, + "loss": 0.6361, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.5569655895233154, + "rewards/margins": 0.6609641313552856, + "rewards/rejected": -4.217929840087891, + "sft_loss": 3.6337389945983887, + "step": 815 + }, + { + "epoch": 0.4388693761498578, + "grad_norm": 16.53507037989396, + "learning_rate": 9.935058998485896e-07, + "logits/chosen": 0.12354373931884766, + "logits/rejected": 0.16019897162914276, + "logps/chosen": -3.6725382804870605, + "logps/rejected": -4.262721061706543, + "loss": 0.6498, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.6725382804870605, + "rewards/margins": 0.5901829600334167, + "rewards/rejected": -4.262721061706543, + "sft_loss": 3.66874623298645, + "step": 820 + }, + { + "epoch": 0.44154540893125943, + "grad_norm": 20.293552669583075, + "learning_rate": 9.932533129839333e-07, + "logits/chosen": 0.10642262548208237, + "logits/rejected": 0.2096632421016693, + "logps/chosen": -3.6120095252990723, + "logps/rejected": -4.203782081604004, + "loss": 0.6397, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.6120095252990723, + "rewards/margins": 0.591772198677063, + "rewards/rejected": -4.203782081604004, + "sft_loss": 3.7229583263397217, + "step": 825 + }, + { + "epoch": 0.444221441712661, + "grad_norm": 14.193848823920673, + "learning_rate": 9.929959405734711e-07, + "logits/chosen": 0.12132594734430313, + "logits/rejected": 0.25849542021751404, + "logps/chosen": -3.58459210395813, + "logps/rejected": -4.052337646484375, + "loss": 0.6578, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.58459210395813, + "rewards/margins": 0.46774548292160034, + "rewards/rejected": -4.052337646484375, + "sft_loss": 3.5832104682922363, + "step": 830 + }, + { + "epoch": 0.44689747449406253, + "grad_norm": 17.16303931069294, + "learning_rate": 9.927337851142314e-07, + "logits/chosen": 0.08140738308429718, + "logits/rejected": 0.18814049661159515, + "logps/chosen": -3.5765693187713623, + "logps/rejected": -4.066720485687256, + "loss": 0.6598, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.5765693187713623, + "rewards/margins": 0.4901511073112488, + "rewards/rejected": -4.066720485687256, + "sft_loss": 3.7092864513397217, + "step": 835 + }, + { + "epoch": 0.44957350727546413, + "grad_norm": 14.101710559555091, + "learning_rate": 9.924668491496474e-07, + "logits/chosen": 0.025357728824019432, + "logits/rejected": 0.19356423616409302, + "logps/chosen": -3.731132984161377, + "logps/rejected": -4.178229808807373, + "loss": 0.7033, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.731132984161377, + "rewards/margins": 0.4470970034599304, + "rewards/rejected": -4.178229808807373, + "sft_loss": 3.84734845161438, + "step": 840 + }, + { + "epoch": 0.4522495400568657, + "grad_norm": 11.084723277082443, + "learning_rate": 9.92195135269533e-07, + "logits/chosen": 0.11301273107528687, + "logits/rejected": 0.16978143155574799, + "logps/chosen": -3.6617302894592285, + "logps/rejected": -4.03924036026001, + "loss": 0.7095, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.6617302894592285, + "rewards/margins": 0.3775102496147156, + "rewards/rejected": -4.03924036026001, + "sft_loss": 3.7795937061309814, + "step": 845 + }, + { + "epoch": 0.4549255728382673, + "grad_norm": 12.211834887067432, + "learning_rate": 9.919186461100574e-07, + "logits/chosen": 0.0779627338051796, + "logits/rejected": 0.14930522441864014, + "logps/chosen": -3.7439913749694824, + "logps/rejected": -4.1387457847595215, + "loss": 0.6733, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.7439913749694824, + "rewards/margins": 0.394754558801651, + "rewards/rejected": -4.1387457847595215, + "sft_loss": 3.746532917022705, + "step": 850 + }, + { + "epoch": 0.45760160561966884, + "grad_norm": 31.70471938835657, + "learning_rate": 9.9163738435372e-07, + "logits/chosen": 0.0450650155544281, + "logits/rejected": 0.16089345514774323, + "logps/chosen": -3.6246726512908936, + "logps/rejected": -4.134470462799072, + "loss": 0.6901, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.6246726512908936, + "rewards/margins": 0.5097973346710205, + "rewards/rejected": -4.134470462799072, + "sft_loss": 3.6062686443328857, + "step": 855 + }, + { + "epoch": 0.4602776384010704, + "grad_norm": 8.598859788111836, + "learning_rate": 9.913513527293234e-07, + "logits/chosen": 0.006855173502117395, + "logits/rejected": 0.14971241354942322, + "logps/chosen": -3.5897128582000732, + "logps/rejected": -4.210363388061523, + "loss": 0.6252, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.5897128582000732, + "rewards/margins": 0.6206499934196472, + "rewards/rejected": -4.210363388061523, + "sft_loss": 3.5778141021728516, + "step": 860 + }, + { + "epoch": 0.462953671182472, + "grad_norm": 37.65213597743668, + "learning_rate": 9.910605540119474e-07, + "logits/chosen": 0.08587872982025146, + "logits/rejected": 0.16899526119232178, + "logps/chosen": -3.4890365600585938, + "logps/rejected": -4.047313690185547, + "loss": 0.6675, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.4890365600585938, + "rewards/margins": 0.5582772493362427, + "rewards/rejected": -4.047313690185547, + "sft_loss": 3.3437390327453613, + "step": 865 + }, + { + "epoch": 0.46562970396387354, + "grad_norm": 11.543128601737179, + "learning_rate": 9.907649910229227e-07, + "logits/chosen": -0.027017872780561447, + "logits/rejected": 0.1710745394229889, + "logps/chosen": -3.376879930496216, + "logps/rejected": -3.9517147541046143, + "loss": 0.6164, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.376879930496216, + "rewards/margins": 0.5748350024223328, + "rewards/rejected": -3.9517147541046143, + "sft_loss": 3.3898417949676514, + "step": 870 + }, + { + "epoch": 0.46830573674527515, + "grad_norm": 15.187532658706145, + "learning_rate": 9.90464666629803e-07, + "logits/chosen": 0.05295708775520325, + "logits/rejected": 0.1010611280798912, + "logps/chosen": -3.346054792404175, + "logps/rejected": -3.7483839988708496, + "loss": 0.7253, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.346054792404175, + "rewards/margins": 0.40232938528060913, + "rewards/rejected": -3.7483839988708496, + "sft_loss": 3.3345093727111816, + "step": 875 + }, + { + "epoch": 0.4709817695266767, + "grad_norm": 8.845824171927392, + "learning_rate": 9.901595837463363e-07, + "logits/chosen": 0.007680124137550592, + "logits/rejected": 0.1661723554134369, + "logps/chosen": -3.394366502761841, + "logps/rejected": -3.9658546447753906, + "loss": 0.6212, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.394366502761841, + "rewards/margins": 0.5714882612228394, + "rewards/rejected": -3.9658546447753906, + "sft_loss": 3.316911220550537, + "step": 880 + }, + { + "epoch": 0.47365780230807825, + "grad_norm": 16.21241632674145, + "learning_rate": 9.898497453324384e-07, + "logits/chosen": -0.06916798651218414, + "logits/rejected": 0.007078188471496105, + "logps/chosen": -3.4882004261016846, + "logps/rejected": -4.050824165344238, + "loss": 0.605, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.4882004261016846, + "rewards/margins": 0.5626236200332642, + "rewards/rejected": -4.050824165344238, + "sft_loss": 3.5420258045196533, + "step": 885 + }, + { + "epoch": 0.47633383508947985, + "grad_norm": 12.373306544762638, + "learning_rate": 9.895351543941628e-07, + "logits/chosen": -0.14452961087226868, + "logits/rejected": -0.04519026353955269, + "logps/chosen": -3.458292007446289, + "logps/rejected": -3.9274654388427734, + "loss": 0.6591, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.458292007446289, + "rewards/margins": 0.46917352080345154, + "rewards/rejected": -3.9274654388427734, + "sft_loss": 3.5802388191223145, + "step": 890 + }, + { + "epoch": 0.4790098678708814, + "grad_norm": 13.304136856842742, + "learning_rate": 9.892158139836724e-07, + "logits/chosen": 0.01676345430314541, + "logits/rejected": 0.08211679756641388, + "logps/chosen": -3.4115653038024902, + "logps/rejected": -3.8524603843688965, + "loss": 0.6597, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.4115653038024902, + "rewards/margins": 0.4408953785896301, + "rewards/rejected": -3.8524603843688965, + "sft_loss": 3.483126401901245, + "step": 895 + }, + { + "epoch": 0.481685900652283, + "grad_norm": 16.428604159117846, + "learning_rate": 9.88891727199209e-07, + "logits/chosen": -0.055256836116313934, + "logits/rejected": 0.018062064424157143, + "logps/chosen": -3.339559555053711, + "logps/rejected": -3.853358030319214, + "loss": 0.6395, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.339559555053711, + "rewards/margins": 0.5137983560562134, + "rewards/rejected": -3.853358030319214, + "sft_loss": 3.397245407104492, + "step": 900 + }, + { + "epoch": 0.48436193343368455, + "grad_norm": 14.82089637749487, + "learning_rate": 9.885628971850641e-07, + "logits/chosen": -0.007930848747491837, + "logits/rejected": 0.14382946491241455, + "logps/chosen": -3.4519202709198, + "logps/rejected": -4.07681941986084, + "loss": 0.6226, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.4519202709198, + "rewards/margins": 0.62489914894104, + "rewards/rejected": -4.07681941986084, + "sft_loss": 3.538830280303955, + "step": 905 + }, + { + "epoch": 0.48703796621508616, + "grad_norm": 13.842999888489807, + "learning_rate": 9.882293271315481e-07, + "logits/chosen": -0.04674383997917175, + "logits/rejected": 0.027312126010656357, + "logps/chosen": -3.600853681564331, + "logps/rejected": -4.066799640655518, + "loss": 0.6763, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.600853681564331, + "rewards/margins": 0.4659459590911865, + "rewards/rejected": -4.066799640655518, + "sft_loss": 3.625293731689453, + "step": 910 + }, + { + "epoch": 0.4897139989964877, + "grad_norm": 13.496629124081092, + "learning_rate": 9.878910202749589e-07, + "logits/chosen": -0.03229089826345444, + "logits/rejected": 0.11424863338470459, + "logps/chosen": -3.6046853065490723, + "logps/rejected": -4.177473068237305, + "loss": 0.6267, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.6046853065490723, + "rewards/margins": 0.572787880897522, + "rewards/rejected": -4.177473068237305, + "sft_loss": 3.654684066772461, + "step": 915 + }, + { + "epoch": 0.49239003177788926, + "grad_norm": 14.754415801413213, + "learning_rate": 9.875479798975512e-07, + "logits/chosen": 0.013253210112452507, + "logits/rejected": 0.14170649647712708, + "logps/chosen": -3.6022396087646484, + "logps/rejected": -4.2444329261779785, + "loss": 0.6264, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.6022396087646484, + "rewards/margins": 0.6421931982040405, + "rewards/rejected": -4.2444329261779785, + "sft_loss": 3.753528594970703, + "step": 920 + }, + { + "epoch": 0.49506606455929086, + "grad_norm": 16.88476581071326, + "learning_rate": 9.87200209327504e-07, + "logits/chosen": -0.02223074436187744, + "logits/rejected": 0.0964256003499031, + "logps/chosen": -3.945333957672119, + "logps/rejected": -4.436131954193115, + "loss": 0.6546, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.945333957672119, + "rewards/margins": 0.49079805612564087, + "rewards/rejected": -4.436131954193115, + "sft_loss": 3.862926483154297, + "step": 925 + }, + { + "epoch": 0.4977420973406924, + "grad_norm": 20.326567899701928, + "learning_rate": 9.868477119388894e-07, + "logits/chosen": -0.028742101043462753, + "logits/rejected": 0.03769497200846672, + "logps/chosen": -3.7966148853302, + "logps/rejected": -4.435694694519043, + "loss": 0.6393, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.7966148853302, + "rewards/margins": 0.6390798687934875, + "rewards/rejected": -4.435694694519043, + "sft_loss": 3.8229899406433105, + "step": 930 + }, + { + "epoch": 0.500418130122094, + "grad_norm": 13.99872257148215, + "learning_rate": 9.864904911516383e-07, + "logits/chosen": 0.010284209623932838, + "logits/rejected": 0.044766783714294434, + "logps/chosen": -3.979724884033203, + "logps/rejected": -4.445012092590332, + "loss": 0.6638, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.979724884033203, + "rewards/margins": 0.4652870297431946, + "rewards/rejected": -4.445012092590332, + "sft_loss": 4.037301063537598, + "step": 935 + }, + { + "epoch": 0.5030941629034956, + "grad_norm": 16.86367004869719, + "learning_rate": 9.861285504315084e-07, + "logits/chosen": -0.012009387835860252, + "logits/rejected": 0.061289478093385696, + "logps/chosen": -3.8050410747528076, + "logps/rejected": -4.29853630065918, + "loss": 0.6353, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.8050410747528076, + "rewards/margins": 0.49349552392959595, + "rewards/rejected": -4.29853630065918, + "sft_loss": 3.802018642425537, + "step": 940 + }, + { + "epoch": 0.5057701956848971, + "grad_norm": 13.932891382776969, + "learning_rate": 9.857618932900502e-07, + "logits/chosen": 0.0036790785379707813, + "logits/rejected": 0.10150575637817383, + "logps/chosen": -3.826122283935547, + "logps/rejected": -4.335418701171875, + "loss": 0.6438, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.826122283935547, + "rewards/margins": 0.5092965364456177, + "rewards/rejected": -4.335418701171875, + "sft_loss": 3.819323778152466, + "step": 945 + }, + { + "epoch": 0.5084462284662987, + "grad_norm": 13.970445556471764, + "learning_rate": 9.853905232845727e-07, + "logits/chosen": 0.023456787690520287, + "logits/rejected": 0.14656312763690948, + "logps/chosen": -3.714625835418701, + "logps/rejected": -4.2214674949646, + "loss": 0.6547, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.714625835418701, + "rewards/margins": 0.5068413615226746, + "rewards/rejected": -4.2214674949646, + "sft_loss": 3.676867961883545, + "step": 950 + }, + { + "epoch": 0.5111222612477003, + "grad_norm": 13.66631939092454, + "learning_rate": 9.850144440181095e-07, + "logits/chosen": 0.08379775285720825, + "logits/rejected": 0.22799447178840637, + "logps/chosen": -3.87310791015625, + "logps/rejected": -4.435972690582275, + "loss": 0.6092, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.87310791015625, + "rewards/margins": 0.5628647804260254, + "rewards/rejected": -4.435972690582275, + "sft_loss": 3.9427828788757324, + "step": 955 + }, + { + "epoch": 0.5137982940291018, + "grad_norm": 16.58644478757467, + "learning_rate": 9.846336591393832e-07, + "logits/chosen": 0.08455801755189896, + "logits/rejected": 0.18910866975784302, + "logps/chosen": -3.9857611656188965, + "logps/rejected": -4.546139717102051, + "loss": 0.6454, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.9857611656188965, + "rewards/margins": 0.5603781342506409, + "rewards/rejected": -4.546139717102051, + "sft_loss": 4.061606407165527, + "step": 960 + }, + { + "epoch": 0.5164743268105034, + "grad_norm": 13.756191723689144, + "learning_rate": 9.842481723427704e-07, + "logits/chosen": 0.09445828199386597, + "logits/rejected": 0.11051924526691437, + "logps/chosen": -4.213374137878418, + "logps/rejected": -4.733959197998047, + "loss": 0.694, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.213374137878418, + "rewards/margins": 0.520584762096405, + "rewards/rejected": -4.733959197998047, + "sft_loss": 4.2486796379089355, + "step": 965 + }, + { + "epoch": 0.519150359591905, + "grad_norm": 13.816243024663285, + "learning_rate": 9.838579873682658e-07, + "logits/chosen": 0.09202511608600616, + "logits/rejected": 0.09299103915691376, + "logps/chosen": -3.9934258460998535, + "logps/rejected": -4.433906078338623, + "loss": 0.6682, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9934258460998535, + "rewards/margins": 0.4404802918434143, + "rewards/rejected": -4.433906078338623, + "sft_loss": 4.066201686859131, + "step": 970 + }, + { + "epoch": 0.5218263923733065, + "grad_norm": 10.847829039403418, + "learning_rate": 9.834631080014457e-07, + "logits/chosen": 0.015875589102506638, + "logits/rejected": 0.1718440055847168, + "logps/chosen": -3.8322250843048096, + "logps/rejected": -4.449212551116943, + "loss": 0.5865, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.8322250843048096, + "rewards/margins": 0.6169876456260681, + "rewards/rejected": -4.449212551116943, + "sft_loss": 3.9336464405059814, + "step": 975 + }, + { + "epoch": 0.5245024251547081, + "grad_norm": 20.731197033134084, + "learning_rate": 9.830635380734312e-07, + "logits/chosen": 0.017926190048456192, + "logits/rejected": 0.14910198748111725, + "logps/chosen": -3.856818675994873, + "logps/rejected": -4.3431854248046875, + "loss": 0.6517, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.856818675994873, + "rewards/margins": 0.4863665699958801, + "rewards/rejected": -4.3431854248046875, + "sft_loss": 3.9083619117736816, + "step": 980 + }, + { + "epoch": 0.5271784579361097, + "grad_norm": 12.502144291648296, + "learning_rate": 9.826592814608517e-07, + "logits/chosen": 0.09634266048669815, + "logits/rejected": 0.2468501776456833, + "logps/chosen": -3.7185311317443848, + "logps/rejected": -4.257325172424316, + "loss": 0.6265, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.7185311317443848, + "rewards/margins": 0.5387943983078003, + "rewards/rejected": -4.257325172424316, + "sft_loss": 3.7992255687713623, + "step": 985 + }, + { + "epoch": 0.5298544907175113, + "grad_norm": 10.0780790134523, + "learning_rate": 9.822503420858067e-07, + "logits/chosen": 0.17207573354244232, + "logits/rejected": 0.18692907691001892, + "logps/chosen": -3.6508517265319824, + "logps/rejected": -4.257552146911621, + "loss": 0.6139, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.6508517265319824, + "rewards/margins": 0.6067003011703491, + "rewards/rejected": -4.257552146911621, + "sft_loss": 3.8181662559509277, + "step": 990 + }, + { + "epoch": 0.5325305234989128, + "grad_norm": 13.754268295572302, + "learning_rate": 9.818367239158277e-07, + "logits/chosen": 0.15358421206474304, + "logits/rejected": 0.20071625709533691, + "logps/chosen": -3.7733490467071533, + "logps/rejected": -4.283802032470703, + "loss": 0.6637, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.7733490467071533, + "rewards/margins": 0.5104531049728394, + "rewards/rejected": -4.283802032470703, + "sft_loss": 3.8505585193634033, + "step": 995 + }, + { + "epoch": 0.5352065562803144, + "grad_norm": 14.788821967814014, + "learning_rate": 9.8141843096384e-07, + "logits/chosen": 0.1338832527399063, + "logits/rejected": 0.23015956580638885, + "logps/chosen": -4.02156925201416, + "logps/rejected": -4.630791664123535, + "loss": 0.6083, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.02156925201416, + "rewards/margins": 0.6092226505279541, + "rewards/rejected": -4.630791664123535, + "sft_loss": 4.029107093811035, + "step": 1000 + }, + { + "epoch": 0.537882589061716, + "grad_norm": 19.007935191666654, + "learning_rate": 9.809954672881237e-07, + "logits/chosen": 0.12174554914236069, + "logits/rejected": 0.23447482287883759, + "logps/chosen": -4.086304664611816, + "logps/rejected": -4.584083557128906, + "loss": 0.6822, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.086304664611816, + "rewards/margins": 0.49777883291244507, + "rewards/rejected": -4.584083557128906, + "sft_loss": 4.131983757019043, + "step": 1005 + }, + { + "epoch": 0.5405586218431175, + "grad_norm": 12.754319708012039, + "learning_rate": 9.80567836992274e-07, + "logits/chosen": 0.12806352972984314, + "logits/rejected": 0.26299232244491577, + "logps/chosen": -3.8444855213165283, + "logps/rejected": -4.5119428634643555, + "loss": 0.621, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.8444855213165283, + "rewards/margins": 0.6674574017524719, + "rewards/rejected": -4.5119428634643555, + "sft_loss": 3.909728527069092, + "step": 1010 + }, + { + "epoch": 0.5432346546245191, + "grad_norm": 11.84982709117696, + "learning_rate": 9.801355442251625e-07, + "logits/chosen": 0.1154123991727829, + "logits/rejected": 0.22723989188671112, + "logps/chosen": -3.721278429031372, + "logps/rejected": -4.259707927703857, + "loss": 0.6454, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.721278429031372, + "rewards/margins": 0.5384299159049988, + "rewards/rejected": -4.259707927703857, + "sft_loss": 3.7684149742126465, + "step": 1015 + }, + { + "epoch": 0.5459106874059207, + "grad_norm": 16.10253215157953, + "learning_rate": 9.796985931808949e-07, + "logits/chosen": 0.09676255285739899, + "logits/rejected": 0.21589374542236328, + "logps/chosen": -3.759204864501953, + "logps/rejected": -4.318037986755371, + "loss": 0.622, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.759204864501953, + "rewards/margins": 0.5588343739509583, + "rewards/rejected": -4.318037986755371, + "sft_loss": 3.8170742988586426, + "step": 1020 + }, + { + "epoch": 0.5485867201873222, + "grad_norm": 16.177105588352344, + "learning_rate": 9.792569880987724e-07, + "logits/chosen": 0.04962032288312912, + "logits/rejected": 0.1333126276731491, + "logps/chosen": -3.7688231468200684, + "logps/rejected": -4.422165393829346, + "loss": 0.6031, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.7688231468200684, + "rewards/margins": 0.6533424258232117, + "rewards/rejected": -4.422165393829346, + "sft_loss": 3.7961933612823486, + "step": 1025 + }, + { + "epoch": 0.5512627529687238, + "grad_norm": 20.099779652750147, + "learning_rate": 9.788107332632493e-07, + "logits/chosen": 0.10792167484760284, + "logits/rejected": 0.17767611145973206, + "logps/chosen": -3.7205605506896973, + "logps/rejected": -4.1855549812316895, + "loss": 0.6964, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.7205605506896973, + "rewards/margins": 0.464994341135025, + "rewards/rejected": -4.1855549812316895, + "sft_loss": 3.749657392501831, + "step": 1030 + }, + { + "epoch": 0.5539387857501255, + "grad_norm": 11.803497746097289, + "learning_rate": 9.783598330038924e-07, + "logits/chosen": 0.08459456264972687, + "logits/rejected": 0.16978034377098083, + "logps/chosen": -3.7540791034698486, + "logps/rejected": -4.233614444732666, + "loss": 0.6499, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.7540791034698486, + "rewards/margins": 0.47953516244888306, + "rewards/rejected": -4.233614444732666, + "sft_loss": 3.757070541381836, + "step": 1035 + }, + { + "epoch": 0.5566148185315271, + "grad_norm": 16.181353128715678, + "learning_rate": 9.779042916953376e-07, + "logits/chosen": 0.088498555123806, + "logits/rejected": 0.21762952208518982, + "logps/chosen": -3.5868847370147705, + "logps/rejected": -4.343627452850342, + "loss": 0.5831, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.5868847370147705, + "rewards/margins": 0.7567430734634399, + "rewards/rejected": -4.343627452850342, + "sft_loss": 3.650301456451416, + "step": 1040 + }, + { + "epoch": 0.5592908513129285, + "grad_norm": 12.80569022038835, + "learning_rate": 9.774441137572487e-07, + "logits/chosen": 0.022781116887927055, + "logits/rejected": 0.12793084979057312, + "logps/chosen": -3.790767192840576, + "logps/rejected": -4.484066486358643, + "loss": 0.5885, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.790767192840576, + "rewards/margins": 0.6932995915412903, + "rewards/rejected": -4.484066486358643, + "sft_loss": 3.834791660308838, + "step": 1045 + }, + { + "epoch": 0.5619668840943302, + "grad_norm": 16.985779571284176, + "learning_rate": 9.76979303654274e-07, + "logits/chosen": 0.011258067563176155, + "logits/rejected": 0.09075259417295456, + "logps/chosen": -4.021671295166016, + "logps/rejected": -4.701707363128662, + "loss": 0.6082, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.021671295166016, + "rewards/margins": 0.6800357103347778, + "rewards/rejected": -4.701707363128662, + "sft_loss": 4.067013740539551, + "step": 1050 + }, + { + "epoch": 0.5646429168757318, + "grad_norm": 17.764033387407114, + "learning_rate": 9.765098658960035e-07, + "logits/chosen": 0.05323363095521927, + "logits/rejected": 0.10457701981067657, + "logps/chosen": -4.006975173950195, + "logps/rejected": -4.602959156036377, + "loss": 0.6252, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.006975173950195, + "rewards/margins": 0.5959838628768921, + "rewards/rejected": -4.602959156036377, + "sft_loss": 4.014351844787598, + "step": 1055 + }, + { + "epoch": 0.5673189496571333, + "grad_norm": 21.003003227643912, + "learning_rate": 9.76035805036924e-07, + "logits/chosen": 0.15413573384284973, + "logits/rejected": 0.2889839708805084, + "logps/chosen": -4.063313961029053, + "logps/rejected": -4.599331855773926, + "loss": 0.6403, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.063313961029053, + "rewards/margins": 0.5360177755355835, + "rewards/rejected": -4.599331855773926, + "sft_loss": 4.007753849029541, + "step": 1060 + }, + { + "epoch": 0.5699949824385349, + "grad_norm": 14.649343042640266, + "learning_rate": 9.755571256763764e-07, + "logits/chosen": 0.16708219051361084, + "logits/rejected": 0.27574026584625244, + "logps/chosen": -3.8334312438964844, + "logps/rejected": -4.484501838684082, + "loss": 0.6036, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.8334312438964844, + "rewards/margins": 0.6510700583457947, + "rewards/rejected": -4.484501838684082, + "sft_loss": 3.9353995323181152, + "step": 1065 + }, + { + "epoch": 0.5726710152199365, + "grad_norm": 11.955416009251634, + "learning_rate": 9.750738324585097e-07, + "logits/chosen": 0.06077583506703377, + "logits/rejected": 0.24156561493873596, + "logps/chosen": -3.7830872535705566, + "logps/rejected": -4.326009273529053, + "loss": 0.6281, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.7830872535705566, + "rewards/margins": 0.5429221391677856, + "rewards/rejected": -4.326009273529053, + "sft_loss": 3.7620186805725098, + "step": 1070 + }, + { + "epoch": 0.5753470480013381, + "grad_norm": 9.182424894003129, + "learning_rate": 9.74585930072237e-07, + "logits/chosen": 0.13045711815357208, + "logits/rejected": 0.22782549262046814, + "logps/chosen": -3.610170841217041, + "logps/rejected": -4.254181861877441, + "loss": 0.6351, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.610170841217041, + "rewards/margins": 0.6440110206604004, + "rewards/rejected": -4.254181861877441, + "sft_loss": 3.6286041736602783, + "step": 1075 + }, + { + "epoch": 0.5780230807827396, + "grad_norm": 12.76745223701187, + "learning_rate": 9.740934232511892e-07, + "logits/chosen": 0.013227030634880066, + "logits/rejected": 0.09067533910274506, + "logps/chosen": -3.8230583667755127, + "logps/rejected": -4.390995025634766, + "loss": 0.6306, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.8230583667755127, + "rewards/margins": 0.5679364204406738, + "rewards/rejected": -4.390995025634766, + "sft_loss": 3.8801512718200684, + "step": 1080 + }, + { + "epoch": 0.5806991135641412, + "grad_norm": 14.137757785460963, + "learning_rate": 9.735963167736698e-07, + "logits/chosen": 0.10038147866725922, + "logits/rejected": 0.23114939033985138, + "logps/chosen": -3.847576856613159, + "logps/rejected": -4.33864164352417, + "loss": 0.6556, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.847576856613159, + "rewards/margins": 0.4910648465156555, + "rewards/rejected": -4.33864164352417, + "sft_loss": 3.834911823272705, + "step": 1085 + }, + { + "epoch": 0.5833751463455428, + "grad_norm": 13.201256161784015, + "learning_rate": 9.730946154626078e-07, + "logits/chosen": 0.13253948092460632, + "logits/rejected": 0.19316302239894867, + "logps/chosen": -3.913498640060425, + "logps/rejected": -4.428316116333008, + "loss": 0.6718, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.913498640060425, + "rewards/margins": 0.5148173570632935, + "rewards/rejected": -4.428316116333008, + "sft_loss": 3.9783337116241455, + "step": 1090 + }, + { + "epoch": 0.5860511791269443, + "grad_norm": 13.821285355110062, + "learning_rate": 9.725883241855117e-07, + "logits/chosen": 0.04290536046028137, + "logits/rejected": 0.16766893863677979, + "logps/chosen": -4.017451286315918, + "logps/rejected": -4.656888008117676, + "loss": 0.6224, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.017451286315918, + "rewards/margins": 0.6394359469413757, + "rewards/rejected": -4.656888008117676, + "sft_loss": 4.158010482788086, + "step": 1095 + }, + { + "epoch": 0.5887272119083459, + "grad_norm": 13.869053395478524, + "learning_rate": 9.720774478544218e-07, + "logits/chosen": 0.10910507291555405, + "logits/rejected": 0.21823792159557343, + "logps/chosen": -3.9000885486602783, + "logps/rejected": -4.603386878967285, + "loss": 0.6111, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.9000885486602783, + "rewards/margins": 0.703298807144165, + "rewards/rejected": -4.603386878967285, + "sft_loss": 3.998074769973755, + "step": 1100 + }, + { + "epoch": 0.5914032446897475, + "grad_norm": 14.52641753953559, + "learning_rate": 9.715619914258624e-07, + "logits/chosen": 0.04893391951918602, + "logits/rejected": 0.1237044706940651, + "logps/chosen": -4.041838645935059, + "logps/rejected": -4.510806083679199, + "loss": 0.6814, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.041838645935059, + "rewards/margins": 0.4689674377441406, + "rewards/rejected": -4.510806083679199, + "sft_loss": 3.9709079265594482, + "step": 1105 + }, + { + "epoch": 0.594079277471149, + "grad_norm": 18.355595445420242, + "learning_rate": 9.710419599007937e-07, + "logits/chosen": 0.07414282858371735, + "logits/rejected": 0.19301927089691162, + "logps/chosen": -3.9801788330078125, + "logps/rejected": -4.495860576629639, + "loss": 0.6375, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.9801788330078125, + "rewards/margins": 0.5156816244125366, + "rewards/rejected": -4.495860576629639, + "sft_loss": 4.034372329711914, + "step": 1110 + }, + { + "epoch": 0.5967553102525506, + "grad_norm": 22.088214219073393, + "learning_rate": 9.705173583245643e-07, + "logits/chosen": 0.0930425375699997, + "logits/rejected": 0.22560659050941467, + "logps/chosen": -3.8769278526306152, + "logps/rejected": -4.395509243011475, + "loss": 0.6778, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.8769278526306152, + "rewards/margins": 0.518581211566925, + "rewards/rejected": -4.395509243011475, + "sft_loss": 3.9343044757843018, + "step": 1115 + }, + { + "epoch": 0.5994313430339522, + "grad_norm": 11.505160450043082, + "learning_rate": 9.699881917868609e-07, + "logits/chosen": 0.007545255124568939, + "logits/rejected": 0.0969097688794136, + "logps/chosen": -3.6723945140838623, + "logps/rejected": -4.290095806121826, + "loss": 0.6219, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.6723945140838623, + "rewards/margins": 0.6177011132240295, + "rewards/rejected": -4.290095806121826, + "sft_loss": 3.7624449729919434, + "step": 1120 + }, + { + "epoch": 0.6021073758153538, + "grad_norm": 14.851585818047047, + "learning_rate": 9.694544654216594e-07, + "logits/chosen": -0.007628323044627905, + "logits/rejected": 0.13381069898605347, + "logps/chosen": -3.7548370361328125, + "logps/rejected": -4.348697185516357, + "loss": 0.6055, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.7548370361328125, + "rewards/margins": 0.5938605666160583, + "rewards/rejected": -4.348697185516357, + "sft_loss": 3.7085232734680176, + "step": 1125 + }, + { + "epoch": 0.6047834085967553, + "grad_norm": 30.9003565366131, + "learning_rate": 9.689161844071755e-07, + "logits/chosen": 0.12496396154165268, + "logits/rejected": 0.18439385294914246, + "logps/chosen": -3.555870532989502, + "logps/rejected": -4.09328031539917, + "loss": 0.6437, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.555870532989502, + "rewards/margins": 0.5374095439910889, + "rewards/rejected": -4.09328031539917, + "sft_loss": 3.5397467613220215, + "step": 1130 + }, + { + "epoch": 0.6074594413781569, + "grad_norm": 13.645012557766098, + "learning_rate": 9.683733539658138e-07, + "logits/chosen": 0.030058467760682106, + "logits/rejected": 0.16403785347938538, + "logps/chosen": -3.690871000289917, + "logps/rejected": -4.289731979370117, + "loss": 0.6233, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.690871000289917, + "rewards/margins": 0.5988608598709106, + "rewards/rejected": -4.289731979370117, + "sft_loss": 3.603515148162842, + "step": 1135 + }, + { + "epoch": 0.6101354741595585, + "grad_norm": 14.376364101322325, + "learning_rate": 9.678259793641178e-07, + "logits/chosen": 0.06398359686136246, + "logits/rejected": 0.09742051362991333, + "logps/chosen": -3.7032439708709717, + "logps/rejected": -4.105428218841553, + "loss": 0.6627, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.7032439708709717, + "rewards/margins": 0.40218430757522583, + "rewards/rejected": -4.105428218841553, + "sft_loss": 3.766770124435425, + "step": 1140 + }, + { + "epoch": 0.61281150694096, + "grad_norm": 13.032289343771657, + "learning_rate": 9.672740659127183e-07, + "logits/chosen": -0.04188089817762375, + "logits/rejected": 0.06649746745824814, + "logps/chosen": -3.700268507003784, + "logps/rejected": -4.40798807144165, + "loss": 0.6147, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.700268507003784, + "rewards/margins": 0.7077193260192871, + "rewards/rejected": -4.40798807144165, + "sft_loss": 3.8044610023498535, + "step": 1145 + }, + { + "epoch": 0.6154875397223616, + "grad_norm": 15.503107221557544, + "learning_rate": 9.667176189662818e-07, + "logits/chosen": -0.032415203750133514, + "logits/rejected": 0.07029138505458832, + "logps/chosen": -3.8163063526153564, + "logps/rejected": -4.493452548980713, + "loss": 0.5884, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.8163063526153564, + "rewards/margins": 0.6771461963653564, + "rewards/rejected": -4.493452548980713, + "sft_loss": 3.8472914695739746, + "step": 1150 + }, + { + "epoch": 0.6181635725037632, + "grad_norm": 10.769101243354882, + "learning_rate": 9.661566439234592e-07, + "logits/chosen": 0.024549299851059914, + "logits/rejected": 0.11254332214593887, + "logps/chosen": -3.799016237258911, + "logps/rejected": -4.320359706878662, + "loss": 0.64, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.799016237258911, + "rewards/margins": 0.5213435888290405, + "rewards/rejected": -4.320359706878662, + "sft_loss": 3.937115430831909, + "step": 1155 + }, + { + "epoch": 0.6208396052851648, + "grad_norm": 12.10636480168215, + "learning_rate": 9.655911462268327e-07, + "logits/chosen": 0.07037197053432465, + "logits/rejected": 0.15609976649284363, + "logps/chosen": -3.7353408336639404, + "logps/rejected": -4.422998905181885, + "loss": 0.5857, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.7353408336639404, + "rewards/margins": 0.6876574158668518, + "rewards/rejected": -4.422998905181885, + "sft_loss": 3.93669056892395, + "step": 1160 + }, + { + "epoch": 0.6235156380665663, + "grad_norm": 13.191549758318944, + "learning_rate": 9.650211313628636e-07, + "logits/chosen": 0.0054580033756792545, + "logits/rejected": 0.08709286153316498, + "logps/chosen": -3.783989429473877, + "logps/rejected": -4.179694652557373, + "loss": 0.6961, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.783989429473877, + "rewards/margins": 0.3957056999206543, + "rewards/rejected": -4.179694652557373, + "sft_loss": 3.9875073432922363, + "step": 1165 + }, + { + "epoch": 0.6261916708479679, + "grad_norm": 13.307987308972477, + "learning_rate": 9.644466048618386e-07, + "logits/chosen": 0.01374664343893528, + "logits/rejected": 0.1347121298313141, + "logps/chosen": -3.83042573928833, + "logps/rejected": -4.345409870147705, + "loss": 0.6669, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.83042573928833, + "rewards/margins": 0.5149842500686646, + "rewards/rejected": -4.345409870147705, + "sft_loss": 3.802509307861328, + "step": 1170 + }, + { + "epoch": 0.6288677036293695, + "grad_norm": 12.39337562882268, + "learning_rate": 9.63867572297816e-07, + "logits/chosen": 0.009365784004330635, + "logits/rejected": 0.1598575860261917, + "logps/chosen": -3.5629711151123047, + "logps/rejected": -4.108077049255371, + "loss": 0.6313, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.5629711151123047, + "rewards/margins": 0.5451056957244873, + "rewards/rejected": -4.108077049255371, + "sft_loss": 3.6623432636260986, + "step": 1175 + }, + { + "epoch": 0.631543736410771, + "grad_norm": 14.120963984377784, + "learning_rate": 9.632840392885727e-07, + "logits/chosen": -0.0034040785394608974, + "logits/rejected": 0.11227305233478546, + "logps/chosen": -3.8640055656433105, + "logps/rejected": -4.4511613845825195, + "loss": 0.6424, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.8640055656433105, + "rewards/margins": 0.5871554613113403, + "rewards/rejected": -4.4511613845825195, + "sft_loss": 3.903380870819092, + "step": 1180 + }, + { + "epoch": 0.6342197691921726, + "grad_norm": 11.539496715161526, + "learning_rate": 9.626960114955483e-07, + "logits/chosen": 0.048967987298965454, + "logits/rejected": 0.16037827730178833, + "logps/chosen": -3.79237699508667, + "logps/rejected": -4.49746036529541, + "loss": 0.5889, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.79237699508667, + "rewards/margins": 0.7050831913948059, + "rewards/rejected": -4.49746036529541, + "sft_loss": 3.8400700092315674, + "step": 1185 + }, + { + "epoch": 0.6368958019735742, + "grad_norm": 16.150703871819903, + "learning_rate": 9.621034946237909e-07, + "logits/chosen": -0.017244238406419754, + "logits/rejected": 0.08983320742845535, + "logps/chosen": -3.9456849098205566, + "logps/rejected": -4.6313605308532715, + "loss": 0.5918, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.9456849098205566, + "rewards/margins": 0.6856756210327148, + "rewards/rejected": -4.6313605308532715, + "sft_loss": 4.094405651092529, + "step": 1190 + }, + { + "epoch": 0.6395718347549757, + "grad_norm": 12.931679999737772, + "learning_rate": 9.615064944219021e-07, + "logits/chosen": 0.038251686841249466, + "logits/rejected": 0.12875883281230927, + "logps/chosen": -3.786808729171753, + "logps/rejected": -4.453344821929932, + "loss": 0.5906, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.786808729171753, + "rewards/margins": 0.6665353178977966, + "rewards/rejected": -4.453344821929932, + "sft_loss": 3.9314029216766357, + "step": 1195 + }, + { + "epoch": 0.6422478675363773, + "grad_norm": 17.215641703722756, + "learning_rate": 9.609050166819803e-07, + "logits/chosen": -0.005074346903711557, + "logits/rejected": 0.059091150760650635, + "logps/chosen": -3.9514594078063965, + "logps/rejected": -4.556410789489746, + "loss": 0.6292, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.9514594078063965, + "rewards/margins": 0.6049517393112183, + "rewards/rejected": -4.556410789489746, + "sft_loss": 3.9630367755889893, + "step": 1200 + }, + { + "epoch": 0.6422478675363773, + "eval_logits/chosen": 0.2461836338043213, + "eval_logits/rejected": 0.3271760046482086, + "eval_logps/chosen": -3.8912527561187744, + "eval_logps/rejected": -4.595021724700928, + "eval_loss": 0.5942531824111938, + "eval_rewards/accuracies": 0.721068263053894, + "eval_rewards/chosen": -3.8912527561187744, + "eval_rewards/margins": 0.7037691473960876, + "eval_rewards/rejected": -4.595021724700928, + "eval_runtime": 42.9784, + "eval_samples_per_second": 31.295, + "eval_sft_loss": 3.991314649581909, + "eval_steps_per_second": 7.841, + "step": 1200 + }, + { + "epoch": 0.6449239003177789, + "grad_norm": 16.845520637315555, + "learning_rate": 9.602990672395653e-07, + "logits/chosen": -0.09139742702245712, + "logits/rejected": 0.06181849166750908, + "logps/chosen": -3.8685569763183594, + "logps/rejected": -4.536148548126221, + "loss": 0.5905, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.8685569763183594, + "rewards/margins": 0.6675916910171509, + "rewards/rejected": -4.536148548126221, + "sft_loss": 3.933539628982544, + "step": 1205 + }, + { + "epoch": 0.6475999330991805, + "grad_norm": 13.30971595013577, + "learning_rate": 9.59688651973581e-07, + "logits/chosen": 0.0002502262650523335, + "logits/rejected": 0.15695317089557648, + "logps/chosen": -3.790062427520752, + "logps/rejected": -4.361540794372559, + "loss": 0.6162, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.790062427520752, + "rewards/margins": 0.5714787244796753, + "rewards/rejected": -4.361540794372559, + "sft_loss": 3.882965087890625, + "step": 1210 + }, + { + "epoch": 0.650275965880582, + "grad_norm": 13.25167322757119, + "learning_rate": 9.590737768062792e-07, + "logits/chosen": -0.046138983219861984, + "logits/rejected": 0.041084617376327515, + "logps/chosen": -3.9596238136291504, + "logps/rejected": -4.504553318023682, + "loss": 0.6367, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.9596238136291504, + "rewards/margins": 0.5449296236038208, + "rewards/rejected": -4.504553318023682, + "sft_loss": 3.9218735694885254, + "step": 1215 + }, + { + "epoch": 0.6529519986619836, + "grad_norm": 12.924722270033136, + "learning_rate": 9.584544477031816e-07, + "logits/chosen": 0.08105846494436264, + "logits/rejected": 0.16998329758644104, + "logps/chosen": -3.8064639568328857, + "logps/rejected": -4.354712963104248, + "loss": 0.6409, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.8064639568328857, + "rewards/margins": 0.5482488870620728, + "rewards/rejected": -4.354712963104248, + "sft_loss": 3.770095109939575, + "step": 1220 + }, + { + "epoch": 0.6556280314433852, + "grad_norm": 15.684642791031575, + "learning_rate": 9.578306706730215e-07, + "logits/chosen": -0.07153487950563431, + "logits/rejected": 0.09660569578409195, + "logps/chosen": -3.9475014209747314, + "logps/rejected": -4.414361000061035, + "loss": 0.6879, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.9475014209747314, + "rewards/margins": 0.4668591618537903, + "rewards/rejected": -4.414361000061035, + "sft_loss": 3.897876024246216, + "step": 1225 + }, + { + "epoch": 0.6583040642247867, + "grad_norm": 15.03767424806433, + "learning_rate": 9.572024517676865e-07, + "logits/chosen": -0.0005132406949996948, + "logits/rejected": 0.07582568377256393, + "logps/chosen": -3.9208877086639404, + "logps/rejected": -4.437623500823975, + "loss": 0.6546, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9208877086639404, + "rewards/margins": 0.5167354941368103, + "rewards/rejected": -4.437623500823975, + "sft_loss": 3.898923873901367, + "step": 1230 + }, + { + "epoch": 0.6609800970061883, + "grad_norm": 13.714925325727796, + "learning_rate": 9.565697970821593e-07, + "logits/chosen": 0.01927166059613228, + "logits/rejected": 0.13673178851604462, + "logps/chosen": -3.84504771232605, + "logps/rejected": -4.373183250427246, + "loss": 0.6295, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.84504771232605, + "rewards/margins": 0.5281357765197754, + "rewards/rejected": -4.373183250427246, + "sft_loss": 3.883021593093872, + "step": 1235 + }, + { + "epoch": 0.6636561297875899, + "grad_norm": 10.639540991446472, + "learning_rate": 9.559327127544585e-07, + "logits/chosen": -0.041995711624622345, + "logits/rejected": 0.0656842365860939, + "logps/chosen": -3.769777297973633, + "logps/rejected": -4.308537483215332, + "loss": 0.6232, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.769777297973633, + "rewards/margins": 0.5387598276138306, + "rewards/rejected": -4.308537483215332, + "sft_loss": 3.8884778022766113, + "step": 1240 + }, + { + "epoch": 0.6663321625689914, + "grad_norm": 15.509107129403525, + "learning_rate": 9.552912049655789e-07, + "logits/chosen": 0.004844689276069403, + "logits/rejected": 0.1393628716468811, + "logps/chosen": -3.6201400756835938, + "logps/rejected": -4.24270486831665, + "loss": 0.614, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.6201400756835938, + "rewards/margins": 0.6225649118423462, + "rewards/rejected": -4.24270486831665, + "sft_loss": 3.6522445678710938, + "step": 1245 + }, + { + "epoch": 0.669008195350393, + "grad_norm": 17.954875027648637, + "learning_rate": 9.546452799394315e-07, + "logits/chosen": 0.05819585919380188, + "logits/rejected": 0.19612844288349152, + "logps/chosen": -3.7995667457580566, + "logps/rejected": -4.250999450683594, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.7995667457580566, + "rewards/margins": 0.4514332413673401, + "rewards/rejected": -4.250999450683594, + "sft_loss": 3.818459987640381, + "step": 1250 + }, + { + "epoch": 0.6716842281317946, + "grad_norm": 11.497282803803873, + "learning_rate": 9.539949439427846e-07, + "logits/chosen": 0.028142839670181274, + "logits/rejected": 0.12065265327692032, + "logps/chosen": -3.639207363128662, + "logps/rejected": -4.279654502868652, + "loss": 0.5895, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.639207363128662, + "rewards/margins": 0.6404469609260559, + "rewards/rejected": -4.279654502868652, + "sft_loss": 3.8028016090393066, + "step": 1255 + }, + { + "epoch": 0.6743602609131962, + "grad_norm": 10.502555017466102, + "learning_rate": 9.533402032852002e-07, + "logits/chosen": 0.00669657438993454, + "logits/rejected": 0.1235589012503624, + "logps/chosen": -3.6829135417938232, + "logps/rejected": -4.4046783447265625, + "loss": 0.5923, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.6829135417938232, + "rewards/margins": 0.721764862537384, + "rewards/rejected": -4.4046783447265625, + "sft_loss": 3.843273639678955, + "step": 1260 + }, + { + "epoch": 0.6770362936945977, + "grad_norm": 13.183562797082082, + "learning_rate": 9.526810643189754e-07, + "logits/chosen": 0.08398241549730301, + "logits/rejected": 0.2003088742494583, + "logps/chosen": -3.769449234008789, + "logps/rejected": -4.366641998291016, + "loss": 0.6093, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.769449234008789, + "rewards/margins": 0.5971931219100952, + "rewards/rejected": -4.366641998291016, + "sft_loss": 3.8731276988983154, + "step": 1265 + }, + { + "epoch": 0.6797123264759993, + "grad_norm": 13.682588226342201, + "learning_rate": 9.52017533439079e-07, + "logits/chosen": 0.02536749839782715, + "logits/rejected": 0.0881018415093422, + "logps/chosen": -3.840707778930664, + "logps/rejected": -4.396668910980225, + "loss": 0.6311, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.840707778930664, + "rewards/margins": 0.5559613704681396, + "rewards/rejected": -4.396668910980225, + "sft_loss": 3.9200286865234375, + "step": 1270 + }, + { + "epoch": 0.6823883592574009, + "grad_norm": 11.828789139592562, + "learning_rate": 9.513496170830909e-07, + "logits/chosen": -0.011203656904399395, + "logits/rejected": 0.0779070183634758, + "logps/chosen": -3.9936225414276123, + "logps/rejected": -4.510663986206055, + "loss": 0.6742, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9936225414276123, + "rewards/margins": 0.5170412659645081, + "rewards/rejected": -4.510663986206055, + "sft_loss": 3.9869353771209717, + "step": 1275 + }, + { + "epoch": 0.6850643920388024, + "grad_norm": 17.434450544438157, + "learning_rate": 9.506773217311382e-07, + "logits/chosen": -0.009508686140179634, + "logits/rejected": 0.11930382251739502, + "logps/chosen": -3.8271725177764893, + "logps/rejected": -4.379973411560059, + "loss": 0.6282, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.8271725177764893, + "rewards/margins": 0.5528005361557007, + "rewards/rejected": -4.379973411560059, + "sft_loss": 3.8565165996551514, + "step": 1280 + }, + { + "epoch": 0.687740424820204, + "grad_norm": 15.854921858999106, + "learning_rate": 9.500006539058334e-07, + "logits/chosen": 0.02335262857377529, + "logits/rejected": 0.12421506643295288, + "logps/chosen": -3.8039650917053223, + "logps/rejected": -4.209416389465332, + "loss": 0.6594, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.8039650917053223, + "rewards/margins": 0.4054519534111023, + "rewards/rejected": -4.209416389465332, + "sft_loss": 3.7736048698425293, + "step": 1285 + }, + { + "epoch": 0.6904164576016056, + "grad_norm": 16.02613111579297, + "learning_rate": 9.493196201722109e-07, + "logits/chosen": -0.07322710007429123, + "logits/rejected": 0.04123363643884659, + "logps/chosen": -3.893324613571167, + "logps/rejected": -4.322262763977051, + "loss": 0.6719, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.893324613571167, + "rewards/margins": 0.42893800139427185, + "rewards/rejected": -4.322262763977051, + "sft_loss": 3.895965099334717, + "step": 1290 + }, + { + "epoch": 0.6930924903830072, + "grad_norm": 11.795083046033852, + "learning_rate": 9.486342271376628e-07, + "logits/chosen": -0.014591937884688377, + "logits/rejected": -0.003014406654983759, + "logps/chosen": -3.850743055343628, + "logps/rejected": -4.518239498138428, + "loss": 0.594, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.850743055343628, + "rewards/margins": 0.6674960851669312, + "rewards/rejected": -4.518239498138428, + "sft_loss": 3.830537796020508, + "step": 1295 + }, + { + "epoch": 0.6957685231644087, + "grad_norm": 17.252417092397284, + "learning_rate": 9.479444814518755e-07, + "logits/chosen": -0.015925895422697067, + "logits/rejected": 0.18819871544837952, + "logps/chosen": -3.8830389976501465, + "logps/rejected": -4.609259128570557, + "loss": 0.5934, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.8830389976501465, + "rewards/margins": 0.7262201905250549, + "rewards/rejected": -4.609259128570557, + "sft_loss": 3.906867265701294, + "step": 1300 + }, + { + "epoch": 0.6984445559458103, + "grad_norm": 10.17591526538395, + "learning_rate": 9.472503898067645e-07, + "logits/chosen": 0.08815959841012955, + "logits/rejected": 0.1264752745628357, + "logps/chosen": -3.9305195808410645, + "logps/rejected": -4.479735374450684, + "loss": 0.6485, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.9305195808410645, + "rewards/margins": 0.5492160320281982, + "rewards/rejected": -4.479735374450684, + "sft_loss": 3.8741016387939453, + "step": 1305 + }, + { + "epoch": 0.701120588727212, + "grad_norm": 14.198842418229422, + "learning_rate": 9.465519589364099e-07, + "logits/chosen": 0.06211893633008003, + "logits/rejected": 0.12285672128200531, + "logps/chosen": -3.9210121631622314, + "logps/rejected": -4.57586145401001, + "loss": 0.6043, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.9210121631622314, + "rewards/margins": 0.6548491716384888, + "rewards/rejected": -4.57586145401001, + "sft_loss": 3.9747262001037598, + "step": 1310 + }, + { + "epoch": 0.7037966215086134, + "grad_norm": 18.363856784951615, + "learning_rate": 9.458491956169914e-07, + "logits/chosen": 0.0014732598792761564, + "logits/rejected": 0.11077898740768433, + "logps/chosen": -4.034459590911865, + "logps/rejected": -4.670225620269775, + "loss": 0.6237, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.034459590911865, + "rewards/margins": 0.6357653737068176, + "rewards/rejected": -4.670225620269775, + "sft_loss": 4.082320690155029, + "step": 1315 + }, + { + "epoch": 0.706472654290015, + "grad_norm": 13.852905792988352, + "learning_rate": 9.451421066667215e-07, + "logits/chosen": -0.10090867429971695, + "logits/rejected": 0.035910557955503464, + "logps/chosen": -3.940913677215576, + "logps/rejected": -4.5349016189575195, + "loss": 0.6018, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.940913677215576, + "rewards/margins": 0.5939876437187195, + "rewards/rejected": -4.5349016189575195, + "sft_loss": 4.0024943351745605, + "step": 1320 + }, + { + "epoch": 0.7091486870714167, + "grad_norm": 16.397248975860062, + "learning_rate": 9.444306989457805e-07, + "logits/chosen": 0.003956289496272802, + "logits/rejected": 0.10998527705669403, + "logps/chosen": -3.8461742401123047, + "logps/rejected": -4.399134635925293, + "loss": 0.6786, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.8461742401123047, + "rewards/margins": 0.552960216999054, + "rewards/rejected": -4.399134635925293, + "sft_loss": 3.8427023887634277, + "step": 1325 + }, + { + "epoch": 0.7118247198528181, + "grad_norm": 16.470232882281454, + "learning_rate": 9.437149793562489e-07, + "logits/chosen": -0.0042040422558784485, + "logits/rejected": 0.07606049627065659, + "logps/chosen": -3.7906928062438965, + "logps/rejected": -4.396378993988037, + "loss": 0.6244, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.7906928062438965, + "rewards/margins": 0.6056860685348511, + "rewards/rejected": -4.396378993988037, + "sft_loss": 3.8501904010772705, + "step": 1330 + }, + { + "epoch": 0.7145007526342197, + "grad_norm": 14.169085969543985, + "learning_rate": 9.429949548420417e-07, + "logits/chosen": -0.02851545251905918, + "logits/rejected": 0.04445397108793259, + "logps/chosen": -3.810012102127075, + "logps/rejected": -4.389849662780762, + "loss": 0.633, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.810012102127075, + "rewards/margins": 0.5798380970954895, + "rewards/rejected": -4.389849662780762, + "sft_loss": 3.8390088081359863, + "step": 1335 + }, + { + "epoch": 0.7171767854156214, + "grad_norm": 12.805135756811211, + "learning_rate": 9.422706323888396e-07, + "logits/chosen": 0.01615220494568348, + "logits/rejected": 0.0676146000623703, + "logps/chosen": -3.7289607524871826, + "logps/rejected": -4.293822288513184, + "loss": 0.6407, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.7289607524871826, + "rewards/margins": 0.5648613572120667, + "rewards/rejected": -4.293822288513184, + "sft_loss": 3.74847412109375, + "step": 1340 + }, + { + "epoch": 0.719852818197023, + "grad_norm": 14.163967951041753, + "learning_rate": 9.415420190240225e-07, + "logits/chosen": 0.018159478902816772, + "logits/rejected": 0.15642356872558594, + "logps/chosen": -3.7944741249084473, + "logps/rejected": -4.561646461486816, + "loss": 0.5273, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.7944741249084473, + "rewards/margins": 0.7671729326248169, + "rewards/rejected": -4.561646461486816, + "sft_loss": 3.849836826324463, + "step": 1345 + }, + { + "epoch": 0.7225288509784245, + "grad_norm": 17.43992081275538, + "learning_rate": 9.408091218166002e-07, + "logits/chosen": 0.03253094106912613, + "logits/rejected": 0.08874372392892838, + "logps/chosen": -3.867936611175537, + "logps/rejected": -4.314837455749512, + "loss": 0.6665, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.867936611175537, + "rewards/margins": 0.4469006657600403, + "rewards/rejected": -4.314837455749512, + "sft_loss": 3.9457309246063232, + "step": 1350 + }, + { + "epoch": 0.7252048837598261, + "grad_norm": 18.167281348220694, + "learning_rate": 9.400719478771449e-07, + "logits/chosen": -0.030413394793868065, + "logits/rejected": 0.18014629185199738, + "logps/chosen": -4.015583038330078, + "logps/rejected": -4.593112945556641, + "loss": 0.6096, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.015583038330078, + "rewards/margins": 0.5775297284126282, + "rewards/rejected": -4.593112945556641, + "sft_loss": 3.978579044342041, + "step": 1355 + }, + { + "epoch": 0.7278809165412277, + "grad_norm": 15.974570997045625, + "learning_rate": 9.393305043577209e-07, + "logits/chosen": -0.04908815771341324, + "logits/rejected": 0.03998412936925888, + "logps/chosen": -4.031233310699463, + "logps/rejected": -4.735138893127441, + "loss": 0.5853, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.031233310699463, + "rewards/margins": 0.7039054036140442, + "rewards/rejected": -4.735138893127441, + "sft_loss": 4.132689476013184, + "step": 1360 + }, + { + "epoch": 0.7305569493226292, + "grad_norm": 8.94370333182499, + "learning_rate": 9.38584798451817e-07, + "logits/chosen": -0.02070789411664009, + "logits/rejected": 0.09080308675765991, + "logps/chosen": -3.9505438804626465, + "logps/rejected": -4.525331497192383, + "loss": 0.6236, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.9505438804626465, + "rewards/margins": 0.574787437915802, + "rewards/rejected": -4.525331497192383, + "sft_loss": 3.9485411643981934, + "step": 1365 + }, + { + "epoch": 0.7332329821040308, + "grad_norm": 26.678755989582026, + "learning_rate": 9.37834837394275e-07, + "logits/chosen": -0.03940100595355034, + "logits/rejected": 0.05745028704404831, + "logps/chosen": -3.922192096710205, + "logps/rejected": -4.713294982910156, + "loss": 0.6099, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.922192096710205, + "rewards/margins": 0.7911027073860168, + "rewards/rejected": -4.713294982910156, + "sft_loss": 3.9023406505584717, + "step": 1370 + }, + { + "epoch": 0.7359090148854324, + "grad_norm": 15.42526326583326, + "learning_rate": 9.370806284612203e-07, + "logits/chosen": -0.08443541824817657, + "logits/rejected": 0.0254441536962986, + "logps/chosen": -3.8435254096984863, + "logps/rejected": -4.57711124420166, + "loss": 0.579, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.8435254096984863, + "rewards/margins": 0.73358553647995, + "rewards/rejected": -4.57711124420166, + "sft_loss": 3.8551177978515625, + "step": 1375 + }, + { + "epoch": 0.738585047666834, + "grad_norm": 12.582495846995794, + "learning_rate": 9.363221789699912e-07, + "logits/chosen": -0.0869092345237732, + "logits/rejected": 0.005117898341268301, + "logps/chosen": -3.783353805541992, + "logps/rejected": -4.234916687011719, + "loss": 0.7033, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.783353805541992, + "rewards/margins": 0.4515630304813385, + "rewards/rejected": -4.234916687011719, + "sft_loss": 3.7639663219451904, + "step": 1380 + }, + { + "epoch": 0.7412610804482355, + "grad_norm": 17.41495253319472, + "learning_rate": 9.355594962790682e-07, + "logits/chosen": -0.0802505686879158, + "logits/rejected": 0.007249271962791681, + "logps/chosen": -3.5686416625976562, + "logps/rejected": -4.24164342880249, + "loss": 0.597, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.5686416625976562, + "rewards/margins": 0.6730014085769653, + "rewards/rejected": -4.24164342880249, + "sft_loss": 3.5927505493164062, + "step": 1385 + }, + { + "epoch": 0.7439371132296371, + "grad_norm": 12.243772262322084, + "learning_rate": 9.34792587788002e-07, + "logits/chosen": -0.04855138063430786, + "logits/rejected": 0.0416254922747612, + "logps/chosen": -3.685039520263672, + "logps/rejected": -4.1918182373046875, + "loss": 0.6482, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.685039520263672, + "rewards/margins": 0.5067787170410156, + "rewards/rejected": -4.1918182373046875, + "sft_loss": 3.6339497566223145, + "step": 1390 + }, + { + "epoch": 0.7466131460110387, + "grad_norm": 15.180696863080362, + "learning_rate": 9.34021460937342e-07, + "logits/chosen": -0.03241829201579094, + "logits/rejected": 0.040210507810115814, + "logps/chosen": -3.7386927604675293, + "logps/rejected": -4.187805652618408, + "loss": 0.6493, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.7386927604675293, + "rewards/margins": 0.44911304116249084, + "rewards/rejected": -4.187805652618408, + "sft_loss": 3.7067184448242188, + "step": 1395 + }, + { + "epoch": 0.7492891787924402, + "grad_norm": 8.992479929116369, + "learning_rate": 9.332461232085646e-07, + "logits/chosen": -0.2001856565475464, + "logits/rejected": -0.11754343658685684, + "logps/chosen": -3.778327226638794, + "logps/rejected": -4.2554402351379395, + "loss": 0.6238, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.778327226638794, + "rewards/margins": 0.47711247205734253, + "rewards/rejected": -4.2554402351379395, + "sft_loss": 3.767138719558716, + "step": 1400 + }, + { + "epoch": 0.7519652115738418, + "grad_norm": 11.86927774449835, + "learning_rate": 9.324665821239998e-07, + "logits/chosen": -0.11683745682239532, + "logits/rejected": 0.019382378086447716, + "logps/chosen": -3.5938785076141357, + "logps/rejected": -4.3098320960998535, + "loss": 0.6299, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.5938785076141357, + "rewards/margins": 0.7159535884857178, + "rewards/rejected": -4.3098320960998535, + "sft_loss": 3.6116302013397217, + "step": 1405 + }, + { + "epoch": 0.7546412443552434, + "grad_norm": 15.387992935443528, + "learning_rate": 9.316828452467583e-07, + "logits/chosen": -0.10649319738149643, + "logits/rejected": 0.005260472185909748, + "logps/chosen": -3.74613618850708, + "logps/rejected": -4.398713111877441, + "loss": 0.5803, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.74613618850708, + "rewards/margins": 0.6525768041610718, + "rewards/rejected": -4.398713111877441, + "sft_loss": 3.7421631813049316, + "step": 1410 + }, + { + "epoch": 0.7573172771366449, + "grad_norm": 21.090362384980526, + "learning_rate": 9.30894920180659e-07, + "logits/chosen": -0.041864484548568726, + "logits/rejected": 0.04505940526723862, + "logps/chosen": -3.701702833175659, + "logps/rejected": -4.130052089691162, + "loss": 0.6612, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.701702833175659, + "rewards/margins": 0.42834967374801636, + "rewards/rejected": -4.130052089691162, + "sft_loss": 3.621180772781372, + "step": 1415 + }, + { + "epoch": 0.7599933099180465, + "grad_norm": 11.325300022922676, + "learning_rate": 9.301028145701543e-07, + "logits/chosen": -0.04674559831619263, + "logits/rejected": 0.055222172290086746, + "logps/chosen": -3.793757915496826, + "logps/rejected": -4.501084804534912, + "loss": 0.6039, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.793757915496826, + "rewards/margins": 0.7073268890380859, + "rewards/rejected": -4.501084804534912, + "sft_loss": 3.869464874267578, + "step": 1420 + }, + { + "epoch": 0.7626693426994481, + "grad_norm": 9.102658850824849, + "learning_rate": 9.293065361002563e-07, + "logits/chosen": -0.03859156370162964, + "logits/rejected": 0.033886075019836426, + "logps/chosen": -3.7963879108428955, + "logps/rejected": -4.407050609588623, + "loss": 0.6331, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.7963879108428955, + "rewards/margins": 0.6106627583503723, + "rewards/rejected": -4.407050609588623, + "sft_loss": 3.7996819019317627, + "step": 1425 + }, + { + "epoch": 0.7653453754808497, + "grad_norm": 14.4711171960013, + "learning_rate": 9.285060924964622e-07, + "logits/chosen": -0.14051930606365204, + "logits/rejected": -0.05075984075665474, + "logps/chosen": -3.8657398223876953, + "logps/rejected": -4.398106575012207, + "loss": 0.6229, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.8657398223876953, + "rewards/margins": 0.5323665738105774, + "rewards/rejected": -4.398106575012207, + "sft_loss": 3.8228697776794434, + "step": 1430 + }, + { + "epoch": 0.7680214082622512, + "grad_norm": 12.52167523723316, + "learning_rate": 9.277014915246792e-07, + "logits/chosen": -0.0580422468483448, + "logits/rejected": -0.019903894513845444, + "logps/chosen": -3.8900628089904785, + "logps/rejected": -4.588275909423828, + "loss": 0.5952, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.8900628089904785, + "rewards/margins": 0.6982136368751526, + "rewards/rejected": -4.588275909423828, + "sft_loss": 3.9355030059814453, + "step": 1435 + }, + { + "epoch": 0.7706974410436528, + "grad_norm": 9.415628292863678, + "learning_rate": 9.268927409911498e-07, + "logits/chosen": -0.1376069039106369, + "logits/rejected": -0.05684811994433403, + "logps/chosen": -3.823603391647339, + "logps/rejected": -4.432217121124268, + "loss": 0.6246, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.823603391647339, + "rewards/margins": 0.6086133122444153, + "rewards/rejected": -4.432217121124268, + "sft_loss": 3.8841118812561035, + "step": 1440 + }, + { + "epoch": 0.7733734738250544, + "grad_norm": 13.700848888870734, + "learning_rate": 9.260798487423749e-07, + "logits/chosen": -0.1622910499572754, + "logits/rejected": -0.034788988530635834, + "logps/chosen": -3.89607310295105, + "logps/rejected": -4.454383850097656, + "loss": 0.6166, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.89607310295105, + "rewards/margins": 0.5583103895187378, + "rewards/rejected": -4.454383850097656, + "sft_loss": 3.9466094970703125, + "step": 1445 + }, + { + "epoch": 0.7760495066064559, + "grad_norm": 15.693301225581965, + "learning_rate": 9.252628226650389e-07, + "logits/chosen": -0.08934115618467331, + "logits/rejected": -0.002043300773948431, + "logps/chosen": -3.997868776321411, + "logps/rejected": -4.518886566162109, + "loss": 0.6649, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.997868776321411, + "rewards/margins": 0.5210171341896057, + "rewards/rejected": -4.518886566162109, + "sft_loss": 4.018801689147949, + "step": 1450 + }, + { + "epoch": 0.7787255393878575, + "grad_norm": 14.661747537082288, + "learning_rate": 9.244416706859321e-07, + "logits/chosen": -0.09901993721723557, + "logits/rejected": 0.014294229447841644, + "logps/chosen": -3.921424388885498, + "logps/rejected": -4.539112567901611, + "loss": 0.6357, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.921424388885498, + "rewards/margins": 0.617688775062561, + "rewards/rejected": -4.539112567901611, + "sft_loss": 3.9897141456604004, + "step": 1455 + }, + { + "epoch": 0.7814015721692591, + "grad_norm": 12.04009641192229, + "learning_rate": 9.23616400771875e-07, + "logits/chosen": -0.09351952373981476, + "logits/rejected": 0.015524087473750114, + "logps/chosen": -3.9031898975372314, + "logps/rejected": -4.5179572105407715, + "loss": 0.6171, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.9031898975372314, + "rewards/margins": 0.6147674322128296, + "rewards/rejected": -4.5179572105407715, + "sft_loss": 3.8708298206329346, + "step": 1460 + }, + { + "epoch": 0.7840776049506607, + "grad_norm": 10.032646768408965, + "learning_rate": 9.227870209296395e-07, + "logits/chosen": -0.1010308712720871, + "logits/rejected": -0.005897210445255041, + "logps/chosen": -3.9854037761688232, + "logps/rejected": -4.480762958526611, + "loss": 0.6496, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.9854037761688232, + "rewards/margins": 0.49535924196243286, + "rewards/rejected": -4.480762958526611, + "sft_loss": 3.9827957153320312, + "step": 1465 + }, + { + "epoch": 0.7867536377320622, + "grad_norm": 9.168216544574479, + "learning_rate": 9.219535392058728e-07, + "logits/chosen": -0.13384617865085602, + "logits/rejected": -0.10935819149017334, + "logps/chosen": -3.9713332653045654, + "logps/rejected": -4.536910533905029, + "loss": 0.6398, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.9713332653045654, + "rewards/margins": 0.5655772089958191, + "rewards/rejected": -4.536910533905029, + "sft_loss": 3.9855117797851562, + "step": 1470 + }, + { + "epoch": 0.7894296705134638, + "grad_norm": 11.98976821599669, + "learning_rate": 9.211159636870181e-07, + "logits/chosen": -0.16908545792102814, + "logits/rejected": -0.044844694435596466, + "logps/chosen": -4.051001071929932, + "logps/rejected": -4.692683219909668, + "loss": 0.6194, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.051001071929932, + "rewards/margins": 0.6416821479797363, + "rewards/rejected": -4.692683219909668, + "sft_loss": 4.038779258728027, + "step": 1475 + }, + { + "epoch": 0.7921057032948654, + "grad_norm": 13.081822544871454, + "learning_rate": 9.202743024992367e-07, + "logits/chosen": -0.04509899392724037, + "logits/rejected": 0.01662004180252552, + "logps/chosen": -3.9795117378234863, + "logps/rejected": -4.6536149978637695, + "loss": 0.6271, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.9795117378234863, + "rewards/margins": 0.6741029620170593, + "rewards/rejected": -4.6536149978637695, + "sft_loss": 4.023516654968262, + "step": 1480 + }, + { + "epoch": 0.7947817360762669, + "grad_norm": 14.210973536735354, + "learning_rate": 9.194285638083293e-07, + "logits/chosen": -0.07396290451288223, + "logits/rejected": 0.04083425551652908, + "logps/chosen": -4.056087493896484, + "logps/rejected": -4.83945369720459, + "loss": 0.5591, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.056087493896484, + "rewards/margins": 0.7833663821220398, + "rewards/rejected": -4.83945369720459, + "sft_loss": 4.060484409332275, + "step": 1485 + }, + { + "epoch": 0.7974577688576685, + "grad_norm": 15.471031754520801, + "learning_rate": 9.185787558196562e-07, + "logits/chosen": -0.12588518857955933, + "logits/rejected": -0.04100862145423889, + "logps/chosen": -4.009485244750977, + "logps/rejected": -4.681950569152832, + "loss": 0.6228, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.009485244750977, + "rewards/margins": 0.6724649667739868, + "rewards/rejected": -4.681950569152832, + "sft_loss": 4.088533401489258, + "step": 1490 + }, + { + "epoch": 0.8001338016390701, + "grad_norm": 14.561177242848148, + "learning_rate": 9.177248867780583e-07, + "logits/chosen": -0.09878459572792053, + "logits/rejected": -0.009741676971316338, + "logps/chosen": -4.209200382232666, + "logps/rejected": -4.6544108390808105, + "loss": 0.6626, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.209200382232666, + "rewards/margins": 0.4452107548713684, + "rewards/rejected": -4.6544108390808105, + "sft_loss": 4.294915676116943, + "step": 1495 + }, + { + "epoch": 0.8028098344204716, + "grad_norm": 12.949238093080922, + "learning_rate": 9.168669649677769e-07, + "logits/chosen": -0.12456649541854858, + "logits/rejected": -0.03368541598320007, + "logps/chosen": -4.077846527099609, + "logps/rejected": -4.671810150146484, + "loss": 0.6596, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.077846527099609, + "rewards/margins": 0.593963086605072, + "rewards/rejected": -4.671810150146484, + "sft_loss": 4.222781658172607, + "step": 1500 + }, + { + "epoch": 0.8054858672018732, + "grad_norm": 12.368100727938941, + "learning_rate": 9.16004998712373e-07, + "logits/chosen": -0.07127787172794342, + "logits/rejected": -0.024446146562695503, + "logps/chosen": -4.05262565612793, + "logps/rejected": -4.659695625305176, + "loss": 0.6252, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.05262565612793, + "rewards/margins": 0.6070703268051147, + "rewards/rejected": -4.659695625305176, + "sft_loss": 4.069108963012695, + "step": 1505 + }, + { + "epoch": 0.8081618999832748, + "grad_norm": 12.448223400661613, + "learning_rate": 9.151389963746472e-07, + "logits/chosen": -0.12020577490329742, + "logits/rejected": 0.08523955196142197, + "logps/chosen": -3.913705348968506, + "logps/rejected": -4.676518440246582, + "loss": 0.5621, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.913705348968506, + "rewards/margins": 0.7628134489059448, + "rewards/rejected": -4.676518440246582, + "sft_loss": 3.920109272003174, + "step": 1510 + }, + { + "epoch": 0.8108379327646764, + "grad_norm": 12.886115881341064, + "learning_rate": 9.142689663565577e-07, + "logits/chosen": -0.06894917786121368, + "logits/rejected": -0.016827654093503952, + "logps/chosen": -3.890366792678833, + "logps/rejected": -4.506114482879639, + "loss": 0.6142, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.890366792678833, + "rewards/margins": 0.6157475113868713, + "rewards/rejected": -4.506114482879639, + "sft_loss": 3.963930606842041, + "step": 1515 + }, + { + "epoch": 0.8135139655460779, + "grad_norm": 12.071807685749667, + "learning_rate": 9.133949170991397e-07, + "logits/chosen": -0.10245764255523682, + "logits/rejected": -0.023907829076051712, + "logps/chosen": -3.8091113567352295, + "logps/rejected": -4.430354118347168, + "loss": 0.5976, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.8091113567352295, + "rewards/margins": 0.6212425827980042, + "rewards/rejected": -4.430354118347168, + "sft_loss": 3.9815516471862793, + "step": 1520 + }, + { + "epoch": 0.8161899983274795, + "grad_norm": 10.834734052131708, + "learning_rate": 9.125168570824231e-07, + "logits/chosen": -0.13426610827445984, + "logits/rejected": -0.01993221417069435, + "logps/chosen": -3.874058961868286, + "logps/rejected": -4.447755336761475, + "loss": 0.6235, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.874058961868286, + "rewards/margins": 0.5736962556838989, + "rewards/rejected": -4.447755336761475, + "sft_loss": 3.925154447555542, + "step": 1525 + }, + { + "epoch": 0.8188660311088811, + "grad_norm": 30.6039357353955, + "learning_rate": 9.116347948253496e-07, + "logits/chosen": -0.1942942887544632, + "logits/rejected": -0.08810271322727203, + "logps/chosen": -3.9921023845672607, + "logps/rejected": -4.488032341003418, + "loss": 0.637, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.9921023845672607, + "rewards/margins": 0.49593037366867065, + "rewards/rejected": -4.488032341003418, + "sft_loss": 4.042372703552246, + "step": 1530 + }, + { + "epoch": 0.8215420638902826, + "grad_norm": 12.922402171519687, + "learning_rate": 9.107487388856916e-07, + "logits/chosen": -0.17460720241069794, + "logits/rejected": -0.05064401775598526, + "logps/chosen": -3.8689498901367188, + "logps/rejected": -4.4816694259643555, + "loss": 0.5821, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.8689498901367188, + "rewards/margins": 0.6127195358276367, + "rewards/rejected": -4.4816694259643555, + "sft_loss": 3.943500518798828, + "step": 1535 + }, + { + "epoch": 0.8242180966716842, + "grad_norm": 16.343723546981533, + "learning_rate": 9.098586978599673e-07, + "logits/chosen": -0.10426691919565201, + "logits/rejected": -0.00016860663890838623, + "logps/chosen": -3.905703067779541, + "logps/rejected": -4.731764793395996, + "loss": 0.5837, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.905703067779541, + "rewards/margins": 0.8260625004768372, + "rewards/rejected": -4.731764793395996, + "sft_loss": 3.9430885314941406, + "step": 1540 + }, + { + "epoch": 0.8268941294530858, + "grad_norm": 10.650171573281304, + "learning_rate": 9.089646803833588e-07, + "logits/chosen": -0.11272887140512466, + "logits/rejected": -0.01053983997553587, + "logps/chosen": -3.8744194507598877, + "logps/rejected": -4.515874862670898, + "loss": 0.6116, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.8744194507598877, + "rewards/margins": 0.6414551734924316, + "rewards/rejected": -4.515874862670898, + "sft_loss": 3.9839954376220703, + "step": 1545 + }, + { + "epoch": 0.8295701622344873, + "grad_norm": 11.204609395323448, + "learning_rate": 9.080666951296276e-07, + "logits/chosen": -0.2166566550731659, + "logits/rejected": -0.030534937977790833, + "logps/chosen": -3.8562233448028564, + "logps/rejected": -4.782811164855957, + "loss": 0.5023, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.8562233448028564, + "rewards/margins": 0.9265871047973633, + "rewards/rejected": -4.782811164855957, + "sft_loss": 3.879866123199463, + "step": 1550 + }, + { + "epoch": 0.8322461950158889, + "grad_norm": 11.280835645464546, + "learning_rate": 9.071647508110305e-07, + "logits/chosen": -0.17664167284965515, + "logits/rejected": -0.0007077917689457536, + "logps/chosen": -3.9161293506622314, + "logps/rejected": -4.722474098205566, + "loss": 0.6143, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.9161293506622314, + "rewards/margins": 0.8063453435897827, + "rewards/rejected": -4.722474098205566, + "sft_loss": 3.895829677581787, + "step": 1555 + }, + { + "epoch": 0.8349222277972905, + "grad_norm": 12.603156990040194, + "learning_rate": 9.062588561782354e-07, + "logits/chosen": -0.10256105661392212, + "logits/rejected": -0.045074738562107086, + "logps/chosen": -4.097698211669922, + "logps/rejected": -4.654773712158203, + "loss": 0.6505, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.097698211669922, + "rewards/margins": 0.5570759177207947, + "rewards/rejected": -4.654773712158203, + "sft_loss": 4.164440155029297, + "step": 1560 + }, + { + "epoch": 0.8375982605786921, + "grad_norm": 10.578843607292864, + "learning_rate": 9.053490200202358e-07, + "logits/chosen": -0.06902100145816803, + "logits/rejected": 0.005874344613403082, + "logps/chosen": -4.0252180099487305, + "logps/rejected": -4.597375392913818, + "loss": 0.6331, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.0252180099487305, + "rewards/margins": 0.5721582174301147, + "rewards/rejected": -4.597375392913818, + "sft_loss": 4.069859504699707, + "step": 1565 + }, + { + "epoch": 0.8402742933600936, + "grad_norm": 16.18310307037792, + "learning_rate": 9.044352511642661e-07, + "logits/chosen": -0.0919872522354126, + "logits/rejected": -0.05589014291763306, + "logps/chosen": -4.019768238067627, + "logps/rejected": -4.503331184387207, + "loss": 0.7016, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.019768238067627, + "rewards/margins": 0.48356255888938904, + "rewards/rejected": -4.503331184387207, + "sft_loss": 4.123537063598633, + "step": 1570 + }, + { + "epoch": 0.8429503261414952, + "grad_norm": 10.830456959519374, + "learning_rate": 9.03517558475716e-07, + "logits/chosen": -0.08890876919031143, + "logits/rejected": -0.015451954677700996, + "logps/chosen": -3.699070453643799, + "logps/rejected": -4.209559440612793, + "loss": 0.626, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.699070453643799, + "rewards/margins": 0.5104890465736389, + "rewards/rejected": -4.209559440612793, + "sft_loss": 3.7042746543884277, + "step": 1575 + }, + { + "epoch": 0.8456263589228968, + "grad_norm": 9.401950567422222, + "learning_rate": 9.025959508580436e-07, + "logits/chosen": -0.03602803871035576, + "logits/rejected": 0.11538251489400864, + "logps/chosen": -3.798994541168213, + "logps/rejected": -4.442479133605957, + "loss": 0.5996, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.798994541168213, + "rewards/margins": 0.6434845328330994, + "rewards/rejected": -4.442479133605957, + "sft_loss": 3.7785956859588623, + "step": 1580 + }, + { + "epoch": 0.8483023917042983, + "grad_norm": 10.214149261214663, + "learning_rate": 9.016704372526905e-07, + "logits/chosen": -0.08980683982372284, + "logits/rejected": 0.01351415365934372, + "logps/chosen": -3.624281644821167, + "logps/rejected": -4.2779221534729, + "loss": 0.5905, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.624281644821167, + "rewards/margins": 0.6536401510238647, + "rewards/rejected": -4.2779221534729, + "sft_loss": 3.7103257179260254, + "step": 1585 + }, + { + "epoch": 0.8509784244856999, + "grad_norm": 14.182967548181901, + "learning_rate": 9.007410266389934e-07, + "logits/chosen": -0.15173068642616272, + "logits/rejected": -0.08541973680257797, + "logps/chosen": -3.6514763832092285, + "logps/rejected": -4.157805442810059, + "loss": 0.6305, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.6514763832092285, + "rewards/margins": 0.5063292384147644, + "rewards/rejected": -4.157805442810059, + "sft_loss": 3.675691604614258, + "step": 1590 + }, + { + "epoch": 0.8536544572671015, + "grad_norm": 16.02994223042789, + "learning_rate": 8.998077280340981e-07, + "logits/chosen": -0.04729296639561653, + "logits/rejected": -0.004415331874042749, + "logps/chosen": -3.785186767578125, + "logps/rejected": -4.244810104370117, + "loss": 0.6665, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.785186767578125, + "rewards/margins": 0.45962339639663696, + "rewards/rejected": -4.244810104370117, + "sft_loss": 3.740236759185791, + "step": 1595 + }, + { + "epoch": 0.8563304900485031, + "grad_norm": 10.885918050761497, + "learning_rate": 8.988705504928722e-07, + "logits/chosen": -0.14420801401138306, + "logits/rejected": -0.02436070702970028, + "logps/chosen": -3.710519313812256, + "logps/rejected": -4.561938762664795, + "loss": 0.5282, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.710519313812256, + "rewards/margins": 0.8514199256896973, + "rewards/rejected": -4.561938762664795, + "sft_loss": 3.786695957183838, + "step": 1600 + }, + { + "epoch": 0.8563304900485031, + "eval_logits/chosen": 0.14557655155658722, + "eval_logits/rejected": 0.21837201714515686, + "eval_logps/chosen": -3.799365758895874, + "eval_logps/rejected": -4.488193988800049, + "eval_loss": 0.5851835608482361, + "eval_rewards/accuracies": 0.7173590660095215, + "eval_rewards/chosen": -3.799365758895874, + "eval_rewards/margins": 0.68882817029953, + "eval_rewards/rejected": -4.488193988800049, + "eval_runtime": 42.9857, + "eval_samples_per_second": 31.289, + "eval_sft_loss": 3.8604371547698975, + "eval_steps_per_second": 7.84, + "step": 1600 + }, + { + "epoch": 0.8590065228299046, + "grad_norm": 12.27828649507042, + "learning_rate": 8.979295031078157e-07, + "logits/chosen": -0.10497484356164932, + "logits/rejected": 0.056699056178331375, + "logps/chosen": -3.9257049560546875, + "logps/rejected": -4.624436855316162, + "loss": 0.5789, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.9257049560546875, + "rewards/margins": 0.6987317204475403, + "rewards/rejected": -4.624436855316162, + "sft_loss": 3.912734270095825, + "step": 1605 + }, + { + "epoch": 0.8616825556113062, + "grad_norm": 11.632477594854953, + "learning_rate": 8.969845950089751e-07, + "logits/chosen": -0.12709620594978333, + "logits/rejected": 0.0015153981512412429, + "logps/chosen": -3.961102247238159, + "logps/rejected": -4.7628655433654785, + "loss": 0.5721, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.961102247238159, + "rewards/margins": 0.8017629384994507, + "rewards/rejected": -4.7628655433654785, + "sft_loss": 4.05449914932251, + "step": 1610 + }, + { + "epoch": 0.8643585883927078, + "grad_norm": 17.797718688717936, + "learning_rate": 8.960358353638526e-07, + "logits/chosen": -0.07080388069152832, + "logits/rejected": 0.014102371409535408, + "logps/chosen": -4.029933452606201, + "logps/rejected": -4.655080318450928, + "loss": 0.6526, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.029933452606201, + "rewards/margins": 0.6251465082168579, + "rewards/rejected": -4.655080318450928, + "sft_loss": 4.067551136016846, + "step": 1615 + }, + { + "epoch": 0.8670346211741093, + "grad_norm": 13.620376101090379, + "learning_rate": 8.950832333773184e-07, + "logits/chosen": -0.07276565581560135, + "logits/rejected": 0.03689710050821304, + "logps/chosen": -3.949676513671875, + "logps/rejected": -4.511597156524658, + "loss": 0.6814, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.949676513671875, + "rewards/margins": 0.5619208812713623, + "rewards/rejected": -4.511597156524658, + "sft_loss": 4.0169172286987305, + "step": 1620 + }, + { + "epoch": 0.869710653955511, + "grad_norm": 15.277906245685788, + "learning_rate": 8.941267982915213e-07, + "logits/chosen": -0.05725591629743576, + "logits/rejected": -0.01894243434071541, + "logps/chosen": -3.93320894241333, + "logps/rejected": -4.239479064941406, + "loss": 0.7493, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.93320894241333, + "rewards/margins": 0.30627012252807617, + "rewards/rejected": -4.239479064941406, + "sft_loss": 3.906848192214966, + "step": 1625 + }, + { + "epoch": 0.8723866867369126, + "grad_norm": 10.028521696122647, + "learning_rate": 8.931665393857983e-07, + "logits/chosen": -0.05407093092799187, + "logits/rejected": 0.039415888488292694, + "logps/chosen": -3.6694939136505127, + "logps/rejected": -4.256445407867432, + "loss": 0.6118, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.6694939136505127, + "rewards/margins": 0.5869513750076294, + "rewards/rejected": -4.256445407867432, + "sft_loss": 3.7156128883361816, + "step": 1630 + }, + { + "epoch": 0.875062719518314, + "grad_norm": 9.740621126018869, + "learning_rate": 8.922024659765861e-07, + "logits/chosen": -0.13916271924972534, + "logits/rejected": -0.03805696219205856, + "logps/chosen": -3.4495654106140137, + "logps/rejected": -4.118634223937988, + "loss": 0.579, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.4495654106140137, + "rewards/margins": 0.6690683364868164, + "rewards/rejected": -4.118634223937988, + "sft_loss": 3.4922516345977783, + "step": 1635 + }, + { + "epoch": 0.8777387522997157, + "grad_norm": 16.928582236655778, + "learning_rate": 8.912345874173288e-07, + "logits/chosen": -0.14447686076164246, + "logits/rejected": -0.056305646896362305, + "logps/chosen": -3.626981258392334, + "logps/rejected": -4.2726945877075195, + "loss": 0.6124, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.626981258392334, + "rewards/margins": 0.6457129716873169, + "rewards/rejected": -4.2726945877075195, + "sft_loss": 3.6541404724121094, + "step": 1640 + }, + { + "epoch": 0.8804147850811173, + "grad_norm": 11.880923542733687, + "learning_rate": 8.902629130983885e-07, + "logits/chosen": -0.10572276264429092, + "logits/rejected": -0.07693799585103989, + "logps/chosen": -3.740095853805542, + "logps/rejected": -4.2216291427612305, + "loss": 0.6411, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.740095853805542, + "rewards/margins": 0.48153313994407654, + "rewards/rejected": -4.2216291427612305, + "sft_loss": 3.799666166305542, + "step": 1645 + }, + { + "epoch": 0.8830908178625189, + "grad_norm": 13.283487323354745, + "learning_rate": 8.892874524469537e-07, + "logits/chosen": -0.010224530473351479, + "logits/rejected": 0.01489633321762085, + "logps/chosen": -3.5730910301208496, + "logps/rejected": -4.23054313659668, + "loss": 0.5645, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.5730910301208496, + "rewards/margins": 0.6574516296386719, + "rewards/rejected": -4.23054313659668, + "sft_loss": 3.5423073768615723, + "step": 1650 + }, + { + "epoch": 0.8857668506439204, + "grad_norm": 12.396900074714226, + "learning_rate": 8.883082149269478e-07, + "logits/chosen": -0.13912677764892578, + "logits/rejected": -0.05157994478940964, + "logps/chosen": -3.792473554611206, + "logps/rejected": -4.402674674987793, + "loss": 0.5987, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.792473554611206, + "rewards/margins": 0.6102014183998108, + "rewards/rejected": -4.402674674987793, + "sft_loss": 3.8069300651550293, + "step": 1655 + }, + { + "epoch": 0.888442883425322, + "grad_norm": 16.16834676319544, + "learning_rate": 8.873252100389377e-07, + "logits/chosen": -0.06999781727790833, + "logits/rejected": -0.01696675829589367, + "logps/chosen": -3.7842602729797363, + "logps/rejected": -4.433149337768555, + "loss": 0.583, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.7842602729797363, + "rewards/margins": 0.6488891839981079, + "rewards/rejected": -4.433149337768555, + "sft_loss": 3.8168907165527344, + "step": 1660 + }, + { + "epoch": 0.8911189162067236, + "grad_norm": 18.052223178294405, + "learning_rate": 8.863384473200411e-07, + "logits/chosen": -0.09506978839635849, + "logits/rejected": -0.04939929395914078, + "logps/chosen": -4.108767032623291, + "logps/rejected": -4.559029579162598, + "loss": 0.6721, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.108767032623291, + "rewards/margins": 0.45026326179504395, + "rewards/rejected": -4.559029579162598, + "sft_loss": 4.11429500579834, + "step": 1665 + }, + { + "epoch": 0.8937949489881251, + "grad_norm": 12.903552574200932, + "learning_rate": 8.853479363438342e-07, + "logits/chosen": -0.02410072088241577, + "logits/rejected": 0.08524173498153687, + "logps/chosen": -4.105332374572754, + "logps/rejected": -4.600467681884766, + "loss": 0.6741, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.105332374572754, + "rewards/margins": 0.49513569474220276, + "rewards/rejected": -4.600467681884766, + "sft_loss": 4.071771621704102, + "step": 1670 + }, + { + "epoch": 0.8964709817695267, + "grad_norm": 15.292369300212767, + "learning_rate": 8.843536867202588e-07, + "logits/chosen": -0.06857451051473618, + "logits/rejected": 0.05873778462409973, + "logps/chosen": -3.989157199859619, + "logps/rejected": -4.681168079376221, + "loss": 0.602, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.989157199859619, + "rewards/margins": 0.6920109391212463, + "rewards/rejected": -4.681168079376221, + "sft_loss": 4.073268890380859, + "step": 1675 + }, + { + "epoch": 0.8991470145509283, + "grad_norm": 17.441981364376606, + "learning_rate": 8.833557080955292e-07, + "logits/chosen": -0.14238521456718445, + "logits/rejected": -0.06444496661424637, + "logps/chosen": -3.9185664653778076, + "logps/rejected": -4.394036293029785, + "loss": 0.659, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.9185664653778076, + "rewards/margins": 0.47546929121017456, + "rewards/rejected": -4.394036293029785, + "sft_loss": 3.975456953048706, + "step": 1680 + }, + { + "epoch": 0.9018230473323299, + "grad_norm": 15.180284082109292, + "learning_rate": 8.823540101520381e-07, + "logits/chosen": -0.06266282498836517, + "logits/rejected": 0.06844954192638397, + "logps/chosen": -3.8134632110595703, + "logps/rejected": -4.475861549377441, + "loss": 0.6285, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.8134632110595703, + "rewards/margins": 0.6623983979225159, + "rewards/rejected": -4.475861549377441, + "sft_loss": 3.880690097808838, + "step": 1685 + }, + { + "epoch": 0.9044990801137314, + "grad_norm": 15.740476534234624, + "learning_rate": 8.813486026082637e-07, + "logits/chosen": -0.10209021717309952, + "logits/rejected": 0.04368484765291214, + "logps/chosen": -3.8531203269958496, + "logps/rejected": -4.5683064460754395, + "loss": 0.5769, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.8531203269958496, + "rewards/margins": 0.7151857614517212, + "rewards/rejected": -4.5683064460754395, + "sft_loss": 3.9027419090270996, + "step": 1690 + }, + { + "epoch": 0.907175112895133, + "grad_norm": 21.63075410350149, + "learning_rate": 8.803394952186742e-07, + "logits/chosen": -0.20173931121826172, + "logits/rejected": -0.07687047868967056, + "logps/chosen": -3.8959603309631348, + "logps/rejected": -4.4894514083862305, + "loss": 0.6031, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.8959603309631348, + "rewards/margins": 0.5934913754463196, + "rewards/rejected": -4.4894514083862305, + "sft_loss": 3.912142276763916, + "step": 1695 + }, + { + "epoch": 0.9098511456765346, + "grad_norm": 13.107913392579063, + "learning_rate": 8.793266977736342e-07, + "logits/chosen": -0.0669541209936142, + "logits/rejected": -0.09892503172159195, + "logps/chosen": -3.954420566558838, + "logps/rejected": -4.294497966766357, + "loss": 0.6963, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.954420566558838, + "rewards/margins": 0.3400774300098419, + "rewards/rejected": -4.294497966766357, + "sft_loss": 4.035944938659668, + "step": 1700 + }, + { + "epoch": 0.9125271784579361, + "grad_norm": 13.270751912496983, + "learning_rate": 8.783102200993085e-07, + "logits/chosen": -0.06592990458011627, + "logits/rejected": 0.03833655267953873, + "logps/chosen": -3.835791826248169, + "logps/rejected": -4.498007774353027, + "loss": 0.589, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.835791826248169, + "rewards/margins": 0.6622158288955688, + "rewards/rejected": -4.498007774353027, + "sft_loss": 3.873680830001831, + "step": 1705 + }, + { + "epoch": 0.9152032112393377, + "grad_norm": 12.413522513995133, + "learning_rate": 8.772900720575683e-07, + "logits/chosen": -0.09897182881832123, + "logits/rejected": -0.014216387644410133, + "logps/chosen": -4.089080810546875, + "logps/rejected": -4.6123456954956055, + "loss": 0.6242, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.089080810546875, + "rewards/margins": 0.52326500415802, + "rewards/rejected": -4.6123456954956055, + "sft_loss": 4.189183235168457, + "step": 1710 + }, + { + "epoch": 0.9178792440207393, + "grad_norm": 17.5216145626001, + "learning_rate": 8.762662635458944e-07, + "logits/chosen": -0.11287244409322739, + "logits/rejected": 0.005205526947975159, + "logps/chosen": -4.19803524017334, + "logps/rejected": -4.849751949310303, + "loss": 0.627, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.19803524017334, + "rewards/margins": 0.6517167687416077, + "rewards/rejected": -4.849751949310303, + "sft_loss": 4.22841739654541, + "step": 1715 + }, + { + "epoch": 0.9205552768021408, + "grad_norm": 14.685549172591013, + "learning_rate": 8.752388044972811e-07, + "logits/chosen": -0.08108510076999664, + "logits/rejected": -0.023788919672369957, + "logps/chosen": -4.0276780128479, + "logps/rejected": -4.748166084289551, + "loss": 0.5887, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.0276780128479, + "rewards/margins": 0.7204879522323608, + "rewards/rejected": -4.748166084289551, + "sft_loss": 4.157613754272461, + "step": 1720 + }, + { + "epoch": 0.9232313095835424, + "grad_norm": 12.536529497993769, + "learning_rate": 8.74207704880141e-07, + "logits/chosen": -0.09019587934017181, + "logits/rejected": -0.009882120415568352, + "logps/chosen": -4.088213920593262, + "logps/rejected": -4.951138019561768, + "loss": 0.5387, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.088213920593262, + "rewards/margins": 0.8629242777824402, + "rewards/rejected": -4.951138019561768, + "sft_loss": 4.194894790649414, + "step": 1725 + }, + { + "epoch": 0.925907342364944, + "grad_norm": 11.991754829936772, + "learning_rate": 8.731729746982068e-07, + "logits/chosen": -0.048121221363544464, + "logits/rejected": 0.014382058754563332, + "logps/chosen": -3.8855576515197754, + "logps/rejected": -4.534394264221191, + "loss": 0.5889, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.8855576515197754, + "rewards/margins": 0.6488367319107056, + "rewards/rejected": -4.534394264221191, + "sft_loss": 4.058175563812256, + "step": 1730 + }, + { + "epoch": 0.9285833751463456, + "grad_norm": 15.905985513002536, + "learning_rate": 8.721346239904355e-07, + "logits/chosen": -0.15635357797145844, + "logits/rejected": -0.009334458038210869, + "logps/chosen": -4.011080741882324, + "logps/rejected": -4.760368824005127, + "loss": 0.6137, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.011080741882324, + "rewards/margins": 0.7492876052856445, + "rewards/rejected": -4.760368824005127, + "sft_loss": 4.057827949523926, + "step": 1735 + }, + { + "epoch": 0.9312594079277471, + "grad_norm": 13.754476655611992, + "learning_rate": 8.710926628309101e-07, + "logits/chosen": -0.07682603597640991, + "logits/rejected": 0.05229213088750839, + "logps/chosen": -3.996446132659912, + "logps/rejected": -4.582263946533203, + "loss": 0.6005, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.996446132659912, + "rewards/margins": 0.5858179330825806, + "rewards/rejected": -4.582263946533203, + "sft_loss": 4.014688491821289, + "step": 1740 + }, + { + "epoch": 0.9339354407091487, + "grad_norm": 10.067335179362507, + "learning_rate": 8.700471013287424e-07, + "logits/chosen": -0.07593150436878204, + "logits/rejected": -0.04116532951593399, + "logps/chosen": -3.863304853439331, + "logps/rejected": -4.45562219619751, + "loss": 0.5992, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.863304853439331, + "rewards/margins": 0.5923171639442444, + "rewards/rejected": -4.45562219619751, + "sft_loss": 3.9426918029785156, + "step": 1745 + }, + { + "epoch": 0.9366114734905503, + "grad_norm": 18.353916912030076, + "learning_rate": 8.689979496279746e-07, + "logits/chosen": -0.08506940305233002, + "logits/rejected": -0.035322219133377075, + "logps/chosen": -4.098275184631348, + "logps/rejected": -4.524683952331543, + "loss": 0.699, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.098275184631348, + "rewards/margins": 0.42640891671180725, + "rewards/rejected": -4.524683952331543, + "sft_loss": 4.1536760330200195, + "step": 1750 + }, + { + "epoch": 0.9392875062719518, + "grad_norm": 13.388975189982899, + "learning_rate": 8.679452179074811e-07, + "logits/chosen": -0.10464473068714142, + "logits/rejected": -0.030910471454262733, + "logps/chosen": -3.925689697265625, + "logps/rejected": -4.58644962310791, + "loss": 0.5593, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.925689697265625, + "rewards/margins": 0.6607602834701538, + "rewards/rejected": -4.58644962310791, + "sft_loss": 4.019021987915039, + "step": 1755 + }, + { + "epoch": 0.9419635390533534, + "grad_norm": 13.16293333208723, + "learning_rate": 8.668889163808698e-07, + "logits/chosen": -0.08910728991031647, + "logits/rejected": -0.0004989765584468842, + "logps/chosen": -3.8630428314208984, + "logps/rejected": -4.422264575958252, + "loss": 0.6096, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.8630428314208984, + "rewards/margins": 0.5592218637466431, + "rewards/rejected": -4.422264575958252, + "sft_loss": 3.9234790802001953, + "step": 1760 + }, + { + "epoch": 0.944639571834755, + "grad_norm": 14.103884346044767, + "learning_rate": 8.658290552963827e-07, + "logits/chosen": -0.03781446814537048, + "logits/rejected": -0.0019023215863853693, + "logps/chosen": -3.9329593181610107, + "logps/rejected": -4.5812506675720215, + "loss": 0.635, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.9329593181610107, + "rewards/margins": 0.6482918858528137, + "rewards/rejected": -4.5812506675720215, + "sft_loss": 3.982529401779175, + "step": 1765 + }, + { + "epoch": 0.9473156046161565, + "grad_norm": 10.813112953658766, + "learning_rate": 8.647656449367966e-07, + "logits/chosen": -0.030758926644921303, + "logits/rejected": 0.08899354934692383, + "logps/chosen": -3.8885810375213623, + "logps/rejected": -4.4750494956970215, + "loss": 0.6233, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.8885810375213623, + "rewards/margins": 0.5864687561988831, + "rewards/rejected": -4.4750494956970215, + "sft_loss": 3.9688422679901123, + "step": 1770 + }, + { + "epoch": 0.9499916373975581, + "grad_norm": 11.772991974358334, + "learning_rate": 8.636986956193235e-07, + "logits/chosen": -0.11877866089344025, + "logits/rejected": -0.031002437695860863, + "logps/chosen": -3.7342638969421387, + "logps/rejected": -4.362320899963379, + "loss": 0.6135, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.7342638969421387, + "rewards/margins": 0.6280564069747925, + "rewards/rejected": -4.362320899963379, + "sft_loss": 3.768057346343994, + "step": 1775 + }, + { + "epoch": 0.9526676701789597, + "grad_norm": 12.658512953340415, + "learning_rate": 8.626282176955104e-07, + "logits/chosen": -0.05375178903341293, + "logits/rejected": 0.04362834244966507, + "logps/chosen": -3.6902244091033936, + "logps/rejected": -4.343583106994629, + "loss": 0.5861, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.6902244091033936, + "rewards/margins": 0.6533581018447876, + "rewards/rejected": -4.343583106994629, + "sft_loss": 3.7296364307403564, + "step": 1780 + }, + { + "epoch": 0.9553437029603613, + "grad_norm": 14.179960732016779, + "learning_rate": 8.615542215511389e-07, + "logits/chosen": -0.037360578775405884, + "logits/rejected": 0.00016373097605537623, + "logps/chosen": -3.7651381492614746, + "logps/rejected": -4.201287746429443, + "loss": 0.6528, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.7651381492614746, + "rewards/margins": 0.436149537563324, + "rewards/rejected": -4.201287746429443, + "sft_loss": 3.768385648727417, + "step": 1785 + }, + { + "epoch": 0.9580197357417628, + "grad_norm": 16.424368962312393, + "learning_rate": 8.604767176061241e-07, + "logits/chosen": -0.03774772211909294, + "logits/rejected": 0.029529806226491928, + "logps/chosen": -3.819180727005005, + "logps/rejected": -4.342913627624512, + "loss": 0.624, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.819180727005005, + "rewards/margins": 0.5237328410148621, + "rewards/rejected": -4.342913627624512, + "sft_loss": 3.8550522327423096, + "step": 1790 + }, + { + "epoch": 0.9606957685231644, + "grad_norm": 10.137170437138185, + "learning_rate": 8.593957163144141e-07, + "logits/chosen": -0.11584246158599854, + "logits/rejected": -0.008330103941261768, + "logps/chosen": -3.6586318016052246, + "logps/rejected": -4.388864994049072, + "loss": 0.5723, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.6586318016052246, + "rewards/margins": 0.7302330732345581, + "rewards/rejected": -4.388864994049072, + "sft_loss": 3.8012499809265137, + "step": 1795 + }, + { + "epoch": 0.963371801304566, + "grad_norm": 11.336523700086135, + "learning_rate": 8.58311228163888e-07, + "logits/chosen": -0.06548066437244415, + "logits/rejected": 0.008463447913527489, + "logps/chosen": -3.7641005516052246, + "logps/rejected": -4.285780429840088, + "loss": 0.618, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.7641005516052246, + "rewards/margins": 0.5216798782348633, + "rewards/rejected": -4.285780429840088, + "sft_loss": 3.7976505756378174, + "step": 1800 + }, + { + "epoch": 0.9660478340859675, + "grad_norm": 17.348790285283606, + "learning_rate": 8.57223263676255e-07, + "logits/chosen": -0.1233416348695755, + "logits/rejected": -0.029293501749634743, + "logps/chosen": -3.632089138031006, + "logps/rejected": -4.459404945373535, + "loss": 0.5263, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.632089138031006, + "rewards/margins": 0.8273167610168457, + "rewards/rejected": -4.459404945373535, + "sft_loss": 3.6921048164367676, + "step": 1805 + }, + { + "epoch": 0.9687238668673691, + "grad_norm": 11.301010273536113, + "learning_rate": 8.561318334069511e-07, + "logits/chosen": -0.07567024976015091, + "logits/rejected": 0.0327785387635231, + "logps/chosen": -3.817183256149292, + "logps/rejected": -4.456064701080322, + "loss": 0.5807, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.817183256149292, + "rewards/margins": 0.6388810873031616, + "rewards/rejected": -4.456064701080322, + "sft_loss": 3.8272976875305176, + "step": 1810 + }, + { + "epoch": 0.9713998996487707, + "grad_norm": 12.81936248461788, + "learning_rate": 8.550369479450375e-07, + "logits/chosen": -0.07301422208547592, + "logits/rejected": 0.004421988967806101, + "logps/chosen": -3.991558790206909, + "logps/rejected": -4.765776634216309, + "loss": 0.5697, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.991558790206909, + "rewards/margins": 0.7742177248001099, + "rewards/rejected": -4.765776634216309, + "sft_loss": 4.0272016525268555, + "step": 1815 + }, + { + "epoch": 0.9740759324301723, + "grad_norm": 17.547856262711125, + "learning_rate": 8.539386179130977e-07, + "logits/chosen": -0.06253395229578018, + "logits/rejected": -0.03671478480100632, + "logps/chosen": -4.0807695388793945, + "logps/rejected": -4.742793560028076, + "loss": 0.5886, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.0807695388793945, + "rewards/margins": 0.6620240211486816, + "rewards/rejected": -4.742793560028076, + "sft_loss": 4.02866268157959, + "step": 1820 + }, + { + "epoch": 0.9767519652115738, + "grad_norm": 13.651133484117285, + "learning_rate": 8.528368539671347e-07, + "logits/chosen": -0.1057223305106163, + "logits/rejected": -0.010374218225479126, + "logps/chosen": -3.9620888233184814, + "logps/rejected": -4.94193172454834, + "loss": 0.5389, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.9620888233184814, + "rewards/margins": 0.9798428416252136, + "rewards/rejected": -4.94193172454834, + "sft_loss": 4.024913787841797, + "step": 1825 + }, + { + "epoch": 0.9794279979929754, + "grad_norm": 14.638181690700069, + "learning_rate": 8.51731666796467e-07, + "logits/chosen": -0.07686323672533035, + "logits/rejected": -0.020837152376770973, + "logps/chosen": -4.231797218322754, + "logps/rejected": -4.9157586097717285, + "loss": 0.6144, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.231797218322754, + "rewards/margins": 0.683961033821106, + "rewards/rejected": -4.9157586097717285, + "sft_loss": 4.229867935180664, + "step": 1830 + }, + { + "epoch": 0.982104030774377, + "grad_norm": 14.737203631162844, + "learning_rate": 8.506230671236254e-07, + "logits/chosen": -0.116435207426548, + "logits/rejected": -0.06153398007154465, + "logps/chosen": -4.225415229797363, + "logps/rejected": -4.778110504150391, + "loss": 0.6326, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.225415229797363, + "rewards/margins": 0.5526946783065796, + "rewards/rejected": -4.778110504150391, + "sft_loss": 4.249210834503174, + "step": 1835 + }, + { + "epoch": 0.9847800635557785, + "grad_norm": 12.484086685506309, + "learning_rate": 8.495110657042488e-07, + "logits/chosen": -0.07110501825809479, + "logits/rejected": 0.061887145042419434, + "logps/chosen": -4.253946781158447, + "logps/rejected": -5.050871849060059, + "loss": 0.5617, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.253946781158447, + "rewards/margins": 0.7969244122505188, + "rewards/rejected": -5.050871849060059, + "sft_loss": 4.325262069702148, + "step": 1840 + }, + { + "epoch": 0.9874560963371801, + "grad_norm": 15.068150123718786, + "learning_rate": 8.483956733269799e-07, + "logits/chosen": -0.10341789573431015, + "logits/rejected": -0.020474877208471298, + "logps/chosen": -4.231016635894775, + "logps/rejected": -4.958784103393555, + "loss": 0.5979, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.231016635894775, + "rewards/margins": 0.7277677655220032, + "rewards/rejected": -4.958784103393555, + "sft_loss": 4.313578128814697, + "step": 1845 + }, + { + "epoch": 0.9901321291185817, + "grad_norm": 21.8315650041553, + "learning_rate": 8.472769008133602e-07, + "logits/chosen": -0.20346875488758087, + "logits/rejected": -0.09715770184993744, + "logps/chosen": -4.41485071182251, + "logps/rejected": -5.1168389320373535, + "loss": 0.6072, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.41485071182251, + "rewards/margins": 0.7019883394241333, + "rewards/rejected": -5.1168389320373535, + "sft_loss": 4.39910888671875, + "step": 1850 + }, + { + "epoch": 0.9928081618999832, + "grad_norm": 20.59256668239971, + "learning_rate": 8.461547590177259e-07, + "logits/chosen": -0.09526954591274261, + "logits/rejected": 0.004205456469208002, + "logps/chosen": -4.31170654296875, + "logps/rejected": -5.111785888671875, + "loss": 0.641, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.31170654296875, + "rewards/margins": 0.8000794649124146, + "rewards/rejected": -5.111785888671875, + "sft_loss": 4.410150527954102, + "step": 1855 + }, + { + "epoch": 0.9954841946813848, + "grad_norm": 15.729252198391519, + "learning_rate": 8.450292588271014e-07, + "logits/chosen": -0.09368324279785156, + "logits/rejected": -0.002443189499899745, + "logps/chosen": -4.455938816070557, + "logps/rejected": -5.167759895324707, + "loss": 0.5937, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.455938816070557, + "rewards/margins": 0.7118209600448608, + "rewards/rejected": -5.167759895324707, + "sft_loss": 4.464524269104004, + "step": 1860 + }, + { + "epoch": 0.9981602274627864, + "grad_norm": 19.667161127776616, + "learning_rate": 8.439004111610945e-07, + "logits/chosen": -0.08779771625995636, + "logits/rejected": -0.034873414784669876, + "logps/chosen": -4.279250144958496, + "logps/rejected": -5.026732444763184, + "loss": 0.604, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.279250144958496, + "rewards/margins": 0.747482419013977, + "rewards/rejected": -5.026732444763184, + "sft_loss": 4.387712001800537, + "step": 1865 + }, + { + "epoch": 1.000836260244188, + "grad_norm": 14.558493175272398, + "learning_rate": 8.427682269717901e-07, + "logits/chosen": -0.12827758491039276, + "logits/rejected": -0.027052491903305054, + "logps/chosen": -4.180931091308594, + "logps/rejected": -4.950901031494141, + "loss": 0.5554, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.180931091308594, + "rewards/margins": 0.7699697017669678, + "rewards/rejected": -4.950901031494141, + "sft_loss": 4.20839786529541, + "step": 1870 + }, + { + "epoch": 1.0035122930255895, + "grad_norm": 18.27663410760242, + "learning_rate": 8.416327172436446e-07, + "logits/chosen": -0.17196258902549744, + "logits/rejected": -0.06464549154043198, + "logps/chosen": -4.138596534729004, + "logps/rejected": -4.672633171081543, + "loss": 0.6435, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.138596534729004, + "rewards/margins": 0.5340362787246704, + "rewards/rejected": -4.672633171081543, + "sft_loss": 4.115258693695068, + "step": 1875 + }, + { + "epoch": 1.0061883258069912, + "grad_norm": 12.116387685411857, + "learning_rate": 8.404938929933778e-07, + "logits/chosen": -0.055260974913835526, + "logits/rejected": 0.04977753013372421, + "logps/chosen": -3.918208360671997, + "logps/rejected": -4.867048740386963, + "loss": 0.5035, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.918208360671997, + "rewards/margins": 0.9488400220870972, + "rewards/rejected": -4.867048740386963, + "sft_loss": 3.9469428062438965, + "step": 1880 + }, + { + "epoch": 1.0088643585883927, + "grad_norm": 13.712602815240812, + "learning_rate": 8.39351765269868e-07, + "logits/chosen": -0.09575023502111435, + "logits/rejected": -0.02423889935016632, + "logps/chosen": -3.929525375366211, + "logps/rejected": -4.6153106689453125, + "loss": 0.6128, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.929525375366211, + "rewards/margins": 0.6857854723930359, + "rewards/rejected": -4.6153106689453125, + "sft_loss": 3.9949851036071777, + "step": 1885 + }, + { + "epoch": 1.0115403913697942, + "grad_norm": 15.553912642870277, + "learning_rate": 8.382063451540431e-07, + "logits/chosen": -0.13556435704231262, + "logits/rejected": 0.03311797231435776, + "logps/chosen": -3.945952892303467, + "logps/rejected": -4.7006120681762695, + "loss": 0.5602, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.945952892303467, + "rewards/margins": 0.7546585202217102, + "rewards/rejected": -4.7006120681762695, + "sft_loss": 4.093667030334473, + "step": 1890 + }, + { + "epoch": 1.014216424151196, + "grad_norm": 13.24083869530128, + "learning_rate": 8.370576437587742e-07, + "logits/chosen": -0.04542506858706474, + "logits/rejected": -0.024010324850678444, + "logps/chosen": -3.9064109325408936, + "logps/rejected": -4.599078178405762, + "loss": 0.562, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.9064109325408936, + "rewards/margins": 0.6926669478416443, + "rewards/rejected": -4.599078178405762, + "sft_loss": 3.8920929431915283, + "step": 1895 + }, + { + "epoch": 1.0168924569325974, + "grad_norm": 12.759117673717622, + "learning_rate": 8.359056722287674e-07, + "logits/chosen": -0.15118056535720825, + "logits/rejected": 0.05147156864404678, + "logps/chosen": -4.047581195831299, + "logps/rejected": -4.806788444519043, + "loss": 0.5797, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.047581195831299, + "rewards/margins": 0.7592069506645203, + "rewards/rejected": -4.806788444519043, + "sft_loss": 4.174335956573486, + "step": 1900 + }, + { + "epoch": 1.019568489713999, + "grad_norm": 13.883968917337462, + "learning_rate": 8.347504417404553e-07, + "logits/chosen": -0.10345200449228287, + "logits/rejected": 0.0034005953930318356, + "logps/chosen": -4.144354820251465, + "logps/rejected": -4.889418125152588, + "loss": 0.5943, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.144354820251465, + "rewards/margins": 0.7450634837150574, + "rewards/rejected": -4.889418125152588, + "sft_loss": 4.188509941101074, + "step": 1905 + }, + { + "epoch": 1.0222445224954007, + "grad_norm": 10.731596419107385, + "learning_rate": 8.335919635018893e-07, + "logits/chosen": -0.20840208232402802, + "logits/rejected": -0.11927944421768188, + "logps/chosen": -4.063103675842285, + "logps/rejected": -4.731971263885498, + "loss": 0.5719, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.063103675842285, + "rewards/margins": 0.6688679456710815, + "rewards/rejected": -4.731971263885498, + "sft_loss": 4.155635356903076, + "step": 1910 + }, + { + "epoch": 1.0249205552768021, + "grad_norm": 10.963254128019047, + "learning_rate": 8.324302487526303e-07, + "logits/chosen": -0.18590185046195984, + "logits/rejected": -0.08720795810222626, + "logps/chosen": -4.1862921714782715, + "logps/rejected": -4.918190956115723, + "loss": 0.5674, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.1862921714782715, + "rewards/margins": 0.7318993806838989, + "rewards/rejected": -4.918190956115723, + "sft_loss": 4.248475074768066, + "step": 1915 + }, + { + "epoch": 1.0275965880582036, + "grad_norm": 11.916215877231483, + "learning_rate": 8.312653087636398e-07, + "logits/chosen": -0.1482846736907959, + "logits/rejected": -0.08977895975112915, + "logps/chosen": -4.116065502166748, + "logps/rejected": -4.904284477233887, + "loss": 0.5654, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.116065502166748, + "rewards/margins": 0.7882182002067566, + "rewards/rejected": -4.904284477233887, + "sft_loss": 4.303043842315674, + "step": 1920 + }, + { + "epoch": 1.0302726208396054, + "grad_norm": 18.505577200312903, + "learning_rate": 8.300971548371711e-07, + "logits/chosen": -0.2550107538700104, + "logits/rejected": -0.09487710893154144, + "logps/chosen": -4.230193138122559, + "logps/rejected": -4.9468584060668945, + "loss": 0.5762, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.230193138122559, + "rewards/margins": 0.7166647911071777, + "rewards/rejected": -4.9468584060668945, + "sft_loss": 4.287786960601807, + "step": 1925 + }, + { + "epoch": 1.0329486536210069, + "grad_norm": 15.467327289614014, + "learning_rate": 8.289257983066582e-07, + "logits/chosen": -0.2060461938381195, + "logits/rejected": -0.09747409075498581, + "logps/chosen": -4.046446800231934, + "logps/rejected": -4.801242828369141, + "loss": 0.5648, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.046446800231934, + "rewards/margins": 0.7547962069511414, + "rewards/rejected": -4.801242828369141, + "sft_loss": 4.178564548492432, + "step": 1930 + }, + { + "epoch": 1.0356246864024083, + "grad_norm": 14.620059094646697, + "learning_rate": 8.277512505366077e-07, + "logits/chosen": -0.23246657848358154, + "logits/rejected": -0.07202459126710892, + "logps/chosen": -4.032923698425293, + "logps/rejected": -4.800480842590332, + "loss": 0.5747, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.032923698425293, + "rewards/margins": 0.7675572633743286, + "rewards/rejected": -4.800480842590332, + "sft_loss": 4.033566951751709, + "step": 1935 + }, + { + "epoch": 1.03830071918381, + "grad_norm": 15.895269581813086, + "learning_rate": 8.265735229224868e-07, + "logits/chosen": -0.16594497859477997, + "logits/rejected": -0.07534348219633102, + "logps/chosen": -3.938028335571289, + "logps/rejected": -4.824290752410889, + "loss": 0.5395, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.938028335571289, + "rewards/margins": 0.8862627744674683, + "rewards/rejected": -4.824290752410889, + "sft_loss": 3.8248863220214844, + "step": 1940 + }, + { + "epoch": 1.0409767519652116, + "grad_norm": 11.005947997620881, + "learning_rate": 8.253926268906144e-07, + "logits/chosen": -0.2246125191450119, + "logits/rejected": -0.10327408462762833, + "logps/chosen": -3.947735548019409, + "logps/rejected": -4.890450477600098, + "loss": 0.5051, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.947735548019409, + "rewards/margins": 0.942714512348175, + "rewards/rejected": -4.890450477600098, + "sft_loss": 3.9465606212615967, + "step": 1945 + }, + { + "epoch": 1.043652784746613, + "grad_norm": 13.930596343210329, + "learning_rate": 8.242085738980487e-07, + "logits/chosen": -0.1279004067182541, + "logits/rejected": 0.03352883830666542, + "logps/chosen": -4.045596122741699, + "logps/rejected": -4.859705924987793, + "loss": 0.5664, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.045596122741699, + "rewards/margins": 0.814110279083252, + "rewards/rejected": -4.859705924987793, + "sft_loss": 4.0374040603637695, + "step": 1950 + }, + { + "epoch": 1.0463288175280148, + "grad_norm": 16.839294295183258, + "learning_rate": 8.230213754324772e-07, + "logits/chosen": -0.16892042756080627, + "logits/rejected": -0.10787680000066757, + "logps/chosen": -4.0660200119018555, + "logps/rejected": -4.787367820739746, + "loss": 0.5484, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.0660200119018555, + "rewards/margins": 0.7213469743728638, + "rewards/rejected": -4.787367820739746, + "sft_loss": 4.065528869628906, + "step": 1955 + }, + { + "epoch": 1.0490048503094163, + "grad_norm": 17.413554977238082, + "learning_rate": 8.218310430121045e-07, + "logits/chosen": -0.1649189293384552, + "logits/rejected": -0.13256001472473145, + "logps/chosen": -4.049285411834717, + "logps/rejected": -4.7529191970825195, + "loss": 0.5938, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.049285411834717, + "rewards/margins": 0.7036335468292236, + "rewards/rejected": -4.7529191970825195, + "sft_loss": 4.0807366371154785, + "step": 1960 + }, + { + "epoch": 1.051680883090818, + "grad_norm": 13.4636615396051, + "learning_rate": 8.20637588185541e-07, + "logits/chosen": -0.12577806413173676, + "logits/rejected": -0.05988551303744316, + "logps/chosen": -4.122179985046387, + "logps/rejected": -5.196074485778809, + "loss": 0.4992, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.122179985046387, + "rewards/margins": 1.0738941431045532, + "rewards/rejected": -5.196074485778809, + "sft_loss": 4.1924967765808105, + "step": 1965 + }, + { + "epoch": 1.0543569158722195, + "grad_norm": 14.840492992172235, + "learning_rate": 8.194410225316906e-07, + "logits/chosen": -0.18502992391586304, + "logits/rejected": -0.07532224804162979, + "logps/chosen": -4.094423770904541, + "logps/rejected": -4.795498847961426, + "loss": 0.5868, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.094423770904541, + "rewards/margins": 0.7010759115219116, + "rewards/rejected": -4.795498847961426, + "sft_loss": 4.141739845275879, + "step": 1970 + }, + { + "epoch": 1.057032948653621, + "grad_norm": 14.625104911300314, + "learning_rate": 8.182413576596385e-07, + "logits/chosen": -0.027142172679305077, + "logits/rejected": 0.014019886963069439, + "logps/chosen": -4.086574554443359, + "logps/rejected": -4.903563499450684, + "loss": 0.5701, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.086574554443359, + "rewards/margins": 0.8169891238212585, + "rewards/rejected": -4.903563499450684, + "sft_loss": 4.168776035308838, + "step": 1975 + }, + { + "epoch": 1.0597089814350227, + "grad_norm": 17.926558844975595, + "learning_rate": 8.170386052085389e-07, + "logits/chosen": -0.07390134036540985, + "logits/rejected": 0.005565158557146788, + "logps/chosen": -4.23164176940918, + "logps/rejected": -5.028608322143555, + "loss": 0.5973, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.23164176940918, + "rewards/margins": 0.796966016292572, + "rewards/rejected": -5.028608322143555, + "sft_loss": 4.260631084442139, + "step": 1980 + }, + { + "epoch": 1.0623850142164242, + "grad_norm": 15.960753170538107, + "learning_rate": 8.158327768475008e-07, + "logits/chosen": -0.10266276448965073, + "logits/rejected": 0.004123342223465443, + "logps/chosen": -4.1737165451049805, + "logps/rejected": -4.8127055168151855, + "loss": 0.6381, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.1737165451049805, + "rewards/margins": 0.6389890909194946, + "rewards/rejected": -4.8127055168151855, + "sft_loss": 4.173183441162109, + "step": 1985 + }, + { + "epoch": 1.0650610469978257, + "grad_norm": 19.160626139055477, + "learning_rate": 8.146238842754767e-07, + "logits/chosen": -0.15659977495670319, + "logits/rejected": -0.07845531404018402, + "logps/chosen": -4.23665714263916, + "logps/rejected": -4.884160041809082, + "loss": 0.6013, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.23665714263916, + "rewards/margins": 0.6475027799606323, + "rewards/rejected": -4.884160041809082, + "sft_loss": 4.255043983459473, + "step": 1990 + }, + { + "epoch": 1.0677370797792274, + "grad_norm": 21.29535366361702, + "learning_rate": 8.134119392211476e-07, + "logits/chosen": -0.08475625514984131, + "logits/rejected": 0.04252525418996811, + "logps/chosen": -4.185021877288818, + "logps/rejected": -5.140681266784668, + "loss": 0.5477, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.185021877288818, + "rewards/margins": 0.9556596875190735, + "rewards/rejected": -5.140681266784668, + "sft_loss": 4.271013259887695, + "step": 1995 + }, + { + "epoch": 1.0704131125606289, + "grad_norm": 18.12803368124785, + "learning_rate": 8.121969534428094e-07, + "logits/chosen": -0.19395044445991516, + "logits/rejected": -0.06716791540384293, + "logps/chosen": -4.197428226470947, + "logps/rejected": -4.901777744293213, + "loss": 0.6187, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.197428226470947, + "rewards/margins": 0.7043498158454895, + "rewards/rejected": -4.901777744293213, + "sft_loss": 4.247593402862549, + "step": 2000 + }, + { + "epoch": 1.0704131125606289, + "eval_logits/chosen": 0.0695294588804245, + "eval_logits/rejected": 0.14971204102039337, + "eval_logps/chosen": -4.1031975746154785, + "eval_logps/rejected": -4.878896236419678, + "eval_loss": 0.5857792496681213, + "eval_rewards/accuracies": 0.715133547782898, + "eval_rewards/chosen": -4.1031975746154785, + "eval_rewards/margins": 0.7756983637809753, + "eval_rewards/rejected": -4.878896236419678, + "eval_runtime": 42.9964, + "eval_samples_per_second": 31.282, + "eval_sft_loss": 4.131138801574707, + "eval_steps_per_second": 7.838, + "step": 2000 + }, + { + "epoch": 1.0730891453420304, + "grad_norm": 17.05338147800005, + "learning_rate": 8.109789387282599e-07, + "logits/chosen": -0.14190088212490082, + "logits/rejected": -0.09127983450889587, + "logps/chosen": -4.014540672302246, + "logps/rejected": -4.691771507263184, + "loss": 0.6169, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.014540672302246, + "rewards/margins": 0.6772304177284241, + "rewards/rejected": -4.691771507263184, + "sft_loss": 4.064333438873291, + "step": 2005 + }, + { + "epoch": 1.075765178123432, + "grad_norm": 19.87084053942476, + "learning_rate": 8.097579068946827e-07, + "logits/chosen": -0.11133239418268204, + "logits/rejected": -0.011872517876327038, + "logps/chosen": -3.922724485397339, + "logps/rejected": -4.658257961273193, + "loss": 0.5662, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.922724485397339, + "rewards/margins": 0.7355332970619202, + "rewards/rejected": -4.658257961273193, + "sft_loss": 3.9572842121124268, + "step": 2010 + }, + { + "epoch": 1.0784412109048336, + "grad_norm": 15.774688448078022, + "learning_rate": 8.085338697885344e-07, + "logits/chosen": -0.09978102147579193, + "logits/rejected": 0.0001321844756603241, + "logps/chosen": -3.9842612743377686, + "logps/rejected": -4.7468109130859375, + "loss": 0.5523, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.9842612743377686, + "rewards/margins": 0.7625495791435242, + "rewards/rejected": -4.7468109130859375, + "sft_loss": 3.9912009239196777, + "step": 2015 + }, + { + "epoch": 1.081117243686235, + "grad_norm": 14.969559946626333, + "learning_rate": 8.073068392854282e-07, + "logits/chosen": -0.20114025473594666, + "logits/rejected": -0.046921707689762115, + "logps/chosen": -4.055922508239746, + "logps/rejected": -4.940064430236816, + "loss": 0.5027, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.055922508239746, + "rewards/margins": 0.884142279624939, + "rewards/rejected": -4.940064430236816, + "sft_loss": 4.046994686126709, + "step": 2020 + }, + { + "epoch": 1.0837932764676368, + "grad_norm": 13.207095475491212, + "learning_rate": 8.060768272900193e-07, + "logits/chosen": -0.0987638384103775, + "logits/rejected": 0.012016674503684044, + "logps/chosen": -4.053750038146973, + "logps/rejected": -4.846047401428223, + "loss": 0.5785, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.053750038146973, + "rewards/margins": 0.7922976613044739, + "rewards/rejected": -4.846047401428223, + "sft_loss": 4.142302513122559, + "step": 2025 + }, + { + "epoch": 1.0864693092490383, + "grad_norm": 11.154188596460584, + "learning_rate": 8.0484384573589e-07, + "logits/chosen": -0.19130758941173553, + "logits/rejected": -0.14506961405277252, + "logps/chosen": -3.9372031688690186, + "logps/rejected": -4.668862342834473, + "loss": 0.5785, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.9372031688690186, + "rewards/margins": 0.7316591143608093, + "rewards/rejected": -4.668862342834473, + "sft_loss": 3.939913511276245, + "step": 2030 + }, + { + "epoch": 1.0891453420304398, + "grad_norm": 17.533100647290997, + "learning_rate": 8.03607906585432e-07, + "logits/chosen": -0.16748812794685364, + "logits/rejected": -0.051542095839977264, + "logps/chosen": -4.124471187591553, + "logps/rejected": -4.820754051208496, + "loss": 0.6085, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.124471187591553, + "rewards/margins": 0.6962825059890747, + "rewards/rejected": -4.820754051208496, + "sft_loss": 4.2142438888549805, + "step": 2035 + }, + { + "epoch": 1.0918213748118415, + "grad_norm": 26.039351885496657, + "learning_rate": 8.023690218297329e-07, + "logits/chosen": -0.24009211361408234, + "logits/rejected": -0.18795573711395264, + "logps/chosen": -4.091917514801025, + "logps/rejected": -4.927672386169434, + "loss": 0.5537, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.091917514801025, + "rewards/margins": 0.8357549905776978, + "rewards/rejected": -4.927672386169434, + "sft_loss": 4.069763660430908, + "step": 2040 + }, + { + "epoch": 1.094497407593243, + "grad_norm": 18.31897348268519, + "learning_rate": 8.01127203488458e-07, + "logits/chosen": -0.155735582113266, + "logits/rejected": -0.10973642021417618, + "logps/chosen": -4.329349040985107, + "logps/rejected": -5.011044025421143, + "loss": 0.589, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.329349040985107, + "rewards/margins": 0.6816949844360352, + "rewards/rejected": -5.011044025421143, + "sft_loss": 4.351229667663574, + "step": 2045 + }, + { + "epoch": 1.0971734403746445, + "grad_norm": 20.184438837605725, + "learning_rate": 7.998824636097339e-07, + "logits/chosen": -0.18872496485710144, + "logits/rejected": -0.08053840696811676, + "logps/chosen": -4.147758483886719, + "logps/rejected": -4.978602409362793, + "loss": 0.5597, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.147758483886719, + "rewards/margins": 0.8308441042900085, + "rewards/rejected": -4.978602409362793, + "sft_loss": 4.277966499328613, + "step": 2050 + }, + { + "epoch": 1.0998494731560462, + "grad_norm": 20.94320682853988, + "learning_rate": 7.986348142700328e-07, + "logits/chosen": -0.15364764630794525, + "logits/rejected": -0.024008695036172867, + "logps/chosen": -4.2366251945495605, + "logps/rejected": -5.171116352081299, + "loss": 0.5549, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.2366251945495605, + "rewards/margins": 0.934490978717804, + "rewards/rejected": -5.171116352081299, + "sft_loss": 4.434886932373047, + "step": 2055 + }, + { + "epoch": 1.1025255059374477, + "grad_norm": 22.250093128795292, + "learning_rate": 7.973842675740539e-07, + "logits/chosen": -0.14576201140880585, + "logits/rejected": -0.08288852870464325, + "logps/chosen": -4.201880931854248, + "logps/rejected": -5.092720985412598, + "loss": 0.5352, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.201880931854248, + "rewards/margins": 0.8908407092094421, + "rewards/rejected": -5.092720985412598, + "sft_loss": 4.3396759033203125, + "step": 2060 + }, + { + "epoch": 1.1052015387188494, + "grad_norm": 21.242304741827414, + "learning_rate": 7.961308356546066e-07, + "logits/chosen": -0.18846619129180908, + "logits/rejected": -0.06018044799566269, + "logps/chosen": -4.2880449295043945, + "logps/rejected": -5.2668890953063965, + "loss": 0.5225, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.2880449295043945, + "rewards/margins": 0.9788439869880676, + "rewards/rejected": -5.2668890953063965, + "sft_loss": 4.337084770202637, + "step": 2065 + }, + { + "epoch": 1.107877571500251, + "grad_norm": 19.31570145078924, + "learning_rate": 7.948745306724931e-07, + "logits/chosen": -0.10848329961299896, + "logits/rejected": 0.021837735548615456, + "logps/chosen": -4.196038246154785, + "logps/rejected": -5.2210283279418945, + "loss": 0.4841, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.196038246154785, + "rewards/margins": 1.024989128112793, + "rewards/rejected": -5.2210283279418945, + "sft_loss": 4.274868965148926, + "step": 2070 + }, + { + "epoch": 1.1105536042816524, + "grad_norm": 22.94013542586667, + "learning_rate": 7.936153648163897e-07, + "logits/chosen": -0.18677277863025665, + "logits/rejected": -0.09163932502269745, + "logps/chosen": -4.378596782684326, + "logps/rejected": -5.164307117462158, + "loss": 0.5738, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.378596782684326, + "rewards/margins": 0.7857100963592529, + "rewards/rejected": -5.164307117462158, + "sft_loss": 4.5541205406188965, + "step": 2075 + }, + { + "epoch": 1.1132296370630541, + "grad_norm": 17.167234712434464, + "learning_rate": 7.92353350302729e-07, + "logits/chosen": -0.21692046523094177, + "logits/rejected": -0.08329122513532639, + "logps/chosen": -4.143950462341309, + "logps/rejected": -5.008641242980957, + "loss": 0.5503, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.143950462341309, + "rewards/margins": 0.8646903038024902, + "rewards/rejected": -5.008641242980957, + "sft_loss": 4.27891206741333, + "step": 2080 + }, + { + "epoch": 1.1159056698444556, + "grad_norm": 19.34770237312849, + "learning_rate": 7.910884993755816e-07, + "logits/chosen": -0.16237115859985352, + "logits/rejected": -0.08700194954872131, + "logps/chosen": -4.205402374267578, + "logps/rejected": -5.136702537536621, + "loss": 0.5444, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.205402374267578, + "rewards/margins": 0.9313004612922668, + "rewards/rejected": -5.136702537536621, + "sft_loss": 4.245094299316406, + "step": 2085 + }, + { + "epoch": 1.118581702625857, + "grad_norm": 17.22780023928489, + "learning_rate": 7.898208243065367e-07, + "logits/chosen": -0.23272278904914856, + "logits/rejected": -0.21367502212524414, + "logps/chosen": -4.102339744567871, + "logps/rejected": -4.745957374572754, + "loss": 0.5998, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.102339744567871, + "rewards/margins": 0.6436169743537903, + "rewards/rejected": -4.745957374572754, + "sft_loss": 4.200228691101074, + "step": 2090 + }, + { + "epoch": 1.1212577354072588, + "grad_norm": 17.10567437014088, + "learning_rate": 7.88550337394583e-07, + "logits/chosen": -0.22676539421081543, + "logits/rejected": -0.12468956410884857, + "logps/chosen": -4.255773067474365, + "logps/rejected": -4.978231430053711, + "loss": 0.5951, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.255773067474365, + "rewards/margins": 0.722458004951477, + "rewards/rejected": -4.978231430053711, + "sft_loss": 4.269195079803467, + "step": 2095 + }, + { + "epoch": 1.1239337681886603, + "grad_norm": 20.0998741552246, + "learning_rate": 7.872770509659905e-07, + "logits/chosen": -0.16979366540908813, + "logits/rejected": -0.14117419719696045, + "logps/chosen": -4.149965763092041, + "logps/rejected": -4.847676753997803, + "loss": 0.5978, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.149965763092041, + "rewards/margins": 0.6977112293243408, + "rewards/rejected": -4.847676753997803, + "sft_loss": 4.136035919189453, + "step": 2100 + }, + { + "epoch": 1.1266098009700618, + "grad_norm": 12.754923904543626, + "learning_rate": 7.860009773741896e-07, + "logits/chosen": -0.12392155826091766, + "logits/rejected": -0.01042869407683611, + "logps/chosen": -3.959754467010498, + "logps/rejected": -4.914425849914551, + "loss": 0.5009, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.959754467010498, + "rewards/margins": 0.9546712636947632, + "rewards/rejected": -4.914425849914551, + "sft_loss": 3.9639930725097656, + "step": 2105 + }, + { + "epoch": 1.1292858337514635, + "grad_norm": 17.556772490752582, + "learning_rate": 7.84722128999652e-07, + "logits/chosen": -0.1658501923084259, + "logits/rejected": -0.06590066850185394, + "logps/chosen": -4.020123481750488, + "logps/rejected": -4.954275131225586, + "loss": 0.5673, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.020123481750488, + "rewards/margins": 0.9341517686843872, + "rewards/rejected": -4.954275131225586, + "sft_loss": 4.090924263000488, + "step": 2110 + }, + { + "epoch": 1.131961866532865, + "grad_norm": 16.578758847117033, + "learning_rate": 7.834405182497699e-07, + "logits/chosen": -0.10159929096698761, + "logits/rejected": -0.06274596601724625, + "logps/chosen": -4.219315052032471, + "logps/rejected": -5.10249137878418, + "loss": 0.5599, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.219315052032471, + "rewards/margins": 0.883176326751709, + "rewards/rejected": -5.10249137878418, + "sft_loss": 4.273227691650391, + "step": 2115 + }, + { + "epoch": 1.1346378993142665, + "grad_norm": 16.2793904905083, + "learning_rate": 7.821561575587368e-07, + "logits/chosen": -0.19755463302135468, + "logits/rejected": -0.15835200250148773, + "logps/chosen": -4.155510425567627, + "logps/rejected": -4.86837100982666, + "loss": 0.5639, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.155510425567627, + "rewards/margins": 0.7128608822822571, + "rewards/rejected": -4.86837100982666, + "sft_loss": 4.283026695251465, + "step": 2120 + }, + { + "epoch": 1.1373139320956682, + "grad_norm": 14.055977009848089, + "learning_rate": 7.808690593874254e-07, + "logits/chosen": -0.18170380592346191, + "logits/rejected": -0.10592323541641235, + "logps/chosen": -4.295332431793213, + "logps/rejected": -5.149449348449707, + "loss": 0.5547, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.295332431793213, + "rewards/margins": 0.8541167974472046, + "rewards/rejected": -5.149449348449707, + "sft_loss": 4.373181343078613, + "step": 2125 + }, + { + "epoch": 1.1399899648770697, + "grad_norm": 17.30927241380946, + "learning_rate": 7.79579236223268e-07, + "logits/chosen": -0.11262079328298569, + "logits/rejected": 0.050410158932209015, + "logps/chosen": -4.156128883361816, + "logps/rejected": -5.0874128341674805, + "loss": 0.5383, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.156128883361816, + "rewards/margins": 0.9312840700149536, + "rewards/rejected": -5.0874128341674805, + "sft_loss": 4.226848602294922, + "step": 2130 + }, + { + "epoch": 1.1426659976584714, + "grad_norm": 19.103225819609648, + "learning_rate": 7.782867005801346e-07, + "logits/chosen": -0.14864888787269592, + "logits/rejected": 0.01239332277327776, + "logps/chosen": -3.9819626808166504, + "logps/rejected": -4.8957133293151855, + "loss": 0.5437, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.9819626808166504, + "rewards/margins": 0.9137503504753113, + "rewards/rejected": -4.8957133293151855, + "sft_loss": 3.9962246417999268, + "step": 2135 + }, + { + "epoch": 1.145342030439873, + "grad_norm": 22.11739512358915, + "learning_rate": 7.769914649982117e-07, + "logits/chosen": -0.12412190437316895, + "logits/rejected": -0.02263473905622959, + "logps/chosen": -3.9432873725891113, + "logps/rejected": -4.75213098526001, + "loss": 0.57, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.9432873725891113, + "rewards/margins": 0.8088437914848328, + "rewards/rejected": -4.75213098526001, + "sft_loss": 4.006190776824951, + "step": 2140 + }, + { + "epoch": 1.1480180632212744, + "grad_norm": 14.002264234409363, + "learning_rate": 7.756935420438803e-07, + "logits/chosen": -0.11869798600673676, + "logits/rejected": -0.03148897737264633, + "logps/chosen": -3.889836549758911, + "logps/rejected": -4.979212760925293, + "loss": 0.5115, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.889836549758911, + "rewards/margins": 1.08937668800354, + "rewards/rejected": -4.979212760925293, + "sft_loss": 3.909101963043213, + "step": 2145 + }, + { + "epoch": 1.1506940960026761, + "grad_norm": 15.202302167744714, + "learning_rate": 7.743929443095951e-07, + "logits/chosen": -0.15758922696113586, + "logits/rejected": -0.09596370160579681, + "logps/chosen": -4.030735015869141, + "logps/rejected": -4.90773344039917, + "loss": 0.5131, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.030735015869141, + "rewards/margins": 0.8769989013671875, + "rewards/rejected": -4.90773344039917, + "sft_loss": 3.9965927600860596, + "step": 2150 + }, + { + "epoch": 1.1533701287840776, + "grad_norm": 18.19913795403612, + "learning_rate": 7.730896844137609e-07, + "logits/chosen": -0.12692536413669586, + "logits/rejected": -0.056444037705659866, + "logps/chosen": -4.315699577331543, + "logps/rejected": -4.95998477935791, + "loss": 0.6302, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.315699577331543, + "rewards/margins": 0.6442848443984985, + "rewards/rejected": -4.95998477935791, + "sft_loss": 4.314980506896973, + "step": 2155 + }, + { + "epoch": 1.1560461615654791, + "grad_norm": 17.004668760738433, + "learning_rate": 7.717837750006106e-07, + "logits/chosen": -0.12733769416809082, + "logits/rejected": -0.060163237154483795, + "logps/chosen": -4.17328405380249, + "logps/rejected": -5.150243282318115, + "loss": 0.5492, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.17328405380249, + "rewards/margins": 0.976959228515625, + "rewards/rejected": -5.150243282318115, + "sft_loss": 4.155553817749023, + "step": 2160 + }, + { + "epoch": 1.1587221943468808, + "grad_norm": 18.08265422362301, + "learning_rate": 7.704752287400832e-07, + "logits/chosen": -0.1178988590836525, + "logits/rejected": 0.03894580155611038, + "logps/chosen": -4.1134514808654785, + "logps/rejected": -5.065721035003662, + "loss": 0.5513, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.1134514808654785, + "rewards/margins": 0.9522699117660522, + "rewards/rejected": -5.065721035003662, + "sft_loss": 4.160314083099365, + "step": 2165 + }, + { + "epoch": 1.1613982271282823, + "grad_norm": 10.545918570379506, + "learning_rate": 7.691640583277004e-07, + "logits/chosen": -0.09397809207439423, + "logits/rejected": 0.013970824889838696, + "logps/chosen": -4.077818870544434, + "logps/rejected": -5.0350213050842285, + "loss": 0.5459, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.077818870544434, + "rewards/margins": 0.9572019577026367, + "rewards/rejected": -5.0350213050842285, + "sft_loss": 4.102574825286865, + "step": 2170 + }, + { + "epoch": 1.1640742599096838, + "grad_norm": 13.748822446211006, + "learning_rate": 7.678502764844433e-07, + "logits/chosen": -0.18362076580524445, + "logits/rejected": -0.025226274505257607, + "logps/chosen": -4.079222679138184, + "logps/rejected": -4.883854389190674, + "loss": 0.562, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.079222679138184, + "rewards/margins": 0.8046321868896484, + "rewards/rejected": -4.883854389190674, + "sft_loss": 4.105748176574707, + "step": 2175 + }, + { + "epoch": 1.1667502926910855, + "grad_norm": 17.97959176978472, + "learning_rate": 7.665338959566288e-07, + "logits/chosen": -0.13779087364673615, + "logits/rejected": -0.06253369152545929, + "logps/chosen": -4.060021877288818, + "logps/rejected": -5.039783954620361, + "loss": 0.5132, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.060021877288818, + "rewards/margins": 0.9797617793083191, + "rewards/rejected": -5.039783954620361, + "sft_loss": 4.161438941955566, + "step": 2180 + }, + { + "epoch": 1.169426325472487, + "grad_norm": 17.599251235883298, + "learning_rate": 7.652149295157868e-07, + "logits/chosen": -0.06856133043766022, + "logits/rejected": 0.05627395957708359, + "logps/chosen": -4.204378604888916, + "logps/rejected": -4.9699811935424805, + "loss": 0.5685, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.204378604888916, + "rewards/margins": 0.7656024694442749, + "rewards/rejected": -4.9699811935424805, + "sft_loss": 4.238471031188965, + "step": 2185 + }, + { + "epoch": 1.1721023582538885, + "grad_norm": 17.30284692112705, + "learning_rate": 7.638933899585354e-07, + "logits/chosen": -0.033752650022506714, + "logits/rejected": -0.008529474027454853, + "logps/chosen": -4.111490249633789, + "logps/rejected": -5.000385284423828, + "loss": 0.5555, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.111490249633789, + "rewards/margins": 0.8888949155807495, + "rewards/rejected": -5.000385284423828, + "sft_loss": 4.2806501388549805, + "step": 2190 + }, + { + "epoch": 1.1747783910352902, + "grad_norm": 16.467697801821306, + "learning_rate": 7.625692901064573e-07, + "logits/chosen": -0.11014772951602936, + "logits/rejected": -0.02117350697517395, + "logps/chosen": -4.215609550476074, + "logps/rejected": -5.145616054534912, + "loss": 0.5854, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.215609550476074, + "rewards/margins": 0.9300066232681274, + "rewards/rejected": -5.145616054534912, + "sft_loss": 4.365790367126465, + "step": 2195 + }, + { + "epoch": 1.1774544238166917, + "grad_norm": 15.112508542730295, + "learning_rate": 7.61242642805975e-07, + "logits/chosen": -0.10207013040781021, + "logits/rejected": -0.10883428156375885, + "logps/chosen": -4.202320575714111, + "logps/rejected": -5.06498384475708, + "loss": 0.5486, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.202320575714111, + "rewards/margins": 0.8626632690429688, + "rewards/rejected": -5.06498384475708, + "sft_loss": 4.304457664489746, + "step": 2200 + }, + { + "epoch": 1.1801304565980932, + "grad_norm": 20.42638434211248, + "learning_rate": 7.599134609282266e-07, + "logits/chosen": -0.16497863829135895, + "logits/rejected": -0.026081090793013573, + "logps/chosen": -4.316076755523682, + "logps/rejected": -5.126869201660156, + "loss": 0.5781, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.316076755523682, + "rewards/margins": 0.8107931017875671, + "rewards/rejected": -5.126869201660156, + "sft_loss": 4.403500556945801, + "step": 2205 + }, + { + "epoch": 1.182806489379495, + "grad_norm": 21.271489259140914, + "learning_rate": 7.585817573689402e-07, + "logits/chosen": -0.17145070433616638, + "logits/rejected": -0.0644187331199646, + "logps/chosen": -4.0592265129089355, + "logps/rejected": -5.11184024810791, + "loss": 0.4993, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.0592265129089355, + "rewards/margins": 1.052613615989685, + "rewards/rejected": -5.11184024810791, + "sft_loss": 4.157874584197998, + "step": 2210 + }, + { + "epoch": 1.1854825221608964, + "grad_norm": 16.32211937915042, + "learning_rate": 7.572475450483098e-07, + "logits/chosen": -0.158425971865654, + "logits/rejected": -0.09455561637878418, + "logps/chosen": -4.253739833831787, + "logps/rejected": -5.023743629455566, + "loss": 0.587, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.253739833831787, + "rewards/margins": 0.7700031995773315, + "rewards/rejected": -5.023743629455566, + "sft_loss": 4.278241157531738, + "step": 2215 + }, + { + "epoch": 1.188158554942298, + "grad_norm": 17.160418022349273, + "learning_rate": 7.559108369108689e-07, + "logits/chosen": -0.18763625621795654, + "logits/rejected": -0.08890589326620102, + "logps/chosen": -4.00942325592041, + "logps/rejected": -4.843209266662598, + "loss": 0.5768, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.00942325592041, + "rewards/margins": 0.833785891532898, + "rewards/rejected": -4.843209266662598, + "sft_loss": 4.075999736785889, + "step": 2220 + }, + { + "epoch": 1.1908345877236997, + "grad_norm": 12.458562499903586, + "learning_rate": 7.54571645925366e-07, + "logits/chosen": -0.20564058423042297, + "logits/rejected": 0.01137080229818821, + "logps/chosen": -3.936244487762451, + "logps/rejected": -5.025080680847168, + "loss": 0.4974, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.936244487762451, + "rewards/margins": 1.088836908340454, + "rewards/rejected": -5.025080680847168, + "sft_loss": 4.006060600280762, + "step": 2225 + }, + { + "epoch": 1.1935106205051011, + "grad_norm": 16.745086145900924, + "learning_rate": 7.532299850846378e-07, + "logits/chosen": -0.18450811505317688, + "logits/rejected": -0.04890236631035805, + "logps/chosen": -3.865755558013916, + "logps/rejected": -4.929928779602051, + "loss": 0.5334, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.865755558013916, + "rewards/margins": 1.0641729831695557, + "rewards/rejected": -4.929928779602051, + "sft_loss": 3.895325183868408, + "step": 2230 + }, + { + "epoch": 1.1961866532865026, + "grad_norm": 29.180103058843205, + "learning_rate": 7.518858674054838e-07, + "logits/chosen": -0.14506229758262634, + "logits/rejected": 0.006264629773795605, + "logps/chosen": -3.9540493488311768, + "logps/rejected": -4.871550559997559, + "loss": 0.5514, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.9540493488311768, + "rewards/margins": 0.9175008535385132, + "rewards/rejected": -4.871550559997559, + "sft_loss": 3.948908567428589, + "step": 2235 + }, + { + "epoch": 1.1988626860679044, + "grad_norm": 16.713002193962918, + "learning_rate": 7.505393059285394e-07, + "logits/chosen": -0.13637515902519226, + "logits/rejected": 0.00962438527494669, + "logps/chosen": -4.014534950256348, + "logps/rejected": -4.793184757232666, + "loss": 0.5942, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.014534950256348, + "rewards/margins": 0.7786496877670288, + "rewards/rejected": -4.793184757232666, + "sft_loss": 4.10478401184082, + "step": 2240 + }, + { + "epoch": 1.2015387188493059, + "grad_norm": 22.288147043431373, + "learning_rate": 7.491903137181501e-07, + "logits/chosen": -0.10160605609416962, + "logits/rejected": -0.07597693055868149, + "logps/chosen": -3.868107318878174, + "logps/rejected": -4.712255954742432, + "loss": 0.5491, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.868107318878174, + "rewards/margins": 0.8441489338874817, + "rewards/rejected": -4.712255954742432, + "sft_loss": 3.9633774757385254, + "step": 2245 + }, + { + "epoch": 1.2042147516307076, + "grad_norm": 15.5423927398474, + "learning_rate": 7.478389038622441e-07, + "logits/chosen": -0.05929984897375107, + "logits/rejected": -0.025438731536269188, + "logps/chosen": -3.977214813232422, + "logps/rejected": -4.813974380493164, + "loss": 0.5684, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.977214813232422, + "rewards/margins": 0.8367594480514526, + "rewards/rejected": -4.813974380493164, + "sft_loss": 4.0244035720825195, + "step": 2250 + }, + { + "epoch": 1.206890784412109, + "grad_norm": 23.989756074132227, + "learning_rate": 7.46485089472206e-07, + "logits/chosen": -0.0868724137544632, + "logits/rejected": -0.020146001130342484, + "logps/chosen": -3.998081684112549, + "logps/rejected": -4.8043036460876465, + "loss": 0.5908, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.998081684112549, + "rewards/margins": 0.8062219619750977, + "rewards/rejected": -4.8043036460876465, + "sft_loss": 4.047082424163818, + "step": 2255 + }, + { + "epoch": 1.2095668171935106, + "grad_norm": 15.013028141264995, + "learning_rate": 7.451288836827487e-07, + "logits/chosen": -0.07611582428216934, + "logits/rejected": -0.07014383375644684, + "logps/chosen": -3.9512946605682373, + "logps/rejected": -4.656590461730957, + "loss": 0.5899, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.9512946605682373, + "rewards/margins": 0.7052956223487854, + "rewards/rejected": -4.656590461730957, + "sft_loss": 4.055122375488281, + "step": 2260 + }, + { + "epoch": 1.2122428499749123, + "grad_norm": 12.295896409912185, + "learning_rate": 7.437702996517869e-07, + "logits/chosen": -0.11540064960718155, + "logits/rejected": -0.031953103840351105, + "logps/chosen": -3.908743381500244, + "logps/rejected": -4.824851989746094, + "loss": 0.5311, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.908743381500244, + "rewards/margins": 0.916108250617981, + "rewards/rejected": -4.824851989746094, + "sft_loss": 4.039055824279785, + "step": 2265 + }, + { + "epoch": 1.2149188827563138, + "grad_norm": 20.32458374752934, + "learning_rate": 7.424093505603087e-07, + "logits/chosen": -0.19283311069011688, + "logits/rejected": -0.04803388565778732, + "logps/chosen": -3.9939074516296387, + "logps/rejected": -4.994405269622803, + "loss": 0.5022, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.9939074516296387, + "rewards/margins": 1.000497579574585, + "rewards/rejected": -4.994405269622803, + "sft_loss": 4.019697666168213, + "step": 2270 + }, + { + "epoch": 1.2175949155377153, + "grad_norm": 14.210381238979735, + "learning_rate": 7.410460496122482e-07, + "logits/chosen": -0.14989027380943298, + "logits/rejected": -0.026964152231812477, + "logps/chosen": -3.871859073638916, + "logps/rejected": -4.951968193054199, + "loss": 0.4942, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.871859073638916, + "rewards/margins": 1.0801092386245728, + "rewards/rejected": -4.951968193054199, + "sft_loss": 3.885855197906494, + "step": 2275 + }, + { + "epoch": 1.220270948319117, + "grad_norm": 18.383099955253005, + "learning_rate": 7.396804100343572e-07, + "logits/chosen": -0.14752139151096344, + "logits/rejected": 0.008709174580872059, + "logps/chosen": -3.8431155681610107, + "logps/rejected": -4.750420570373535, + "loss": 0.5279, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.8431155681610107, + "rewards/margins": 0.9073052406311035, + "rewards/rejected": -4.750420570373535, + "sft_loss": 3.9268157482147217, + "step": 2280 + }, + { + "epoch": 1.2229469811005185, + "grad_norm": 11.770440842002662, + "learning_rate": 7.383124450760768e-07, + "logits/chosen": -0.13080976903438568, + "logits/rejected": 0.026985500007867813, + "logps/chosen": -4.05759334564209, + "logps/rejected": -4.9776129722595215, + "loss": 0.533, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.05759334564209, + "rewards/margins": 0.9200197458267212, + "rewards/rejected": -4.9776129722595215, + "sft_loss": 4.099170684814453, + "step": 2285 + }, + { + "epoch": 1.22562301388192, + "grad_norm": 19.709789959596435, + "learning_rate": 7.369421680094091e-07, + "logits/chosen": -0.19222676753997803, + "logits/rejected": -0.062310922890901566, + "logps/chosen": -3.993795871734619, + "logps/rejected": -4.981151103973389, + "loss": 0.5684, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.993795871734619, + "rewards/margins": 0.9873548746109009, + "rewards/rejected": -4.981151103973389, + "sft_loss": 4.073160171508789, + "step": 2290 + }, + { + "epoch": 1.2282990466633217, + "grad_norm": 20.5759194425081, + "learning_rate": 7.355695921287881e-07, + "logits/chosen": -0.15541115403175354, + "logits/rejected": -0.08983412384986877, + "logps/chosen": -4.1916046142578125, + "logps/rejected": -4.978062629699707, + "loss": 0.6232, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.1916046142578125, + "rewards/margins": 0.7864580750465393, + "rewards/rejected": -4.978062629699707, + "sft_loss": 4.301588535308838, + "step": 2295 + }, + { + "epoch": 1.2309750794447232, + "grad_norm": 18.397865902564224, + "learning_rate": 7.341947307509513e-07, + "logits/chosen": -0.1119125708937645, + "logits/rejected": 0.007355786859989166, + "logps/chosen": -4.03453254699707, + "logps/rejected": -4.903344631195068, + "loss": 0.5729, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.03453254699707, + "rewards/margins": 0.8688125610351562, + "rewards/rejected": -4.903344631195068, + "sft_loss": 4.140660285949707, + "step": 2300 + }, + { + "epoch": 1.233651112226125, + "grad_norm": 13.963771138170584, + "learning_rate": 7.328175972148094e-07, + "logits/chosen": -0.12454034388065338, + "logits/rejected": -0.025773998349905014, + "logps/chosen": -4.224725723266602, + "logps/rejected": -5.127718925476074, + "loss": 0.5521, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.224725723266602, + "rewards/margins": 0.9029935598373413, + "rewards/rejected": -5.127718925476074, + "sft_loss": 4.250973701477051, + "step": 2305 + }, + { + "epoch": 1.2363271450075264, + "grad_norm": 16.055123565220338, + "learning_rate": 7.314382048813185e-07, + "logits/chosen": -0.11387109756469727, + "logits/rejected": 0.07982424646615982, + "logps/chosen": -4.00413179397583, + "logps/rejected": -5.042935371398926, + "loss": 0.5002, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.00413179397583, + "rewards/margins": 1.0388035774230957, + "rewards/rejected": -5.042935371398926, + "sft_loss": 4.035943031311035, + "step": 2310 + }, + { + "epoch": 1.2390031777889279, + "grad_norm": 13.380973620447216, + "learning_rate": 7.300565671333486e-07, + "logits/chosen": -0.1749085783958435, + "logits/rejected": 0.003732487093657255, + "logps/chosen": -4.17561674118042, + "logps/rejected": -5.078269004821777, + "loss": 0.5367, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.17561674118042, + "rewards/margins": 0.9026520848274231, + "rewards/rejected": -5.078269004821777, + "sft_loss": 4.265914440155029, + "step": 2315 + }, + { + "epoch": 1.2416792105703296, + "grad_norm": 13.500724126105808, + "learning_rate": 7.286726973755554e-07, + "logits/chosen": -0.04895884171128273, + "logits/rejected": -0.010584674775600433, + "logps/chosen": -4.067770481109619, + "logps/rejected": -5.008973598480225, + "loss": 0.5079, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.067770481109619, + "rewards/margins": 0.9412034153938293, + "rewards/rejected": -5.008973598480225, + "sft_loss": 4.119451999664307, + "step": 2320 + }, + { + "epoch": 1.244355243351731, + "grad_norm": 13.896592196030422, + "learning_rate": 7.272866090342493e-07, + "logits/chosen": -0.03926707059144974, + "logits/rejected": 0.00505078723654151, + "logps/chosen": -4.068405628204346, + "logps/rejected": -5.059308052062988, + "loss": 0.4772, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.068405628204346, + "rewards/margins": 0.9909025430679321, + "rewards/rejected": -5.059308052062988, + "sft_loss": 4.019812107086182, + "step": 2325 + }, + { + "epoch": 1.2470312761331326, + "grad_norm": 18.523891805911703, + "learning_rate": 7.258983155572656e-07, + "logits/chosen": -0.13083158433437347, + "logits/rejected": -0.04379258677363396, + "logps/chosen": -4.155807971954346, + "logps/rejected": -5.062368869781494, + "loss": 0.5736, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.155807971954346, + "rewards/margins": 0.9065613746643066, + "rewards/rejected": -5.062368869781494, + "sft_loss": 4.245884895324707, + "step": 2330 + }, + { + "epoch": 1.2497073089145343, + "grad_norm": 16.341347028939982, + "learning_rate": 7.245078304138335e-07, + "logits/chosen": -0.08317460119724274, + "logits/rejected": -0.008221238851547241, + "logps/chosen": -4.245736122131348, + "logps/rejected": -5.182717800140381, + "loss": 0.5387, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.245736122131348, + "rewards/margins": 0.9369821548461914, + "rewards/rejected": -5.182717800140381, + "sft_loss": 4.267935752868652, + "step": 2335 + }, + { + "epoch": 1.2523833416959358, + "grad_norm": 16.1921418325257, + "learning_rate": 7.231151670944462e-07, + "logits/chosen": -0.21621200442314148, + "logits/rejected": -0.05815199017524719, + "logps/chosen": -4.295234680175781, + "logps/rejected": -5.175727844238281, + "loss": 0.5664, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.295234680175781, + "rewards/margins": 0.8804932832717896, + "rewards/rejected": -5.175727844238281, + "sft_loss": 4.34125280380249, + "step": 2340 + }, + { + "epoch": 1.2550593744773373, + "grad_norm": 14.671338560540171, + "learning_rate": 7.217203391107291e-07, + "logits/chosen": -0.1463036984205246, + "logits/rejected": 0.0023394287563860416, + "logps/chosen": -4.261254787445068, + "logps/rejected": -5.200662612915039, + "loss": 0.5502, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.261254787445068, + "rewards/margins": 0.9394083023071289, + "rewards/rejected": -5.200662612915039, + "sft_loss": 4.298597812652588, + "step": 2345 + }, + { + "epoch": 1.257735407258739, + "grad_norm": 14.786302419959824, + "learning_rate": 7.203233599953096e-07, + "logits/chosen": -0.12140548229217529, + "logits/rejected": 0.021579179912805557, + "logps/chosen": -4.222411155700684, + "logps/rejected": -5.156377792358398, + "loss": 0.5386, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.222411155700684, + "rewards/margins": 0.9339669346809387, + "rewards/rejected": -5.156377792358398, + "sft_loss": 4.266728401184082, + "step": 2350 + }, + { + "epoch": 1.2604114400401405, + "grad_norm": 22.892584684084042, + "learning_rate": 7.189242433016852e-07, + "logits/chosen": -0.12190501391887665, + "logits/rejected": 0.003065797733142972, + "logps/chosen": -4.157485485076904, + "logps/rejected": -5.230223655700684, + "loss": 0.5172, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.157485485076904, + "rewards/margins": 1.0727381706237793, + "rewards/rejected": -5.230223655700684, + "sft_loss": 4.248648643493652, + "step": 2355 + }, + { + "epoch": 1.263087472821542, + "grad_norm": 21.165580208370535, + "learning_rate": 7.17523002604092e-07, + "logits/chosen": -0.12997111678123474, + "logits/rejected": 0.008958925493061543, + "logps/chosen": -4.3522419929504395, + "logps/rejected": -5.354263782501221, + "loss": 0.5413, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.3522419929504395, + "rewards/margins": 1.0020217895507812, + "rewards/rejected": -5.354263782501221, + "sft_loss": 4.560755729675293, + "step": 2360 + }, + { + "epoch": 1.2657635056029437, + "grad_norm": 18.520445412017608, + "learning_rate": 7.161196514973734e-07, + "logits/chosen": -0.12115871906280518, + "logits/rejected": 0.006101625971496105, + "logps/chosen": -4.234294891357422, + "logps/rejected": -5.240196228027344, + "loss": 0.5312, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.234294891357422, + "rewards/margins": 1.005900502204895, + "rewards/rejected": -5.240196228027344, + "sft_loss": 4.323896408081055, + "step": 2365 + }, + { + "epoch": 1.2684395383843452, + "grad_norm": 18.94162664238945, + "learning_rate": 7.147142035968483e-07, + "logits/chosen": -0.060700200498104095, + "logits/rejected": 0.08092973381280899, + "logps/chosen": -4.362667560577393, + "logps/rejected": -5.29547643661499, + "loss": 0.5441, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.362667560577393, + "rewards/margins": 0.9328088760375977, + "rewards/rejected": -5.29547643661499, + "sft_loss": 4.4554314613342285, + "step": 2370 + }, + { + "epoch": 1.2711155711657467, + "grad_norm": 16.089707699168432, + "learning_rate": 7.133066725381781e-07, + "logits/chosen": -0.17727595567703247, + "logits/rejected": -0.028062384575605392, + "logps/chosen": -4.135886192321777, + "logps/rejected": -5.120160102844238, + "loss": 0.5319, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.135886192321777, + "rewards/margins": 0.9842736124992371, + "rewards/rejected": -5.120160102844238, + "sft_loss": 4.2006707191467285, + "step": 2375 + }, + { + "epoch": 1.2737916039471484, + "grad_norm": 16.925599283103264, + "learning_rate": 7.118970719772354e-07, + "logits/chosen": -0.1306312084197998, + "logits/rejected": 0.017275024205446243, + "logps/chosen": -4.3715691566467285, + "logps/rejected": -5.420563697814941, + "loss": 0.5397, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.3715691566467285, + "rewards/margins": 1.0489943027496338, + "rewards/rejected": -5.420563697814941, + "sft_loss": 4.410833835601807, + "step": 2380 + }, + { + "epoch": 1.27646763672855, + "grad_norm": 17.298361405272146, + "learning_rate": 7.104854155899711e-07, + "logits/chosen": -0.0684792622923851, + "logits/rejected": 0.0437152236700058, + "logps/chosen": -4.312630653381348, + "logps/rejected": -5.2664313316345215, + "loss": 0.5355, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.312630653381348, + "rewards/margins": 0.9538006782531738, + "rewards/rejected": -5.2664313316345215, + "sft_loss": 4.343523979187012, + "step": 2385 + }, + { + "epoch": 1.2791436695099514, + "grad_norm": 16.894139267941664, + "learning_rate": 7.090717170722817e-07, + "logits/chosen": -0.0749233216047287, + "logits/rejected": -0.028645822778344154, + "logps/chosen": -4.292330741882324, + "logps/rejected": -5.397002696990967, + "loss": 0.4933, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.292330741882324, + "rewards/margins": 1.1046717166900635, + "rewards/rejected": -5.397002696990967, + "sft_loss": 4.319766044616699, + "step": 2390 + }, + { + "epoch": 1.2818197022913531, + "grad_norm": 20.57306013082817, + "learning_rate": 7.076559901398762e-07, + "logits/chosen": -0.19145503640174866, + "logits/rejected": -0.09005744755268097, + "logps/chosen": -4.161319255828857, + "logps/rejected": -4.979227542877197, + "loss": 0.5577, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.161319255828857, + "rewards/margins": 0.8179081678390503, + "rewards/rejected": -4.979227542877197, + "sft_loss": 4.277467250823975, + "step": 2395 + }, + { + "epoch": 1.2844957350727546, + "grad_norm": 24.941469477287953, + "learning_rate": 7.062382485281436e-07, + "logits/chosen": -0.10934241861104965, + "logits/rejected": -0.02125650644302368, + "logps/chosen": -4.162993431091309, + "logps/rejected": -4.937913417816162, + "loss": 0.5774, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.162993431091309, + "rewards/margins": 0.7749199867248535, + "rewards/rejected": -4.937913417816162, + "sft_loss": 4.256572723388672, + "step": 2400 + }, + { + "epoch": 1.2844957350727546, + "eval_logits/chosen": 0.1579088568687439, + "eval_logits/rejected": 0.24521751701831818, + "eval_logps/chosen": -4.261503219604492, + "eval_logps/rejected": -5.1611127853393555, + "eval_loss": 0.5776896476745605, + "eval_rewards/accuracies": 0.7277448177337646, + "eval_rewards/chosen": -4.261503219604492, + "eval_rewards/margins": 0.8996095657348633, + "eval_rewards/rejected": -5.1611127853393555, + "eval_runtime": 43.0968, + "eval_samples_per_second": 31.209, + "eval_sft_loss": 4.317946434020996, + "eval_steps_per_second": 7.82, + "step": 2400 + }, + { + "epoch": 1.287171767854156, + "grad_norm": 13.250023970937631, + "learning_rate": 7.048185059920193e-07, + "logits/chosen": -0.09466713666915894, + "logits/rejected": 0.034652967005968094, + "logps/chosen": -4.163640022277832, + "logps/rejected": -5.244973659515381, + "loss": 0.5412, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.163640022277832, + "rewards/margins": 1.081333875656128, + "rewards/rejected": -5.244973659515381, + "sft_loss": 4.2320756912231445, + "step": 2405 + }, + { + "epoch": 1.2898478006355578, + "grad_norm": 18.225924804438332, + "learning_rate": 7.033967763058516e-07, + "logits/chosen": -0.15738347172737122, + "logits/rejected": 0.001196927623823285, + "logps/chosen": -4.119842052459717, + "logps/rejected": -4.949393272399902, + "loss": 0.5431, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.119842052459717, + "rewards/margins": 0.8295515775680542, + "rewards/rejected": -4.949393272399902, + "sft_loss": 4.182238578796387, + "step": 2410 + }, + { + "epoch": 1.2925238334169593, + "grad_norm": 13.813377337904992, + "learning_rate": 7.019730732632681e-07, + "logits/chosen": -0.02098652347922325, + "logits/rejected": 0.05877337604761124, + "logps/chosen": -4.1416335105896, + "logps/rejected": -5.199089527130127, + "loss": 0.5336, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.1416335105896, + "rewards/margins": 1.0574554204940796, + "rewards/rejected": -5.199089527130127, + "sft_loss": 4.261752605438232, + "step": 2415 + }, + { + "epoch": 1.2951998661983608, + "grad_norm": 13.803338758968538, + "learning_rate": 7.005474106770418e-07, + "logits/chosen": -0.13239726424217224, + "logits/rejected": -0.02377437800168991, + "logps/chosen": -4.052218914031982, + "logps/rejected": -5.084633827209473, + "loss": 0.5173, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.052218914031982, + "rewards/margins": 1.0324145555496216, + "rewards/rejected": -5.084633827209473, + "sft_loss": 4.154684066772461, + "step": 2420 + }, + { + "epoch": 1.2978758989797625, + "grad_norm": 15.467388748261115, + "learning_rate": 6.991198023789577e-07, + "logits/chosen": -0.08688588440418243, + "logits/rejected": 0.0056015015579760075, + "logps/chosen": -3.8696110248565674, + "logps/rejected": -4.658940315246582, + "loss": 0.5586, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.8696110248565674, + "rewards/margins": 0.7893290519714355, + "rewards/rejected": -4.658940315246582, + "sft_loss": 4.0217156410217285, + "step": 2425 + }, + { + "epoch": 1.300551931761164, + "grad_norm": 24.121078239161786, + "learning_rate": 6.976902622196776e-07, + "logits/chosen": -0.05252006649971008, + "logits/rejected": 0.01766449585556984, + "logps/chosen": -3.976597547531128, + "logps/rejected": -4.868682861328125, + "loss": 0.5583, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.976597547531128, + "rewards/margins": 0.8920855522155762, + "rewards/rejected": -4.868682861328125, + "sft_loss": 3.9951343536376953, + "step": 2430 + }, + { + "epoch": 1.3032279645425655, + "grad_norm": 13.581396643669095, + "learning_rate": 6.962588040686064e-07, + "logits/chosen": -0.07369393110275269, + "logits/rejected": 0.05318892002105713, + "logps/chosen": -3.871485948562622, + "logps/rejected": -4.692887783050537, + "loss": 0.5965, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.871485948562622, + "rewards/margins": 0.8214018940925598, + "rewards/rejected": -4.692887783050537, + "sft_loss": 3.947432279586792, + "step": 2435 + }, + { + "epoch": 1.3059039973239672, + "grad_norm": 16.035934350930617, + "learning_rate": 6.948254418137573e-07, + "logits/chosen": -0.07468081265687943, + "logits/rejected": 0.03066675364971161, + "logps/chosen": -3.855999708175659, + "logps/rejected": -4.732884883880615, + "loss": 0.5653, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.855999708175659, + "rewards/margins": 0.8768857717514038, + "rewards/rejected": -4.732884883880615, + "sft_loss": 3.871295928955078, + "step": 2440 + }, + { + "epoch": 1.3085800301053687, + "grad_norm": 22.99980782696124, + "learning_rate": 6.933901893616174e-07, + "logits/chosen": -0.1008199006319046, + "logits/rejected": 0.03632703796029091, + "logps/chosen": -3.902106761932373, + "logps/rejected": -4.686600208282471, + "loss": 0.5728, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.902106761932373, + "rewards/margins": 0.7844932675361633, + "rewards/rejected": -4.686600208282471, + "sft_loss": 3.9292640686035156, + "step": 2445 + }, + { + "epoch": 1.3112560628867704, + "grad_norm": 23.013156543428995, + "learning_rate": 6.919530606370121e-07, + "logits/chosen": -0.03523783013224602, + "logits/rejected": 0.08227065950632095, + "logps/chosen": -3.7815330028533936, + "logps/rejected": -4.665060520172119, + "loss": 0.536, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.7815330028533936, + "rewards/margins": 0.8835276365280151, + "rewards/rejected": -4.665060520172119, + "sft_loss": 3.8153178691864014, + "step": 2450 + }, + { + "epoch": 1.313932095668172, + "grad_norm": 15.170455557689014, + "learning_rate": 6.905140695829706e-07, + "logits/chosen": -0.1345018893480301, + "logits/rejected": 0.09576258063316345, + "logps/chosen": -3.930418014526367, + "logps/rejected": -4.801162242889404, + "loss": 0.5256, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.930418014526367, + "rewards/margins": 0.8707441091537476, + "rewards/rejected": -4.801162242889404, + "sft_loss": 3.9338157176971436, + "step": 2455 + }, + { + "epoch": 1.3166081284495736, + "grad_norm": 25.67041425851637, + "learning_rate": 6.890732301605904e-07, + "logits/chosen": -0.059803593903779984, + "logits/rejected": 0.02915279194712639, + "logps/chosen": -3.8801674842834473, + "logps/rejected": -4.6841230392456055, + "loss": 0.586, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.8801674842834473, + "rewards/margins": 0.8039555549621582, + "rewards/rejected": -4.6841230392456055, + "sft_loss": 3.8844990730285645, + "step": 2460 + }, + { + "epoch": 1.3192841612309751, + "grad_norm": 14.242351838450366, + "learning_rate": 6.876305563489021e-07, + "logits/chosen": -0.06666535884141922, + "logits/rejected": 0.013455508276820183, + "logps/chosen": -3.9759879112243652, + "logps/rejected": -4.992976665496826, + "loss": 0.4987, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.9759879112243652, + "rewards/margins": 1.0169888734817505, + "rewards/rejected": -4.992976665496826, + "sft_loss": 3.9486610889434814, + "step": 2465 + }, + { + "epoch": 1.3219601940123766, + "grad_norm": 19.94100490761148, + "learning_rate": 6.861860621447331e-07, + "logits/chosen": -0.14499233663082123, + "logits/rejected": -0.04203890636563301, + "logps/chosen": -4.115070343017578, + "logps/rejected": -4.816925525665283, + "loss": 0.6046, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.115070343017578, + "rewards/margins": 0.7018548250198364, + "rewards/rejected": -4.816925525665283, + "sft_loss": 4.2145185470581055, + "step": 2470 + }, + { + "epoch": 1.3246362267937783, + "grad_norm": 19.44598133863135, + "learning_rate": 6.847397615625725e-07, + "logits/chosen": -0.04176859185099602, + "logits/rejected": 0.0038140460383147, + "logps/chosen": -4.126821041107178, + "logps/rejected": -4.9137067794799805, + "loss": 0.5781, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.126821041107178, + "rewards/margins": 0.7868860960006714, + "rewards/rejected": -4.9137067794799805, + "sft_loss": 4.189349174499512, + "step": 2475 + }, + { + "epoch": 1.3273122595751798, + "grad_norm": 13.139796574332802, + "learning_rate": 6.83291668634435e-07, + "logits/chosen": -0.12218749523162842, + "logits/rejected": 0.060198068618774414, + "logps/chosen": -4.126533031463623, + "logps/rejected": -5.21274471282959, + "loss": 0.5015, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.126533031463623, + "rewards/margins": 1.086211085319519, + "rewards/rejected": -5.21274471282959, + "sft_loss": 4.309605121612549, + "step": 2480 + }, + { + "epoch": 1.3299882923565813, + "grad_norm": 15.664523423520645, + "learning_rate": 6.818417974097246e-07, + "logits/chosen": 0.04256322979927063, + "logits/rejected": 0.18730340898036957, + "logps/chosen": -3.963308811187744, + "logps/rejected": -5.152323246002197, + "loss": 0.4905, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.963308811187744, + "rewards/margins": 1.1890140771865845, + "rewards/rejected": -5.152323246002197, + "sft_loss": 4.1547064781188965, + "step": 2485 + }, + { + "epoch": 1.332664325137983, + "grad_norm": 14.557563994969184, + "learning_rate": 6.803901619550981e-07, + "logits/chosen": -0.08105379343032837, + "logits/rejected": -0.021051811054348946, + "logps/chosen": -4.075659275054932, + "logps/rejected": -5.015525817871094, + "loss": 0.5219, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.075659275054932, + "rewards/margins": 0.9398663640022278, + "rewards/rejected": -5.015525817871094, + "sft_loss": 4.149325370788574, + "step": 2490 + }, + { + "epoch": 1.3353403579193845, + "grad_norm": 16.459628588276328, + "learning_rate": 6.789367763543292e-07, + "logits/chosen": 0.0012557022273540497, + "logits/rejected": 0.03740160912275314, + "logps/chosen": -4.035196781158447, + "logps/rejected": -4.862973213195801, + "loss": 0.5951, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.035196781158447, + "rewards/margins": 0.827777087688446, + "rewards/rejected": -4.862973213195801, + "sft_loss": 4.138479232788086, + "step": 2495 + }, + { + "epoch": 1.338016390700786, + "grad_norm": 20.5569062334297, + "learning_rate": 6.774816547081714e-07, + "logits/chosen": -0.02261544018983841, + "logits/rejected": 0.12358323484659195, + "logps/chosen": -3.9922146797180176, + "logps/rejected": -4.67208194732666, + "loss": 0.5743, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.9922146797180176, + "rewards/margins": 0.6798672676086426, + "rewards/rejected": -4.67208194732666, + "sft_loss": 4.113407135009766, + "step": 2500 + }, + { + "epoch": 1.3406924234821878, + "grad_norm": 14.390536620434528, + "learning_rate": 6.760248111342211e-07, + "logits/chosen": -0.03141166269779205, + "logits/rejected": 0.12173942476511002, + "logps/chosen": -3.8743910789489746, + "logps/rejected": -4.843568325042725, + "loss": 0.5178, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.8743910789489746, + "rewards/margins": 0.9691771268844604, + "rewards/rejected": -4.843568325042725, + "sft_loss": 3.890059232711792, + "step": 2505 + }, + { + "epoch": 1.3433684562635893, + "grad_norm": 14.954853828357361, + "learning_rate": 6.745662597667813e-07, + "logits/chosen": -0.09151516109704971, + "logits/rejected": 0.03339110687375069, + "logps/chosen": -3.8521568775177, + "logps/rejected": -4.75100040435791, + "loss": 0.5163, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.8521568775177, + "rewards/margins": 0.8988434672355652, + "rewards/rejected": -4.75100040435791, + "sft_loss": 3.9217453002929688, + "step": 2510 + }, + { + "epoch": 1.3460444890449907, + "grad_norm": 14.922121712243158, + "learning_rate": 6.731060147567236e-07, + "logits/chosen": -0.00676008453592658, + "logits/rejected": 0.0773114562034607, + "logps/chosen": -3.9265663623809814, + "logps/rejected": -4.898476600646973, + "loss": 0.5228, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.9265663623809814, + "rewards/margins": 0.971910297870636, + "rewards/rejected": -4.898476600646973, + "sft_loss": 4.014676094055176, + "step": 2515 + }, + { + "epoch": 1.3487205218263925, + "grad_norm": 17.26823540006087, + "learning_rate": 6.716440902713515e-07, + "logits/chosen": -0.07249848544597626, + "logits/rejected": 0.0011456996435299516, + "logps/chosen": -4.010064125061035, + "logps/rejected": -4.87760066986084, + "loss": 0.52, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.010064125061035, + "rewards/margins": 0.8675360679626465, + "rewards/rejected": -4.87760066986084, + "sft_loss": 3.9492244720458984, + "step": 2520 + }, + { + "epoch": 1.351396554607794, + "grad_norm": 18.332281848753983, + "learning_rate": 6.701805004942627e-07, + "logits/chosen": -0.05388345569372177, + "logits/rejected": 0.013353681191802025, + "logps/chosen": -4.10679817199707, + "logps/rejected": -5.018848419189453, + "loss": 0.5434, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.10679817199707, + "rewards/margins": 0.9120498895645142, + "rewards/rejected": -5.018848419189453, + "sft_loss": 4.238326072692871, + "step": 2525 + }, + { + "epoch": 1.3540725873891954, + "grad_norm": 23.35264730325701, + "learning_rate": 6.687152596252119e-07, + "logits/chosen": -0.0372026227414608, + "logits/rejected": 0.033376261591911316, + "logps/chosen": -4.218406677246094, + "logps/rejected": -5.008605003356934, + "loss": 0.6154, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.218406677246094, + "rewards/margins": 0.7901977896690369, + "rewards/rejected": -5.008605003356934, + "sft_loss": 4.312189102172852, + "step": 2530 + }, + { + "epoch": 1.3567486201705972, + "grad_norm": 23.98823359034702, + "learning_rate": 6.672483818799722e-07, + "logits/chosen": -0.11207201331853867, + "logits/rejected": 0.017515579238533974, + "logps/chosen": -4.183786869049072, + "logps/rejected": -5.026003837585449, + "loss": 0.5648, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.183786869049072, + "rewards/margins": 0.8422168493270874, + "rewards/rejected": -5.026003837585449, + "sft_loss": 4.257922649383545, + "step": 2535 + }, + { + "epoch": 1.3594246529519987, + "grad_norm": 18.23902876924883, + "learning_rate": 6.657798814901978e-07, + "logits/chosen": -0.07603417336940765, + "logits/rejected": 0.06636123359203339, + "logps/chosen": -4.205514907836914, + "logps/rejected": -5.013917446136475, + "loss": 0.5752, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.205514907836914, + "rewards/margins": 0.8084025382995605, + "rewards/rejected": -5.013917446136475, + "sft_loss": 4.290884971618652, + "step": 2540 + }, + { + "epoch": 1.3621006857334002, + "grad_norm": 17.576043961769646, + "learning_rate": 6.643097727032863e-07, + "logits/chosen": -0.07821284234523773, + "logits/rejected": 0.10023043304681778, + "logps/chosen": -4.091734886169434, + "logps/rejected": -5.130631923675537, + "loss": 0.4947, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.091734886169434, + "rewards/margins": 1.0388973951339722, + "rewards/rejected": -5.130631923675537, + "sft_loss": 4.176156044006348, + "step": 2545 + }, + { + "epoch": 1.3647767185148019, + "grad_norm": 17.31891971556111, + "learning_rate": 6.628380697822392e-07, + "logits/chosen": -0.041422732174396515, + "logits/rejected": 0.0944138616323471, + "logps/chosen": -4.169663429260254, + "logps/rejected": -4.939678192138672, + "loss": 0.575, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.169663429260254, + "rewards/margins": 0.7700151205062866, + "rewards/rejected": -4.939678192138672, + "sft_loss": 4.2203521728515625, + "step": 2550 + }, + { + "epoch": 1.3674527512962034, + "grad_norm": 17.980562333891605, + "learning_rate": 6.61364787005525e-07, + "logits/chosen": -0.026197027415037155, + "logits/rejected": 0.08606614917516708, + "logps/chosen": -3.9951393604278564, + "logps/rejected": -5.0169453620910645, + "loss": 0.5218, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.9951393604278564, + "rewards/margins": 1.0218056440353394, + "rewards/rejected": -5.0169453620910645, + "sft_loss": 4.092293739318848, + "step": 2555 + }, + { + "epoch": 1.3701287840776049, + "grad_norm": 18.965850339952567, + "learning_rate": 6.598899386669395e-07, + "logits/chosen": 0.0009241849184036255, + "logits/rejected": 0.10314282029867172, + "logps/chosen": -3.983647584915161, + "logps/rejected": -4.829386234283447, + "loss": 0.5697, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.983647584915161, + "rewards/margins": 0.8457382321357727, + "rewards/rejected": -4.829386234283447, + "sft_loss": 4.019073486328125, + "step": 2560 + }, + { + "epoch": 1.3728048168590066, + "grad_norm": 22.059341130807468, + "learning_rate": 6.584135390754679e-07, + "logits/chosen": -0.016916906461119652, + "logits/rejected": 0.09965212643146515, + "logps/chosen": -3.9172863960266113, + "logps/rejected": -4.917275428771973, + "loss": 0.5232, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.9172863960266113, + "rewards/margins": 0.9999886751174927, + "rewards/rejected": -4.917275428771973, + "sft_loss": 3.9654273986816406, + "step": 2565 + }, + { + "epoch": 1.375480849640408, + "grad_norm": 12.239812031955307, + "learning_rate": 6.569356025551454e-07, + "logits/chosen": -0.04140595346689224, + "logits/rejected": 0.042737144976854324, + "logps/chosen": -3.9178566932678223, + "logps/rejected": -4.855074405670166, + "loss": 0.5396, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.9178566932678223, + "rewards/margins": 0.9372177124023438, + "rewards/rejected": -4.855074405670166, + "sft_loss": 3.9111125469207764, + "step": 2570 + }, + { + "epoch": 1.3781568824218096, + "grad_norm": 16.9497595753261, + "learning_rate": 6.554561434449186e-07, + "logits/chosen": -0.11973029375076294, + "logits/rejected": 0.015954116359353065, + "logps/chosen": -4.019374370574951, + "logps/rejected": -4.899747848510742, + "loss": 0.5561, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.019374370574951, + "rewards/margins": 0.8803732991218567, + "rewards/rejected": -4.899747848510742, + "sft_loss": 4.068663120269775, + "step": 2575 + }, + { + "epoch": 1.3808329152032113, + "grad_norm": 21.405099023095033, + "learning_rate": 6.539751760985063e-07, + "logits/chosen": -0.06440500169992447, + "logits/rejected": 0.015679899603128433, + "logps/chosen": -4.161512851715088, + "logps/rejected": -4.838021278381348, + "loss": 0.5979, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.161512851715088, + "rewards/margins": 0.6765087842941284, + "rewards/rejected": -4.838021278381348, + "sft_loss": 4.2326836585998535, + "step": 2580 + }, + { + "epoch": 1.3835089479846128, + "grad_norm": 18.492800267020584, + "learning_rate": 6.524927148842602e-07, + "logits/chosen": 0.027476992458105087, + "logits/rejected": 0.166078582406044, + "logps/chosen": -4.085366249084473, + "logps/rejected": -4.959372520446777, + "loss": 0.5506, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.085366249084473, + "rewards/margins": 0.8740068674087524, + "rewards/rejected": -4.959372520446777, + "sft_loss": 4.118053913116455, + "step": 2585 + }, + { + "epoch": 1.3861849807660143, + "grad_norm": 22.051717275566087, + "learning_rate": 6.510087741850254e-07, + "logits/chosen": -0.07167111337184906, + "logits/rejected": 0.03960205242037773, + "logps/chosen": -3.994276523590088, + "logps/rejected": -4.847044944763184, + "loss": 0.5662, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.994276523590088, + "rewards/margins": 0.8527683019638062, + "rewards/rejected": -4.847044944763184, + "sft_loss": 4.117629051208496, + "step": 2590 + }, + { + "epoch": 1.388861013547416, + "grad_norm": 18.763676766189004, + "learning_rate": 6.495233683980012e-07, + "logits/chosen": -0.029994597658514977, + "logits/rejected": 0.01790265180170536, + "logps/chosen": -4.043631553649902, + "logps/rejected": -4.825007438659668, + "loss": 0.5664, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.043631553649902, + "rewards/margins": 0.7813760042190552, + "rewards/rejected": -4.825007438659668, + "sft_loss": 4.099611759185791, + "step": 2595 + }, + { + "epoch": 1.3915370463288175, + "grad_norm": 22.383869694507865, + "learning_rate": 6.480365119346011e-07, + "logits/chosen": 0.072385273873806, + "logits/rejected": 0.1959836781024933, + "logps/chosen": -4.041221618652344, + "logps/rejected": -4.9139838218688965, + "loss": 0.5435, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.041221618652344, + "rewards/margins": 0.8727631568908691, + "rewards/rejected": -4.9139838218688965, + "sft_loss": 4.137312889099121, + "step": 2600 + }, + { + "epoch": 1.394213079110219, + "grad_norm": 14.148402012039993, + "learning_rate": 6.465482192203129e-07, + "logits/chosen": 0.07596292346715927, + "logits/rejected": 0.14356279373168945, + "logps/chosen": -4.043821811676025, + "logps/rejected": -4.930019378662109, + "loss": 0.5302, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.043821811676025, + "rewards/margins": 0.8861970901489258, + "rewards/rejected": -4.930019378662109, + "sft_loss": 4.203348159790039, + "step": 2605 + }, + { + "epoch": 1.3968891118916207, + "grad_norm": 24.014892783435013, + "learning_rate": 6.45058504694559e-07, + "logits/chosen": 0.05701693147420883, + "logits/rejected": 0.10021479427814484, + "logps/chosen": -4.121259689331055, + "logps/rejected": -5.054667949676514, + "loss": 0.5315, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.121259689331055, + "rewards/margins": 0.9334084391593933, + "rewards/rejected": -5.054667949676514, + "sft_loss": 4.234942436218262, + "step": 2610 + }, + { + "epoch": 1.3995651446730222, + "grad_norm": 20.26942892267099, + "learning_rate": 6.435673828105564e-07, + "logits/chosen": -0.007051569409668446, + "logits/rejected": 0.11115667968988419, + "logps/chosen": -4.124650001525879, + "logps/rejected": -5.125641822814941, + "loss": 0.5469, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.124650001525879, + "rewards/margins": 1.0009920597076416, + "rewards/rejected": -5.125641822814941, + "sft_loss": 4.286810874938965, + "step": 2615 + }, + { + "epoch": 1.402241177454424, + "grad_norm": 17.584308254258314, + "learning_rate": 6.420748680351763e-07, + "logits/chosen": 0.08284053951501846, + "logits/rejected": 0.036834098398685455, + "logps/chosen": -4.212587833404541, + "logps/rejected": -4.950636863708496, + "loss": 0.5971, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.212587833404541, + "rewards/margins": 0.7380497455596924, + "rewards/rejected": -4.950636863708496, + "sft_loss": 4.395787239074707, + "step": 2620 + }, + { + "epoch": 1.4049172102358254, + "grad_norm": 26.301576396692624, + "learning_rate": 6.405809748488032e-07, + "logits/chosen": -0.02107442170381546, + "logits/rejected": 0.10732152312994003, + "logps/chosen": -4.238432884216309, + "logps/rejected": -5.290383815765381, + "loss": 0.5606, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.238432884216309, + "rewards/margins": 1.0519508123397827, + "rewards/rejected": -5.290383815765381, + "sft_loss": 4.277993202209473, + "step": 2625 + }, + { + "epoch": 1.4075932430172269, + "grad_norm": 17.321515931340176, + "learning_rate": 6.390857177451956e-07, + "logits/chosen": -0.10459005832672119, + "logits/rejected": 0.06394404172897339, + "logps/chosen": -4.208449363708496, + "logps/rejected": -5.058264255523682, + "loss": 0.5599, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.208449363708496, + "rewards/margins": 0.849814772605896, + "rewards/rejected": -5.058264255523682, + "sft_loss": 4.295122146606445, + "step": 2630 + }, + { + "epoch": 1.4102692757986286, + "grad_norm": 19.571155222145673, + "learning_rate": 6.375891112313445e-07, + "logits/chosen": -0.10111421346664429, + "logits/rejected": -0.006807476282119751, + "logps/chosen": -4.322181224822998, + "logps/rejected": -5.2172675132751465, + "loss": 0.5485, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.322181224822998, + "rewards/margins": 0.8950859904289246, + "rewards/rejected": -5.2172675132751465, + "sft_loss": 4.372712135314941, + "step": 2635 + }, + { + "epoch": 1.41294530858003, + "grad_norm": 16.4871101876668, + "learning_rate": 6.360911698273326e-07, + "logits/chosen": 0.013696163892745972, + "logits/rejected": 0.12533724308013916, + "logps/chosen": -4.365028381347656, + "logps/rejected": -5.152103424072266, + "loss": 0.5814, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.365028381347656, + "rewards/margins": 0.7870752215385437, + "rewards/rejected": -5.152103424072266, + "sft_loss": 4.409226894378662, + "step": 2640 + }, + { + "epoch": 1.4156213413614318, + "grad_norm": 14.975019621922833, + "learning_rate": 6.345919080661944e-07, + "logits/chosen": -0.014929696917533875, + "logits/rejected": 0.05198700353503227, + "logps/chosen": -4.0075507164001465, + "logps/rejected": -5.04404354095459, + "loss": 0.5027, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.0075507164001465, + "rewards/margins": 1.0364933013916016, + "rewards/rejected": -5.04404354095459, + "sft_loss": 4.074658393859863, + "step": 2645 + }, + { + "epoch": 1.4182973741428333, + "grad_norm": 21.222758938780224, + "learning_rate": 6.330913404937737e-07, + "logits/chosen": -0.0078008463606238365, + "logits/rejected": 0.10538405179977417, + "logps/chosen": -4.141799449920654, + "logps/rejected": -5.210144996643066, + "loss": 0.5247, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.141799449920654, + "rewards/margins": 1.0683459043502808, + "rewards/rejected": -5.210144996643066, + "sft_loss": 4.205315589904785, + "step": 2650 + }, + { + "epoch": 1.4209734069242348, + "grad_norm": 16.952020669846306, + "learning_rate": 6.315894816685838e-07, + "logits/chosen": -0.006896118633449078, + "logits/rejected": 0.13101527094841003, + "logps/chosen": -4.1590070724487305, + "logps/rejected": -5.018160820007324, + "loss": 0.5278, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.1590070724487305, + "rewards/margins": 0.859154224395752, + "rewards/rejected": -5.018160820007324, + "sft_loss": 4.295965671539307, + "step": 2655 + }, + { + "epoch": 1.4236494397056365, + "grad_norm": 15.126805683593512, + "learning_rate": 6.300863461616657e-07, + "logits/chosen": 0.029642198234796524, + "logits/rejected": 0.10298113524913788, + "logps/chosen": -4.103089809417725, + "logps/rejected": -4.830977439880371, + "loss": 0.616, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.103089809417725, + "rewards/margins": 0.7278872132301331, + "rewards/rejected": -4.830977439880371, + "sft_loss": 4.179484844207764, + "step": 2660 + }, + { + "epoch": 1.426325472487038, + "grad_norm": 15.129423535610094, + "learning_rate": 6.285819485564465e-07, + "logits/chosen": -0.0842280238866806, + "logits/rejected": 0.032838691025972366, + "logps/chosen": -4.095457553863525, + "logps/rejected": -5.005012035369873, + "loss": 0.5048, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.095457553863525, + "rewards/margins": 0.9095550775527954, + "rewards/rejected": -5.005012035369873, + "sft_loss": 4.185179710388184, + "step": 2665 + }, + { + "epoch": 1.4290015052684395, + "grad_norm": 18.110220036830906, + "learning_rate": 6.270763034485986e-07, + "logits/chosen": 0.041431643068790436, + "logits/rejected": 0.12850026786327362, + "logps/chosen": -4.218494892120361, + "logps/rejected": -5.118128776550293, + "loss": 0.533, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.218494892120361, + "rewards/margins": 0.8996332287788391, + "rewards/rejected": -5.118128776550293, + "sft_loss": 4.214810848236084, + "step": 2670 + }, + { + "epoch": 1.4316775380498412, + "grad_norm": 35.430231259525655, + "learning_rate": 6.255694254458972e-07, + "logits/chosen": -0.015107926912605762, + "logits/rejected": 0.11170251667499542, + "logps/chosen": -4.227860927581787, + "logps/rejected": -5.089137554168701, + "loss": 0.5851, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.227860927581787, + "rewards/margins": 0.8612769246101379, + "rewards/rejected": -5.089137554168701, + "sft_loss": 4.147841453552246, + "step": 2675 + }, + { + "epoch": 1.4343535708312427, + "grad_norm": 19.98694884716753, + "learning_rate": 6.240613291680795e-07, + "logits/chosen": -0.07201234996318817, + "logits/rejected": 0.07067441195249557, + "logps/chosen": -4.084803581237793, + "logps/rejected": -4.9614481925964355, + "loss": 0.5871, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.084803581237793, + "rewards/margins": 0.8766444325447083, + "rewards/rejected": -4.9614481925964355, + "sft_loss": 4.110752105712891, + "step": 2680 + }, + { + "epoch": 1.4370296036126442, + "grad_norm": 12.298144029591253, + "learning_rate": 6.225520292467021e-07, + "logits/chosen": -0.05858708545565605, + "logits/rejected": 0.1349249631166458, + "logps/chosen": -3.9142913818359375, + "logps/rejected": -5.051329612731934, + "loss": 0.4604, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.9142913818359375, + "rewards/margins": 1.137038230895996, + "rewards/rejected": -5.051329612731934, + "sft_loss": 4.005929946899414, + "step": 2685 + }, + { + "epoch": 1.439705636394046, + "grad_norm": 31.511770228889546, + "learning_rate": 6.210415403249993e-07, + "logits/chosen": -0.1398880034685135, + "logits/rejected": 0.0756937712430954, + "logps/chosen": -4.080639362335205, + "logps/rejected": -5.032453536987305, + "loss": 0.5697, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.080639362335205, + "rewards/margins": 0.9518140554428101, + "rewards/rejected": -5.032453536987305, + "sft_loss": 4.059521198272705, + "step": 2690 + }, + { + "epoch": 1.4423816691754474, + "grad_norm": 19.41232862754573, + "learning_rate": 6.195298770577415e-07, + "logits/chosen": 0.059480488300323486, + "logits/rejected": 0.07041338831186295, + "logps/chosen": -4.093963146209717, + "logps/rejected": -5.036952495574951, + "loss": 0.5553, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.093963146209717, + "rewards/margins": 0.9429893493652344, + "rewards/rejected": -5.036952495574951, + "sft_loss": 4.13309907913208, + "step": 2695 + }, + { + "epoch": 1.445057701956849, + "grad_norm": 17.476186313500772, + "learning_rate": 6.180170541110923e-07, + "logits/chosen": -0.04353252798318863, + "logits/rejected": 0.11172328144311905, + "logps/chosen": -4.150980472564697, + "logps/rejected": -5.078780174255371, + "loss": 0.5423, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.150980472564697, + "rewards/margins": 0.9278000593185425, + "rewards/rejected": -5.078780174255371, + "sft_loss": 4.235171318054199, + "step": 2700 + }, + { + "epoch": 1.4477337347382506, + "grad_norm": 15.38466246175128, + "learning_rate": 6.165030861624663e-07, + "logits/chosen": -0.0989823192358017, + "logits/rejected": 0.08282925188541412, + "logps/chosen": -4.078875541687012, + "logps/rejected": -5.304009437561035, + "loss": 0.4665, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.078875541687012, + "rewards/margins": 1.2251341342926025, + "rewards/rejected": -5.304009437561035, + "sft_loss": 4.070713996887207, + "step": 2705 + }, + { + "epoch": 1.4504097675196521, + "grad_norm": 18.130730530155056, + "learning_rate": 6.149879879003876e-07, + "logits/chosen": 0.03766113892197609, + "logits/rejected": 0.0644763633608818, + "logps/chosen": -4.121838569641113, + "logps/rejected": -5.137380123138428, + "loss": 0.5099, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.121838569641113, + "rewards/margins": 1.0155417919158936, + "rewards/rejected": -5.137380123138428, + "sft_loss": 4.17519998550415, + "step": 2710 + }, + { + "epoch": 1.4530858003010536, + "grad_norm": 15.753038422112043, + "learning_rate": 6.13471774024346e-07, + "logits/chosen": -0.1554158627986908, + "logits/rejected": -0.05729420855641365, + "logps/chosen": -4.0880632400512695, + "logps/rejected": -5.01497745513916, + "loss": 0.5179, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.0880632400512695, + "rewards/margins": 0.9269140958786011, + "rewards/rejected": -5.01497745513916, + "sft_loss": 4.160233020782471, + "step": 2715 + }, + { + "epoch": 1.4557618330824553, + "grad_norm": 14.711083719896562, + "learning_rate": 6.119544592446551e-07, + "logits/chosen": -0.08170298486948013, + "logits/rejected": 0.03274992108345032, + "logps/chosen": -4.147464752197266, + "logps/rejected": -4.928176403045654, + "loss": 0.5773, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.147464752197266, + "rewards/margins": 0.78071129322052, + "rewards/rejected": -4.928176403045654, + "sft_loss": 4.148330211639404, + "step": 2720 + }, + { + "epoch": 1.4584378658638568, + "grad_norm": 20.865595547653733, + "learning_rate": 6.104360582823096e-07, + "logits/chosen": -0.0227263942360878, + "logits/rejected": 0.05186920240521431, + "logps/chosen": -4.170312881469727, + "logps/rejected": -5.016467094421387, + "loss": 0.5475, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.170312881469727, + "rewards/margins": 0.8461543321609497, + "rewards/rejected": -5.016467094421387, + "sft_loss": 4.219161033630371, + "step": 2725 + }, + { + "epoch": 1.4611138986452583, + "grad_norm": 17.644714739641714, + "learning_rate": 6.089165858688423e-07, + "logits/chosen": -0.05663248151540756, + "logits/rejected": 0.08762215077877045, + "logps/chosen": -4.0606608390808105, + "logps/rejected": -5.053631782531738, + "loss": 0.5428, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.0606608390808105, + "rewards/margins": 0.9929712414741516, + "rewards/rejected": -5.053631782531738, + "sft_loss": 4.119710445404053, + "step": 2730 + }, + { + "epoch": 1.46378993142666, + "grad_norm": 12.892354129619571, + "learning_rate": 6.073960567461811e-07, + "logits/chosen": -0.025140320882201195, + "logits/rejected": 0.13027231395244598, + "logps/chosen": -3.8592910766601562, + "logps/rejected": -4.957719802856445, + "loss": 0.4691, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.8592910766601562, + "rewards/margins": 1.0984289646148682, + "rewards/rejected": -4.957719802856445, + "sft_loss": 3.962966203689575, + "step": 2735 + }, + { + "epoch": 1.4664659642080615, + "grad_norm": 14.579312955072842, + "learning_rate": 6.058744856665065e-07, + "logits/chosen": -0.08685266226530075, + "logits/rejected": -0.0053533404134213924, + "logps/chosen": -4.053783416748047, + "logps/rejected": -5.094172477722168, + "loss": 0.5043, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.053783416748047, + "rewards/margins": 1.0403884649276733, + "rewards/rejected": -5.094172477722168, + "sft_loss": 4.1353960037231445, + "step": 2740 + }, + { + "epoch": 1.469141996989463, + "grad_norm": 16.287696529068228, + "learning_rate": 6.043518873921074e-07, + "logits/chosen": -0.05991528183221817, + "logits/rejected": 0.06272322684526443, + "logps/chosen": -3.9453110694885254, + "logps/rejected": -4.837578773498535, + "loss": 0.5177, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.9453110694885254, + "rewards/margins": 0.8922680020332336, + "rewards/rejected": -4.837578773498535, + "sft_loss": 3.940615177154541, + "step": 2745 + }, + { + "epoch": 1.4718180297708647, + "grad_norm": 20.80843033969584, + "learning_rate": 6.028282766952393e-07, + "logits/chosen": -0.06943833827972412, + "logits/rejected": 0.028343399986624718, + "logps/chosen": -4.107184410095215, + "logps/rejected": -5.092995643615723, + "loss": 0.5261, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.107184410095215, + "rewards/margins": 0.9858118891716003, + "rewards/rejected": -5.092995643615723, + "sft_loss": 4.021084785461426, + "step": 2750 + }, + { + "epoch": 1.4744940625522662, + "grad_norm": 25.24141079264102, + "learning_rate": 6.013036683579798e-07, + "logits/chosen": -0.022642286494374275, + "logits/rejected": 0.06688891351222992, + "logps/chosen": -4.051756381988525, + "logps/rejected": -5.051907062530518, + "loss": 0.5089, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.051756381988525, + "rewards/margins": 1.0001503229141235, + "rewards/rejected": -5.051907062530518, + "sft_loss": 4.085324764251709, + "step": 2755 + }, + { + "epoch": 1.4771700953336677, + "grad_norm": 18.25910620581888, + "learning_rate": 5.997780771720854e-07, + "logits/chosen": -0.11980477720499039, + "logits/rejected": 0.024762999266386032, + "logps/chosen": -4.187039375305176, + "logps/rejected": -5.186661720275879, + "loss": 0.5251, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.187039375305176, + "rewards/margins": 0.9996216893196106, + "rewards/rejected": -5.186661720275879, + "sft_loss": 4.254944801330566, + "step": 2760 + }, + { + "epoch": 1.4798461281150694, + "grad_norm": 23.297284899305616, + "learning_rate": 5.982515179388486e-07, + "logits/chosen": 0.014777600765228271, + "logits/rejected": 0.12283160537481308, + "logps/chosen": -4.188802242279053, + "logps/rejected": -5.0527024269104, + "loss": 0.5634, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.188802242279053, + "rewards/margins": 0.8639005422592163, + "rewards/rejected": -5.0527024269104, + "sft_loss": 4.229619026184082, + "step": 2765 + }, + { + "epoch": 1.482522160896471, + "grad_norm": 15.362375581310394, + "learning_rate": 5.967240054689541e-07, + "logits/chosen": -0.08131209015846252, + "logits/rejected": -0.010910587385296822, + "logps/chosen": -4.099891662597656, + "logps/rejected": -5.010560035705566, + "loss": 0.5453, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.099891662597656, + "rewards/margins": 0.9106694459915161, + "rewards/rejected": -5.010560035705566, + "sft_loss": 4.198235511779785, + "step": 2770 + }, + { + "epoch": 1.4851981936778724, + "grad_norm": 18.285294232389813, + "learning_rate": 5.951955545823342e-07, + "logits/chosen": -0.021242473274469376, + "logits/rejected": 0.047610148787498474, + "logps/chosen": -4.324985504150391, + "logps/rejected": -5.272888660430908, + "loss": 0.5546, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.324985504150391, + "rewards/margins": 0.9479031562805176, + "rewards/rejected": -5.272888660430908, + "sft_loss": 4.410521507263184, + "step": 2775 + }, + { + "epoch": 1.4878742264592741, + "grad_norm": 14.359212704553558, + "learning_rate": 5.936661801080263e-07, + "logits/chosen": -0.0435919351875782, + "logits/rejected": 0.043762434273958206, + "logps/chosen": -4.3246612548828125, + "logps/rejected": -5.148694038391113, + "loss": 0.5852, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.3246612548828125, + "rewards/margins": 0.824032187461853, + "rewards/rejected": -5.148694038391113, + "sft_loss": 4.269598960876465, + "step": 2780 + }, + { + "epoch": 1.4905502592406756, + "grad_norm": 14.576563616406881, + "learning_rate": 5.92135896884028e-07, + "logits/chosen": -0.05614321678876877, + "logits/rejected": 0.0751347541809082, + "logps/chosen": -4.400944709777832, + "logps/rejected": -5.452885627746582, + "loss": 0.5033, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.400944709777832, + "rewards/margins": 1.0519407987594604, + "rewards/rejected": -5.452885627746582, + "sft_loss": 4.369610786437988, + "step": 2785 + }, + { + "epoch": 1.4932262920220774, + "grad_norm": 20.812593709518424, + "learning_rate": 5.906047197571541e-07, + "logits/chosen": -0.05414440482854843, + "logits/rejected": -0.05643627047538757, + "logps/chosen": -4.343169689178467, + "logps/rejected": -5.196185111999512, + "loss": 0.5817, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.343169689178467, + "rewards/margins": 0.8530157208442688, + "rewards/rejected": -5.196185111999512, + "sft_loss": 4.5007429122924805, + "step": 2790 + }, + { + "epoch": 1.4959023248034788, + "grad_norm": 15.582274519884054, + "learning_rate": 5.890726635828919e-07, + "logits/chosen": 0.011871114373207092, + "logits/rejected": 0.017706722021102905, + "logps/chosen": -4.198437690734863, + "logps/rejected": -5.182080268859863, + "loss": 0.5529, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.198437690734863, + "rewards/margins": 0.9836423993110657, + "rewards/rejected": -5.182080268859863, + "sft_loss": 4.292941093444824, + "step": 2795 + }, + { + "epoch": 1.4985783575848803, + "grad_norm": 17.999513649508582, + "learning_rate": 5.875397432252569e-07, + "logits/chosen": -0.06686706840991974, + "logits/rejected": 0.0006932914257049561, + "logps/chosen": -4.182742595672607, + "logps/rejected": -5.087394714355469, + "loss": 0.5393, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.182742595672607, + "rewards/margins": 0.9046524167060852, + "rewards/rejected": -5.087394714355469, + "sft_loss": 4.188513278961182, + "step": 2800 + }, + { + "epoch": 1.4985783575848803, + "eval_logits/chosen": 0.2569299042224884, + "eval_logits/rejected": 0.3460070788860321, + "eval_logps/chosen": -4.325809955596924, + "eval_logps/rejected": -5.222593784332275, + "eval_loss": 0.5736204981803894, + "eval_rewards/accuracies": 0.7255192995071411, + "eval_rewards/chosen": -4.325809955596924, + "eval_rewards/margins": 0.8967837691307068, + "eval_rewards/rejected": -5.222593784332275, + "eval_runtime": 43.14, + "eval_samples_per_second": 31.178, + "eval_sft_loss": 4.350619316101074, + "eval_steps_per_second": 7.812, + "step": 2800 + }, + { + "epoch": 1.5012543903662818, + "grad_norm": 14.121410420431141, + "learning_rate": 5.860059735566491e-07, + "logits/chosen": -0.13204196095466614, + "logits/rejected": 0.014378545805811882, + "logps/chosen": -4.165450096130371, + "logps/rejected": -5.127333164215088, + "loss": 0.5343, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.165450096130371, + "rewards/margins": 0.9618828892707825, + "rewards/rejected": -5.127333164215088, + "sft_loss": 4.18618106842041, + "step": 2805 + }, + { + "epoch": 1.5039304231476835, + "grad_norm": 19.46324147029196, + "learning_rate": 5.844713694577087e-07, + "logits/chosen": -0.035107698291540146, + "logits/rejected": 0.040776319801807404, + "logps/chosen": -4.294785022735596, + "logps/rejected": -5.255988121032715, + "loss": 0.5356, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.294785022735596, + "rewards/margins": 0.9612032771110535, + "rewards/rejected": -5.255988121032715, + "sft_loss": 4.381821632385254, + "step": 2810 + }, + { + "epoch": 1.5066064559290853, + "grad_norm": 14.028530499360638, + "learning_rate": 5.829359458171714e-07, + "logits/chosen": 0.010254351422190666, + "logits/rejected": 0.11219009011983871, + "logps/chosen": -4.247211456298828, + "logps/rejected": -5.452017307281494, + "loss": 0.4634, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.247211456298828, + "rewards/margins": 1.2048051357269287, + "rewards/rejected": -5.452017307281494, + "sft_loss": 4.285406112670898, + "step": 2815 + }, + { + "epoch": 1.5092824887104868, + "grad_norm": 16.72202278142961, + "learning_rate": 5.81399717531724e-07, + "logits/chosen": -0.039016492664813995, + "logits/rejected": 0.13155677914619446, + "logps/chosen": -4.415073394775391, + "logps/rejected": -5.258774757385254, + "loss": 0.5975, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.415073394775391, + "rewards/margins": 0.8437017202377319, + "rewards/rejected": -5.258774757385254, + "sft_loss": 4.524782657623291, + "step": 2820 + }, + { + "epoch": 1.5119585214918883, + "grad_norm": 16.689398458973308, + "learning_rate": 5.798626995058602e-07, + "logits/chosen": -0.10748390853404999, + "logits/rejected": 0.059959232807159424, + "logps/chosen": -4.438462734222412, + "logps/rejected": -5.426039695739746, + "loss": 0.5321, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.438462734222412, + "rewards/margins": 0.9875761866569519, + "rewards/rejected": -5.426039695739746, + "sft_loss": 4.467665195465088, + "step": 2825 + }, + { + "epoch": 1.51463455427329, + "grad_norm": 14.3991656602208, + "learning_rate": 5.783249066517354e-07, + "logits/chosen": -0.08068076521158218, + "logits/rejected": 0.029455062001943588, + "logps/chosen": -4.187173843383789, + "logps/rejected": -5.219720363616943, + "loss": 0.513, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.187173843383789, + "rewards/margins": 1.032546043395996, + "rewards/rejected": -5.219720363616943, + "sft_loss": 4.1944260597229, + "step": 2830 + }, + { + "epoch": 1.5173105870546915, + "grad_norm": 21.016670098652725, + "learning_rate": 5.767863538890228e-07, + "logits/chosen": -0.06701847165822983, + "logits/rejected": 0.07092493772506714, + "logps/chosen": -4.410538196563721, + "logps/rejected": -5.553727149963379, + "loss": 0.4838, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.410538196563721, + "rewards/margins": 1.1431890726089478, + "rewards/rejected": -5.553727149963379, + "sft_loss": 4.4488654136657715, + "step": 2835 + }, + { + "epoch": 1.519986619836093, + "grad_norm": 19.997525903684426, + "learning_rate": 5.75247056144768e-07, + "logits/chosen": -0.07559563219547272, + "logits/rejected": -0.018185529857873917, + "logps/chosen": -4.372363090515137, + "logps/rejected": -5.206852912902832, + "loss": 0.6082, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.372363090515137, + "rewards/margins": 0.8344897031784058, + "rewards/rejected": -5.206852912902832, + "sft_loss": 4.432497978210449, + "step": 2840 + }, + { + "epoch": 1.5226626526174947, + "grad_norm": 18.233783403550085, + "learning_rate": 5.737070283532444e-07, + "logits/chosen": 0.005519936792552471, + "logits/rejected": 0.06497209519147873, + "logps/chosen": -4.373464584350586, + "logps/rejected": -5.253887176513672, + "loss": 0.6108, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.373464584350586, + "rewards/margins": 0.8804221153259277, + "rewards/rejected": -5.253887176513672, + "sft_loss": 4.2910027503967285, + "step": 2845 + }, + { + "epoch": 1.5253386853988962, + "grad_norm": 15.533337761409932, + "learning_rate": 5.721662854558084e-07, + "logits/chosen": -0.05558420345187187, + "logits/rejected": 0.037349000573158264, + "logps/chosen": -4.265897750854492, + "logps/rejected": -5.312526226043701, + "loss": 0.5117, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.265897750854492, + "rewards/margins": 1.046628713607788, + "rewards/rejected": -5.312526226043701, + "sft_loss": 4.260438442230225, + "step": 2850 + }, + { + "epoch": 1.5280147181802977, + "grad_norm": 15.660868862531949, + "learning_rate": 5.706248424007545e-07, + "logits/chosen": -0.1164829283952713, + "logits/rejected": 0.04899804666638374, + "logps/chosen": -4.255067825317383, + "logps/rejected": -5.125923156738281, + "loss": 0.5641, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.255067825317383, + "rewards/margins": 0.8708555102348328, + "rewards/rejected": -5.125923156738281, + "sft_loss": 4.302035331726074, + "step": 2855 + }, + { + "epoch": 1.5306907509616994, + "grad_norm": 19.828532059494787, + "learning_rate": 5.690827141431699e-07, + "logits/chosen": -0.12655727565288544, + "logits/rejected": 0.047389380633831024, + "logps/chosen": -4.236365795135498, + "logps/rejected": -5.012572288513184, + "loss": 0.559, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.236365795135498, + "rewards/margins": 0.776206910610199, + "rewards/rejected": -5.012572288513184, + "sft_loss": 4.229098320007324, + "step": 2860 + }, + { + "epoch": 1.5333667837431009, + "grad_norm": 21.900113408790784, + "learning_rate": 5.675399156447897e-07, + "logits/chosen": -0.11166088283061981, + "logits/rejected": -0.0014119551051408052, + "logps/chosen": -4.32798433303833, + "logps/rejected": -5.046576499938965, + "loss": 0.6016, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.32798433303833, + "rewards/margins": 0.7185924649238586, + "rewards/rejected": -5.046576499938965, + "sft_loss": 4.378843784332275, + "step": 2865 + }, + { + "epoch": 1.5360428165245024, + "grad_norm": 17.076733572048365, + "learning_rate": 5.659964618738515e-07, + "logits/chosen": -0.06736056506633759, + "logits/rejected": 0.03894350677728653, + "logps/chosen": -4.179556846618652, + "logps/rejected": -5.024938583374023, + "loss": 0.5693, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.179556846618652, + "rewards/margins": 0.8453825116157532, + "rewards/rejected": -5.024938583374023, + "sft_loss": 4.135404109954834, + "step": 2870 + }, + { + "epoch": 1.538718849305904, + "grad_norm": 18.572646119062384, + "learning_rate": 5.644523678049509e-07, + "logits/chosen": -0.08097034692764282, + "logits/rejected": 0.03127686679363251, + "logps/chosen": -4.0773797035217285, + "logps/rejected": -4.944389820098877, + "loss": 0.5427, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.0773797035217285, + "rewards/margins": 0.867010772228241, + "rewards/rejected": -4.944389820098877, + "sft_loss": 4.0974225997924805, + "step": 2875 + }, + { + "epoch": 1.5413948820873056, + "grad_norm": 15.435970154352313, + "learning_rate": 5.629076484188952e-07, + "logits/chosen": 0.0010766386985778809, + "logits/rejected": 0.09980317950248718, + "logps/chosen": -3.981600284576416, + "logps/rejected": -4.928833961486816, + "loss": 0.5176, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.981600284576416, + "rewards/margins": 0.9472335577011108, + "rewards/rejected": -4.928833961486816, + "sft_loss": 4.042459011077881, + "step": 2880 + }, + { + "epoch": 1.544070914868707, + "grad_norm": 16.972547805466146, + "learning_rate": 5.613623187025587e-07, + "logits/chosen": -0.06025050953030586, + "logits/rejected": 0.05759192630648613, + "logps/chosen": -4.010976314544678, + "logps/rejected": -4.981903076171875, + "loss": 0.5245, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.010976314544678, + "rewards/margins": 0.9709264636039734, + "rewards/rejected": -4.981903076171875, + "sft_loss": 4.036219596862793, + "step": 2885 + }, + { + "epoch": 1.5467469476501088, + "grad_norm": 15.801610924913678, + "learning_rate": 5.598163936487369e-07, + "logits/chosen": -0.10592453181743622, + "logits/rejected": 0.04858104884624481, + "logps/chosen": -4.068356037139893, + "logps/rejected": -5.140035152435303, + "loss": 0.5061, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.068356037139893, + "rewards/margins": 1.0716798305511475, + "rewards/rejected": -5.140035152435303, + "sft_loss": 4.047433376312256, + "step": 2890 + }, + { + "epoch": 1.5494229804315103, + "grad_norm": 17.01761497153551, + "learning_rate": 5.582698882560017e-07, + "logits/chosen": -0.060587383806705475, + "logits/rejected": 0.06671730428934097, + "logps/chosen": -4.050535678863525, + "logps/rejected": -4.982683181762695, + "loss": 0.5429, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.050535678863525, + "rewards/margins": 0.9321478009223938, + "rewards/rejected": -4.982683181762695, + "sft_loss": 4.0685577392578125, + "step": 2895 + }, + { + "epoch": 1.5520990132129118, + "grad_norm": 15.744499216863726, + "learning_rate": 5.567228175285549e-07, + "logits/chosen": -0.02025572769343853, + "logits/rejected": 0.0811544805765152, + "logps/chosen": -4.085963726043701, + "logps/rejected": -5.134238243103027, + "loss": 0.4891, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.085963726043701, + "rewards/margins": 1.0482741594314575, + "rewards/rejected": -5.134238243103027, + "sft_loss": 4.075760841369629, + "step": 2900 + }, + { + "epoch": 1.5547750459943135, + "grad_norm": 18.491593244869883, + "learning_rate": 5.551751964760838e-07, + "logits/chosen": 0.0378764346241951, + "logits/rejected": 0.06506892293691635, + "logps/chosen": -4.117175102233887, + "logps/rejected": -5.12311315536499, + "loss": 0.5095, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.117175102233887, + "rewards/margins": 1.0059373378753662, + "rewards/rejected": -5.12311315536499, + "sft_loss": 4.185202598571777, + "step": 2905 + }, + { + "epoch": 1.557451078775715, + "grad_norm": 23.070900818143077, + "learning_rate": 5.536270401136145e-07, + "logits/chosen": -0.073255255818367, + "logits/rejected": 0.017674123868346214, + "logps/chosen": -4.214170932769775, + "logps/rejected": -5.127688884735107, + "loss": 0.5452, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.214170932769775, + "rewards/margins": 0.9135181307792664, + "rewards/rejected": -5.127688884735107, + "sft_loss": 4.309821605682373, + "step": 2910 + }, + { + "epoch": 1.5601271115571165, + "grad_norm": 23.941503621141138, + "learning_rate": 5.520783634613667e-07, + "logits/chosen": -0.02814924158155918, + "logits/rejected": 0.14941860735416412, + "logps/chosen": -4.352537155151367, + "logps/rejected": -5.3630266189575195, + "loss": 0.5536, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.352537155151367, + "rewards/margins": 1.0104889869689941, + "rewards/rejected": -5.3630266189575195, + "sft_loss": 4.485054969787598, + "step": 2915 + }, + { + "epoch": 1.5628031443385182, + "grad_norm": 21.75450442026472, + "learning_rate": 5.505291815446082e-07, + "logits/chosen": -0.05518549680709839, + "logits/rejected": 0.041710685938596725, + "logps/chosen": -4.426823139190674, + "logps/rejected": -5.402629375457764, + "loss": 0.5684, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.426823139190674, + "rewards/margins": 0.9758065938949585, + "rewards/rejected": -5.402629375457764, + "sft_loss": 4.535459041595459, + "step": 2920 + }, + { + "epoch": 1.5654791771199197, + "grad_norm": 19.47160041089576, + "learning_rate": 5.489795093935089e-07, + "logits/chosen": -0.007011513225734234, + "logits/rejected": 0.0902138352394104, + "logps/chosen": -4.3132548332214355, + "logps/rejected": -5.257037162780762, + "loss": 0.559, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.3132548332214355, + "rewards/margins": 0.9437819719314575, + "rewards/rejected": -5.257037162780762, + "sft_loss": 4.44458532333374, + "step": 2925 + }, + { + "epoch": 1.5681552099013212, + "grad_norm": 16.69099412236686, + "learning_rate": 5.474293620429946e-07, + "logits/chosen": -0.1297324001789093, + "logits/rejected": 0.018423620611429214, + "logps/chosen": -4.200294494628906, + "logps/rejected": -5.5349555015563965, + "loss": 0.4807, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.200294494628906, + "rewards/margins": 1.3346607685089111, + "rewards/rejected": -5.5349555015563965, + "sft_loss": 4.3675408363342285, + "step": 2930 + }, + { + "epoch": 1.570831242682723, + "grad_norm": 17.542272466508237, + "learning_rate": 5.458787545326018e-07, + "logits/chosen": -0.07640831172466278, + "logits/rejected": 0.025533486157655716, + "logps/chosen": -4.482228755950928, + "logps/rejected": -5.5046586990356445, + "loss": 0.5226, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.482228755950928, + "rewards/margins": 1.022430181503296, + "rewards/rejected": -5.5046586990356445, + "sft_loss": 4.561526298522949, + "step": 2935 + }, + { + "epoch": 1.5735072754641244, + "grad_norm": 18.032181115430728, + "learning_rate": 5.443277019063311e-07, + "logits/chosen": -0.10058959573507309, + "logits/rejected": 0.06195586919784546, + "logps/chosen": -4.4972920417785645, + "logps/rejected": -5.676342964172363, + "loss": 0.5305, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.4972920417785645, + "rewards/margins": 1.1790510416030884, + "rewards/rejected": -5.676342964172363, + "sft_loss": 4.620232582092285, + "step": 2940 + }, + { + "epoch": 1.5761833082455259, + "grad_norm": 26.202395628359643, + "learning_rate": 5.427762192125023e-07, + "logits/chosen": -0.039661847054958344, + "logits/rejected": 0.07111156731843948, + "logps/chosen": -4.454710960388184, + "logps/rejected": -5.439589023590088, + "loss": 0.5548, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.454710960388184, + "rewards/margins": 0.9848777055740356, + "rewards/rejected": -5.439589023590088, + "sft_loss": 4.483078956604004, + "step": 2945 + }, + { + "epoch": 1.5788593410269276, + "grad_norm": 19.59733889441095, + "learning_rate": 5.41224321503607e-07, + "logits/chosen": -0.05810501426458359, + "logits/rejected": 0.1393880695104599, + "logps/chosen": -4.310373306274414, + "logps/rejected": -5.385622501373291, + "loss": 0.4906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.310373306274414, + "rewards/margins": 1.0752495527267456, + "rewards/rejected": -5.385622501373291, + "sft_loss": 4.3996477127075195, + "step": 2950 + }, + { + "epoch": 1.5815353738083293, + "grad_norm": 22.137994578470998, + "learning_rate": 5.396720238361637e-07, + "logits/chosen": -0.016352495178580284, + "logits/rejected": 0.0893014445900917, + "logps/chosen": -4.396595478057861, + "logps/rejected": -5.309807777404785, + "loss": 0.5674, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.396595478057861, + "rewards/margins": 0.9132122993469238, + "rewards/rejected": -5.309807777404785, + "sft_loss": 4.540419101715088, + "step": 2955 + }, + { + "epoch": 1.5842114065897306, + "grad_norm": 14.24671095972288, + "learning_rate": 5.381193412705711e-07, + "logits/chosen": -0.11190303415060043, + "logits/rejected": 0.024014584720134735, + "logps/chosen": -4.204039096832275, + "logps/rejected": -5.1999311447143555, + "loss": 0.4986, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.204039096832275, + "rewards/margins": 0.9958921670913696, + "rewards/rejected": -5.1999311447143555, + "sft_loss": 4.269665718078613, + "step": 2960 + }, + { + "epoch": 1.5868874393711323, + "grad_norm": 17.965126451922142, + "learning_rate": 5.365662888709622e-07, + "logits/chosen": -0.09304491430521011, + "logits/rejected": 0.029250601306557655, + "logps/chosen": -4.340491771697998, + "logps/rejected": -5.416264057159424, + "loss": 0.5196, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.340491771697998, + "rewards/margins": 1.0757719278335571, + "rewards/rejected": -5.416264057159424, + "sft_loss": 4.462465763092041, + "step": 2965 + }, + { + "epoch": 1.589563472152534, + "grad_norm": 18.49642126158975, + "learning_rate": 5.350128817050585e-07, + "logits/chosen": -0.11003684997558594, + "logits/rejected": 0.03975386545062065, + "logps/chosen": -4.333467483520508, + "logps/rejected": -5.315537452697754, + "loss": 0.5351, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.333467483520508, + "rewards/margins": 0.9820694923400879, + "rewards/rejected": -5.315537452697754, + "sft_loss": 4.381396293640137, + "step": 2970 + }, + { + "epoch": 1.5922395049339353, + "grad_norm": 22.539386691478214, + "learning_rate": 5.334591348440229e-07, + "logits/chosen": -0.0853196457028389, + "logits/rejected": 0.05843697860836983, + "logps/chosen": -4.250127792358398, + "logps/rejected": -5.1255669593811035, + "loss": 0.5554, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.250127792358398, + "rewards/margins": 0.8754390478134155, + "rewards/rejected": -5.1255669593811035, + "sft_loss": 4.269168853759766, + "step": 2975 + }, + { + "epoch": 1.594915537715337, + "grad_norm": 14.453356313159919, + "learning_rate": 5.319050633623141e-07, + "logits/chosen": -0.113250732421875, + "logits/rejected": 0.03299799561500549, + "logps/chosen": -4.246047496795654, + "logps/rejected": -5.113133430480957, + "loss": 0.5179, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.246047496795654, + "rewards/margins": 0.8670861124992371, + "rewards/rejected": -5.113133430480957, + "sft_loss": 4.2835211753845215, + "step": 2980 + }, + { + "epoch": 1.5975915704967387, + "grad_norm": 22.56856552556845, + "learning_rate": 5.303506823375409e-07, + "logits/chosen": -0.09829998016357422, + "logits/rejected": 0.08882582932710648, + "logps/chosen": -4.22990608215332, + "logps/rejected": -5.405078887939453, + "loss": 0.5154, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.22990608215332, + "rewards/margins": 1.1751729249954224, + "rewards/rejected": -5.405078887939453, + "sft_loss": 4.287569046020508, + "step": 2985 + }, + { + "epoch": 1.60026760327814, + "grad_norm": 15.64837650021201, + "learning_rate": 5.287960068503143e-07, + "logits/chosen": -0.11544670164585114, + "logits/rejected": 0.04846612364053726, + "logps/chosen": -4.176478385925293, + "logps/rejected": -5.209988117218018, + "loss": 0.498, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.176478385925293, + "rewards/margins": 1.0335088968276978, + "rewards/rejected": -5.209988117218018, + "sft_loss": 4.232048034667969, + "step": 2990 + }, + { + "epoch": 1.6029436360595417, + "grad_norm": 20.047395770434285, + "learning_rate": 5.272410519841032e-07, + "logits/chosen": -0.0673823282122612, + "logits/rejected": 0.05862687900662422, + "logps/chosen": -4.264988899230957, + "logps/rejected": -5.426611423492432, + "loss": 0.5053, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.264988899230957, + "rewards/margins": 1.1616226434707642, + "rewards/rejected": -5.426611423492432, + "sft_loss": 4.324214935302734, + "step": 2995 + }, + { + "epoch": 1.6056196688409434, + "grad_norm": 12.420600674643524, + "learning_rate": 5.256858328250861e-07, + "logits/chosen": -0.08667416870594025, + "logits/rejected": 0.07295207679271698, + "logps/chosen": -4.2573137283325195, + "logps/rejected": -5.147179126739502, + "loss": 0.5647, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.2573137283325195, + "rewards/margins": 0.8898651003837585, + "rewards/rejected": -5.147179126739502, + "sft_loss": 4.249060153961182, + "step": 3000 + }, + { + "epoch": 1.608295701622345, + "grad_norm": 34.817428653201006, + "learning_rate": 5.241303644620063e-07, + "logits/chosen": -0.17020973563194275, + "logits/rejected": -0.02446267567574978, + "logps/chosen": -4.2911906242370605, + "logps/rejected": -5.05643892288208, + "loss": 0.6009, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.2911906242370605, + "rewards/margins": 0.7652486562728882, + "rewards/rejected": -5.05643892288208, + "sft_loss": 4.294539451599121, + "step": 3005 + }, + { + "epoch": 1.6109717344037464, + "grad_norm": 25.68838864785532, + "learning_rate": 5.225746619860248e-07, + "logits/chosen": -0.13014598190784454, + "logits/rejected": -0.024837497621774673, + "logps/chosen": -4.2041826248168945, + "logps/rejected": -5.022336006164551, + "loss": 0.6079, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.2041826248168945, + "rewards/margins": 0.8181535005569458, + "rewards/rejected": -5.022336006164551, + "sft_loss": 4.219718933105469, + "step": 3010 + }, + { + "epoch": 1.6136477671851481, + "grad_norm": 21.011750396811554, + "learning_rate": 5.210187404905735e-07, + "logits/chosen": 0.0073079378344118595, + "logits/rejected": 0.06562227755784988, + "logps/chosen": -4.269503593444824, + "logps/rejected": -5.181615352630615, + "loss": 0.5409, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.269503593444824, + "rewards/margins": 0.912111759185791, + "rewards/rejected": -5.181615352630615, + "sft_loss": 4.28148889541626, + "step": 3015 + }, + { + "epoch": 1.6163237999665496, + "grad_norm": 14.804538548601718, + "learning_rate": 5.194626150712098e-07, + "logits/chosen": -0.07849368453025818, + "logits/rejected": 0.01196741871535778, + "logps/chosen": -4.193101406097412, + "logps/rejected": -4.992875099182129, + "loss": 0.5668, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.193101406097412, + "rewards/margins": 0.7997739315032959, + "rewards/rejected": -4.992875099182129, + "sft_loss": 4.28323221206665, + "step": 3020 + }, + { + "epoch": 1.6189998327479511, + "grad_norm": 15.709663615878485, + "learning_rate": 5.179063008254695e-07, + "logits/chosen": -0.11467244476079941, + "logits/rejected": 0.026135023683309555, + "logps/chosen": -4.037603855133057, + "logps/rejected": -4.962246894836426, + "loss": 0.5525, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.037603855133057, + "rewards/margins": 0.9246425628662109, + "rewards/rejected": -4.962246894836426, + "sft_loss": 4.146195411682129, + "step": 3025 + }, + { + "epoch": 1.6216758655293528, + "grad_norm": 18.153743062770673, + "learning_rate": 5.163498128527199e-07, + "logits/chosen": -0.049486517906188965, + "logits/rejected": 0.07292356342077255, + "logps/chosen": -4.202414512634277, + "logps/rejected": -5.069171905517578, + "loss": 0.5718, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.202414512634277, + "rewards/margins": 0.866757869720459, + "rewards/rejected": -5.069171905517578, + "sft_loss": 4.199142932891846, + "step": 3030 + }, + { + "epoch": 1.6243518983107543, + "grad_norm": 19.154269364694123, + "learning_rate": 5.147931662540144e-07, + "logits/chosen": -0.005182699766010046, + "logits/rejected": 0.10528840124607086, + "logps/chosen": -4.11621618270874, + "logps/rejected": -4.913527488708496, + "loss": 0.5481, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.11621618270874, + "rewards/margins": 0.7973116636276245, + "rewards/rejected": -4.913527488708496, + "sft_loss": 4.1073455810546875, + "step": 3035 + }, + { + "epoch": 1.6270279310921558, + "grad_norm": 20.68341814608387, + "learning_rate": 5.132363761319449e-07, + "logits/chosen": -0.10306306183338165, + "logits/rejected": -0.03484489768743515, + "logps/chosen": -4.04842472076416, + "logps/rejected": -5.121028423309326, + "loss": 0.5202, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.04842472076416, + "rewards/margins": 1.0726039409637451, + "rewards/rejected": -5.121028423309326, + "sft_loss": 4.06124210357666, + "step": 3040 + }, + { + "epoch": 1.6297039638735575, + "grad_norm": 29.02963400980283, + "learning_rate": 5.116794575904962e-07, + "logits/chosen": -0.060677748173475266, + "logits/rejected": 0.045880045741796494, + "logps/chosen": -3.9956302642822266, + "logps/rejected": -4.8577165603637695, + "loss": 0.5514, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.9956302642822266, + "rewards/margins": 0.8620861768722534, + "rewards/rejected": -4.8577165603637695, + "sft_loss": 4.059117317199707, + "step": 3045 + }, + { + "epoch": 1.632379996654959, + "grad_norm": 13.869042600567854, + "learning_rate": 5.101224257348987e-07, + "logits/chosen": -0.07129235565662384, + "logits/rejected": 0.05507662147283554, + "logps/chosen": -4.214221954345703, + "logps/rejected": -5.314478397369385, + "loss": 0.4822, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.214221954345703, + "rewards/margins": 1.1002566814422607, + "rewards/rejected": -5.314478397369385, + "sft_loss": 4.279477596282959, + "step": 3050 + }, + { + "epoch": 1.6350560294363605, + "grad_norm": 16.093245713923835, + "learning_rate": 5.085652956714823e-07, + "logits/chosen": -0.11817373335361481, + "logits/rejected": 0.01352071762084961, + "logps/chosen": -4.390268325805664, + "logps/rejected": -5.223195552825928, + "loss": 0.5693, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.390268325805664, + "rewards/margins": 0.8329275250434875, + "rewards/rejected": -5.223195552825928, + "sft_loss": 4.474215507507324, + "step": 3055 + }, + { + "epoch": 1.6377320622177622, + "grad_norm": 17.82188236509395, + "learning_rate": 5.070080825075298e-07, + "logits/chosen": -0.07951287925243378, + "logits/rejected": 0.07106980681419373, + "logps/chosen": -4.244257926940918, + "logps/rejected": -5.166312217712402, + "loss": 0.5795, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.244257926940918, + "rewards/margins": 0.9220544695854187, + "rewards/rejected": -5.166312217712402, + "sft_loss": 4.341263294219971, + "step": 3060 + }, + { + "epoch": 1.6404080949991637, + "grad_norm": 19.49501064735196, + "learning_rate": 5.0545080135113e-07, + "logits/chosen": -0.05764093995094299, + "logits/rejected": 0.01515720784664154, + "logps/chosen": -4.19266939163208, + "logps/rejected": -5.072085380554199, + "loss": 0.5839, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.19266939163208, + "rewards/margins": 0.8794158101081848, + "rewards/rejected": -5.072085380554199, + "sft_loss": 4.263916015625, + "step": 3065 + }, + { + "epoch": 1.6430841277805652, + "grad_norm": 23.97253984730568, + "learning_rate": 5.038934673110316e-07, + "logits/chosen": -0.11488769948482513, + "logits/rejected": -0.014919767156243324, + "logps/chosen": -4.181607246398926, + "logps/rejected": -5.144347190856934, + "loss": 0.5614, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.181607246398926, + "rewards/margins": 0.9627391695976257, + "rewards/rejected": -5.144347190856934, + "sft_loss": 4.271900653839111, + "step": 3070 + }, + { + "epoch": 1.645760160561967, + "grad_norm": 19.499945514414925, + "learning_rate": 5.023360954964963e-07, + "logits/chosen": -0.11179818958044052, + "logits/rejected": -0.02782334014773369, + "logps/chosen": -4.056575298309326, + "logps/rejected": -4.9648356437683105, + "loss": 0.5052, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.056575298309326, + "rewards/margins": 0.9082603454589844, + "rewards/rejected": -4.9648356437683105, + "sft_loss": 4.106502532958984, + "step": 3075 + }, + { + "epoch": 1.6484361933433684, + "grad_norm": 15.566938456996795, + "learning_rate": 5.007787010171524e-07, + "logits/chosen": -0.15935197472572327, + "logits/rejected": -0.006320520304143429, + "logps/chosen": -3.8887240886688232, + "logps/rejected": -4.9238152503967285, + "loss": 0.4759, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8887240886688232, + "rewards/margins": 1.035091757774353, + "rewards/rejected": -4.9238152503967285, + "sft_loss": 3.980847120285034, + "step": 3080 + }, + { + "epoch": 1.65111222612477, + "grad_norm": 18.645578002041546, + "learning_rate": 4.992212989828477e-07, + "logits/chosen": -0.04259735345840454, + "logits/rejected": -0.012678694911301136, + "logps/chosen": -4.119019985198975, + "logps/rejected": -4.899888038635254, + "loss": 0.5709, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.119019985198975, + "rewards/margins": 0.7808682322502136, + "rewards/rejected": -4.899888038635254, + "sft_loss": 4.167031288146973, + "step": 3085 + }, + { + "epoch": 1.6537882589061716, + "grad_norm": 21.47625387200884, + "learning_rate": 4.976639045035036e-07, + "logits/chosen": -0.05810439586639404, + "logits/rejected": 0.004169926047325134, + "logps/chosen": -4.052678108215332, + "logps/rejected": -4.817889213562012, + "loss": 0.6303, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.052678108215332, + "rewards/margins": 0.7652114629745483, + "rewards/rejected": -4.817889213562012, + "sft_loss": 4.158685684204102, + "step": 3090 + }, + { + "epoch": 1.6564642916875731, + "grad_norm": 19.301067077983053, + "learning_rate": 4.961065326889683e-07, + "logits/chosen": -0.06106138974428177, + "logits/rejected": 0.04724999517202377, + "logps/chosen": -4.15666389465332, + "logps/rejected": -4.9704060554504395, + "loss": 0.5664, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.15666389465332, + "rewards/margins": 0.8137421607971191, + "rewards/rejected": -4.9704060554504395, + "sft_loss": 4.189563751220703, + "step": 3095 + }, + { + "epoch": 1.6591403244689746, + "grad_norm": 18.182766248763908, + "learning_rate": 4.9454919864887e-07, + "logits/chosen": -0.16863279044628143, + "logits/rejected": -0.05148882791399956, + "logps/chosen": -4.012679576873779, + "logps/rejected": -4.946879863739014, + "loss": 0.5421, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.012679576873779, + "rewards/margins": 0.9341999292373657, + "rewards/rejected": -4.946879863739014, + "sft_loss": 4.170138359069824, + "step": 3100 + }, + { + "epoch": 1.6618163572503764, + "grad_norm": 19.05104896048302, + "learning_rate": 4.929919174924701e-07, + "logits/chosen": -0.15180808305740356, + "logits/rejected": -0.02108634077012539, + "logps/chosen": -3.9979186058044434, + "logps/rejected": -4.824479103088379, + "loss": 0.5475, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.9979186058044434, + "rewards/margins": 0.8265607953071594, + "rewards/rejected": -4.824479103088379, + "sft_loss": 4.137550354003906, + "step": 3105 + }, + { + "epoch": 1.6644923900317778, + "grad_norm": 15.934164660261775, + "learning_rate": 4.914347043285177e-07, + "logits/chosen": -0.08919097483158112, + "logits/rejected": 0.017639994621276855, + "logps/chosen": -4.064725875854492, + "logps/rejected": -5.006443500518799, + "loss": 0.516, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.064725875854492, + "rewards/margins": 0.9417168498039246, + "rewards/rejected": -5.006443500518799, + "sft_loss": 4.004981994628906, + "step": 3110 + }, + { + "epoch": 1.6671684228131793, + "grad_norm": 17.991808063740628, + "learning_rate": 4.898775742651013e-07, + "logits/chosen": -0.08161447197198868, + "logits/rejected": 0.01697484403848648, + "logps/chosen": -4.060294151306152, + "logps/rejected": -5.105404376983643, + "loss": 0.4824, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.060294151306152, + "rewards/margins": 1.0451104640960693, + "rewards/rejected": -5.105404376983643, + "sft_loss": 4.122883319854736, + "step": 3115 + }, + { + "epoch": 1.669844455594581, + "grad_norm": 12.433180956157578, + "learning_rate": 4.883205424095037e-07, + "logits/chosen": -0.17363722622394562, + "logits/rejected": -0.05253412574529648, + "logps/chosen": -4.116326332092285, + "logps/rejected": -5.152121067047119, + "loss": 0.505, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.116326332092285, + "rewards/margins": 1.0357939004898071, + "rewards/rejected": -5.152121067047119, + "sft_loss": 4.167025566101074, + "step": 3120 + }, + { + "epoch": 1.6725204883759828, + "grad_norm": 17.926724117264285, + "learning_rate": 4.86763623868055e-07, + "logits/chosen": -0.11365020275115967, + "logits/rejected": -0.02593514882028103, + "logps/chosen": -4.264039039611816, + "logps/rejected": -5.186556816101074, + "loss": 0.5399, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.264039039611816, + "rewards/margins": 0.9225172996520996, + "rewards/rejected": -5.186556816101074, + "sft_loss": 4.256202697753906, + "step": 3125 + }, + { + "epoch": 1.675196521157384, + "grad_norm": 21.535233328316657, + "learning_rate": 4.852068337459856e-07, + "logits/chosen": -0.0966586321592331, + "logits/rejected": 0.0101277781650424, + "logps/chosen": -4.302945137023926, + "logps/rejected": -5.152838230133057, + "loss": 0.5384, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.302945137023926, + "rewards/margins": 0.8498929738998413, + "rewards/rejected": -5.152838230133057, + "sft_loss": 4.336602210998535, + "step": 3130 + }, + { + "epoch": 1.6778725539387858, + "grad_norm": 19.348539479077594, + "learning_rate": 4.8365018714728e-07, + "logits/chosen": -0.04772127419710159, + "logits/rejected": 0.019659971818327904, + "logps/chosen": -4.417842864990234, + "logps/rejected": -5.282958984375, + "loss": 0.5443, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.417842864990234, + "rewards/margins": 0.8651165962219238, + "rewards/rejected": -5.282958984375, + "sft_loss": 4.4471940994262695, + "step": 3135 + }, + { + "epoch": 1.6805485867201875, + "grad_norm": 23.45924489258539, + "learning_rate": 4.820936991745304e-07, + "logits/chosen": -0.22262386977672577, + "logits/rejected": -0.12868838012218475, + "logps/chosen": -4.260302543640137, + "logps/rejected": -5.084171772003174, + "loss": 0.549, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.260302543640137, + "rewards/margins": 0.8238682746887207, + "rewards/rejected": -5.084171772003174, + "sft_loss": 4.291085720062256, + "step": 3140 + }, + { + "epoch": 1.6832246195015887, + "grad_norm": 19.19538120962937, + "learning_rate": 4.8053738492879e-07, + "logits/chosen": -0.0920620784163475, + "logits/rejected": 0.010093789547681808, + "logps/chosen": -4.094933986663818, + "logps/rejected": -5.171164512634277, + "loss": 0.4992, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.094933986663818, + "rewards/margins": 1.0762296915054321, + "rewards/rejected": -5.171164512634277, + "sft_loss": 4.0767130851745605, + "step": 3145 + }, + { + "epoch": 1.6859006522829905, + "grad_norm": 22.70065476135362, + "learning_rate": 4.789812595094265e-07, + "logits/chosen": -0.1846626251935959, + "logits/rejected": -0.0954296737909317, + "logps/chosen": -4.166723728179932, + "logps/rejected": -5.224819183349609, + "loss": 0.497, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.166723728179932, + "rewards/margins": 1.0580958127975464, + "rewards/rejected": -5.224819183349609, + "sft_loss": 4.194915771484375, + "step": 3150 + }, + { + "epoch": 1.6885766850643922, + "grad_norm": 15.588264832781391, + "learning_rate": 4.774253380139752e-07, + "logits/chosen": -0.18257483839988708, + "logits/rejected": -0.06619922071695328, + "logps/chosen": -4.142666816711426, + "logps/rejected": -5.177772045135498, + "loss": 0.4982, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.142666816711426, + "rewards/margins": 1.0351049900054932, + "rewards/rejected": -5.177772045135498, + "sft_loss": 4.206771373748779, + "step": 3155 + }, + { + "epoch": 1.6912527178457935, + "grad_norm": 21.630102221903424, + "learning_rate": 4.758696355379936e-07, + "logits/chosen": -0.10724954307079315, + "logits/rejected": -0.12234711647033691, + "logps/chosen": -4.20287561416626, + "logps/rejected": -5.214568138122559, + "loss": 0.5072, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.20287561416626, + "rewards/margins": 1.0116924047470093, + "rewards/rejected": -5.214568138122559, + "sft_loss": 4.358765602111816, + "step": 3160 + }, + { + "epoch": 1.6939287506271952, + "grad_norm": 20.033921886742174, + "learning_rate": 4.743141671749138e-07, + "logits/chosen": -0.2289537489414215, + "logits/rejected": -0.10383538901805878, + "logps/chosen": -4.348308563232422, + "logps/rejected": -5.09984016418457, + "loss": 0.6191, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.348308563232422, + "rewards/margins": 0.7515309453010559, + "rewards/rejected": -5.09984016418457, + "sft_loss": 4.463630199432373, + "step": 3165 + }, + { + "epoch": 1.6966047834085969, + "grad_norm": 17.134255878271137, + "learning_rate": 4.727589480158968e-07, + "logits/chosen": -0.11890985071659088, + "logits/rejected": -0.046903759241104126, + "logps/chosen": -4.308679103851318, + "logps/rejected": -5.342099189758301, + "loss": 0.5077, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.308679103851318, + "rewards/margins": 1.0334198474884033, + "rewards/rejected": -5.342099189758301, + "sft_loss": 4.364779472351074, + "step": 3170 + }, + { + "epoch": 1.6992808161899984, + "grad_norm": 23.161647576779664, + "learning_rate": 4.712039931496855e-07, + "logits/chosen": -0.16019386053085327, + "logits/rejected": -0.0389467254281044, + "logps/chosen": -4.390309810638428, + "logps/rejected": -5.0757060050964355, + "loss": 0.6471, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.390309810638428, + "rewards/margins": 0.685396134853363, + "rewards/rejected": -5.0757060050964355, + "sft_loss": 4.451899528503418, + "step": 3175 + }, + { + "epoch": 1.7019568489713999, + "grad_norm": 16.909814275556286, + "learning_rate": 4.6964931766245905e-07, + "logits/chosen": -0.045469336211681366, + "logits/rejected": 0.010172396898269653, + "logps/chosen": -4.298305511474609, + "logps/rejected": -5.328028678894043, + "loss": 0.5117, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.298305511474609, + "rewards/margins": 1.029723882675171, + "rewards/rejected": -5.328028678894043, + "sft_loss": 4.3115234375, + "step": 3180 + }, + { + "epoch": 1.7046328817528016, + "grad_norm": 20.0691172417075, + "learning_rate": 4.6809493663768575e-07, + "logits/chosen": -0.11708267033100128, + "logits/rejected": -0.07965598255395889, + "logps/chosen": -4.334473133087158, + "logps/rejected": -4.946959018707275, + "loss": 0.6378, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.334473133087158, + "rewards/margins": 0.6124860644340515, + "rewards/rejected": -4.946959018707275, + "sft_loss": 4.341203689575195, + "step": 3185 + }, + { + "epoch": 1.707308914534203, + "grad_norm": 14.375996759345945, + "learning_rate": 4.6654086515597716e-07, + "logits/chosen": -0.17368324100971222, + "logits/rejected": -0.048559121787548065, + "logps/chosen": -4.239495277404785, + "logps/rejected": -5.280592441558838, + "loss": 0.5036, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.239495277404785, + "rewards/margins": 1.0410963296890259, + "rewards/rejected": -5.280592441558838, + "sft_loss": 4.296621799468994, + "step": 3190 + }, + { + "epoch": 1.7099849473156046, + "grad_norm": 15.178072771537483, + "learning_rate": 4.6498711829494154e-07, + "logits/chosen": -0.17286744713783264, + "logits/rejected": -0.07282562553882599, + "logps/chosen": -4.2494306564331055, + "logps/rejected": -5.157985687255859, + "loss": 0.5495, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.2494306564331055, + "rewards/margins": 0.9085548520088196, + "rewards/rejected": -5.157985687255859, + "sft_loss": 4.245641231536865, + "step": 3195 + }, + { + "epoch": 1.7126609800970063, + "grad_norm": 17.146266923159065, + "learning_rate": 4.6343371112903777e-07, + "logits/chosen": -0.07511468231678009, + "logits/rejected": 0.02954399213194847, + "logps/chosen": -4.374552249908447, + "logps/rejected": -5.253323078155518, + "loss": 0.5981, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.374552249908447, + "rewards/margins": 0.8787716031074524, + "rewards/rejected": -5.253323078155518, + "sft_loss": 4.383390426635742, + "step": 3200 + }, + { + "epoch": 1.7126609800970063, + "eval_logits/chosen": 0.1183885782957077, + "eval_logits/rejected": 0.19278573989868164, + "eval_logps/chosen": -4.256961822509766, + "eval_logps/rejected": -5.173403739929199, + "eval_loss": 0.5694618225097656, + "eval_rewards/accuracies": 0.7270029783248901, + "eval_rewards/chosen": -4.256961822509766, + "eval_rewards/margins": 0.9164420366287231, + "eval_rewards/rejected": -5.173403739929199, + "eval_runtime": 43.1761, + "eval_samples_per_second": 31.152, + "eval_sft_loss": 4.277877330780029, + "eval_steps_per_second": 7.805, + "step": 3200 + }, + { + "epoch": 1.7153370128784078, + "grad_norm": 13.739952321881674, + "learning_rate": 4.618806587294291e-07, + "logits/chosen": -0.18275292217731476, + "logits/rejected": -0.08163726329803467, + "logps/chosen": -4.217829704284668, + "logps/rejected": -5.261086940765381, + "loss": 0.5194, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.217829704284668, + "rewards/margins": 1.0432568788528442, + "rewards/rejected": -5.261086940765381, + "sft_loss": 4.270330429077148, + "step": 3205 + }, + { + "epoch": 1.7180130456598093, + "grad_norm": 21.115462789349642, + "learning_rate": 4.603279761638365e-07, + "logits/chosen": -0.1848531812429428, + "logits/rejected": -0.0920737236738205, + "logps/chosen": -4.141186237335205, + "logps/rejected": -4.958422660827637, + "loss": 0.6006, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.141186237335205, + "rewards/margins": 0.8172367215156555, + "rewards/rejected": -4.958422660827637, + "sft_loss": 4.176085472106934, + "step": 3210 + }, + { + "epoch": 1.720689078441211, + "grad_norm": 17.527792344129246, + "learning_rate": 4.5877567849639315e-07, + "logits/chosen": -0.08552250266075134, + "logits/rejected": -0.0013067282270640135, + "logps/chosen": -4.216039657592773, + "logps/rejected": -5.193731307983398, + "loss": 0.5323, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.216039657592773, + "rewards/margins": 0.9776918292045593, + "rewards/rejected": -5.193731307983398, + "sft_loss": 4.2045440673828125, + "step": 3215 + }, + { + "epoch": 1.7233651112226125, + "grad_norm": 16.395034924437546, + "learning_rate": 4.572237807874979e-07, + "logits/chosen": -0.13056273758411407, + "logits/rejected": 0.05236934870481491, + "logps/chosen": -4.405067443847656, + "logps/rejected": -5.288483619689941, + "loss": 0.6144, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.405067443847656, + "rewards/margins": 0.8834161758422852, + "rewards/rejected": -5.288483619689941, + "sft_loss": 4.329681396484375, + "step": 3220 + }, + { + "epoch": 1.726041144004014, + "grad_norm": 17.388519714863275, + "learning_rate": 4.5567229809366895e-07, + "logits/chosen": -0.09959354251623154, + "logits/rejected": 0.0017538412939757109, + "logps/chosen": -4.105210304260254, + "logps/rejected": -4.967142105102539, + "loss": 0.5609, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.105210304260254, + "rewards/margins": 0.8619323968887329, + "rewards/rejected": -4.967142105102539, + "sft_loss": 4.156649589538574, + "step": 3225 + }, + { + "epoch": 1.7287171767854157, + "grad_norm": 24.155114988120655, + "learning_rate": 4.541212454673984e-07, + "logits/chosen": -0.13659390807151794, + "logits/rejected": -0.026909206062555313, + "logps/chosen": -4.200619220733643, + "logps/rejected": -5.389188289642334, + "loss": 0.5156, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.200619220733643, + "rewards/margins": 1.1885693073272705, + "rewards/rejected": -5.389188289642334, + "sft_loss": 4.245565891265869, + "step": 3230 + }, + { + "epoch": 1.7313932095668172, + "grad_norm": 19.98274438302079, + "learning_rate": 4.525706379570055e-07, + "logits/chosen": -0.10791487991809845, + "logits/rejected": -0.030868541449308395, + "logps/chosen": -4.1000776290893555, + "logps/rejected": -5.038941383361816, + "loss": 0.5404, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.1000776290893555, + "rewards/margins": 0.9388639330863953, + "rewards/rejected": -5.038941383361816, + "sft_loss": 4.160983562469482, + "step": 3235 + }, + { + "epoch": 1.7340692423482187, + "grad_norm": 16.181202230471833, + "learning_rate": 4.510204906064911e-07, + "logits/chosen": -0.09710139781236649, + "logits/rejected": -0.011764958500862122, + "logps/chosen": -4.07378625869751, + "logps/rejected": -5.1709818840026855, + "loss": 0.4991, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.07378625869751, + "rewards/margins": 1.0971953868865967, + "rewards/rejected": -5.1709818840026855, + "sft_loss": 4.0234856605529785, + "step": 3240 + }, + { + "epoch": 1.7367452751296204, + "grad_norm": 19.089885220258388, + "learning_rate": 4.4947081845539177e-07, + "logits/chosen": -0.2048162966966629, + "logits/rejected": -0.12096717208623886, + "logps/chosen": -4.21279239654541, + "logps/rejected": -5.121549606323242, + "loss": 0.5607, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.21279239654541, + "rewards/margins": 0.9087567329406738, + "rewards/rejected": -5.121549606323242, + "sft_loss": 4.211321830749512, + "step": 3245 + }, + { + "epoch": 1.739421307911022, + "grad_norm": 16.273027199291345, + "learning_rate": 4.479216365386333e-07, + "logits/chosen": -0.08466310799121857, + "logits/rejected": 0.045869845896959305, + "logps/chosen": -4.143354892730713, + "logps/rejected": -5.182480812072754, + "loss": 0.5185, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.143354892730713, + "rewards/margins": 1.0391263961791992, + "rewards/rejected": -5.182480812072754, + "sft_loss": 4.090240001678467, + "step": 3250 + }, + { + "epoch": 1.7420973406924234, + "grad_norm": 13.724951195026941, + "learning_rate": 4.4637295988638555e-07, + "logits/chosen": -0.09971622377634048, + "logits/rejected": -0.024418365210294724, + "logps/chosen": -4.057034492492676, + "logps/rejected": -5.0171966552734375, + "loss": 0.5262, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.057034492492676, + "rewards/margins": 0.9601619839668274, + "rewards/rejected": -5.0171966552734375, + "sft_loss": 4.123260974884033, + "step": 3255 + }, + { + "epoch": 1.744773373473825, + "grad_norm": 20.907854592456673, + "learning_rate": 4.4482480352391623e-07, + "logits/chosen": -0.2061711847782135, + "logits/rejected": -0.08672699332237244, + "logps/chosen": -4.1651105880737305, + "logps/rejected": -5.111360549926758, + "loss": 0.5392, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.1651105880737305, + "rewards/margins": 0.9462505578994751, + "rewards/rejected": -5.111360549926758, + "sft_loss": 4.181136608123779, + "step": 3260 + }, + { + "epoch": 1.7474494062552266, + "grad_norm": 23.650308827928992, + "learning_rate": 4.4327718247144507e-07, + "logits/chosen": -0.11200736463069916, + "logits/rejected": -0.010071463882923126, + "logps/chosen": -4.173468589782715, + "logps/rejected": -5.128382682800293, + "loss": 0.5304, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.173468589782715, + "rewards/margins": 0.9549140930175781, + "rewards/rejected": -5.128382682800293, + "sft_loss": 4.241128444671631, + "step": 3265 + }, + { + "epoch": 1.750125439036628, + "grad_norm": 22.109028806365686, + "learning_rate": 4.417301117439984e-07, + "logits/chosen": -0.09953314810991287, + "logits/rejected": -0.010271935723721981, + "logps/chosen": -4.216373920440674, + "logps/rejected": -5.176015853881836, + "loss": 0.5498, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.216373920440674, + "rewards/margins": 0.9596416354179382, + "rewards/rejected": -5.176015853881836, + "sft_loss": 4.237557411193848, + "step": 3270 + }, + { + "epoch": 1.7528014718180298, + "grad_norm": 19.706781989174747, + "learning_rate": 4.401836063512631e-07, + "logits/chosen": -0.1606660634279251, + "logits/rejected": 0.051355648785829544, + "logps/chosen": -4.122914791107178, + "logps/rejected": -5.148131370544434, + "loss": 0.5219, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.122914791107178, + "rewards/margins": 1.0252161026000977, + "rewards/rejected": -5.148131370544434, + "sft_loss": 4.13117790222168, + "step": 3275 + }, + { + "epoch": 1.7554775045994313, + "grad_norm": 21.56296392336687, + "learning_rate": 4.386376812974413e-07, + "logits/chosen": -0.1537935584783554, + "logits/rejected": -0.09401834011077881, + "logps/chosen": -4.051243782043457, + "logps/rejected": -5.064203262329102, + "loss": 0.5357, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.051243782043457, + "rewards/margins": 1.0129592418670654, + "rewards/rejected": -5.064203262329102, + "sft_loss": 4.121866226196289, + "step": 3280 + }, + { + "epoch": 1.7581535373808328, + "grad_norm": 17.711612391458424, + "learning_rate": 4.370923515811048e-07, + "logits/chosen": -0.16775628924369812, + "logits/rejected": -0.014869133941829205, + "logps/chosen": -4.1766228675842285, + "logps/rejected": -5.212665557861328, + "loss": 0.5093, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.1766228675842285, + "rewards/margins": 1.0360426902770996, + "rewards/rejected": -5.212665557861328, + "sft_loss": 4.168249130249023, + "step": 3285 + }, + { + "epoch": 1.7608295701622345, + "grad_norm": 13.87655714330121, + "learning_rate": 4.35547632195049e-07, + "logits/chosen": -0.12899205088615417, + "logits/rejected": -0.0327068492770195, + "logps/chosen": -4.141658782958984, + "logps/rejected": -5.138970375061035, + "loss": 0.4945, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.141658782958984, + "rewards/margins": 0.9973119497299194, + "rewards/rejected": -5.138970375061035, + "sft_loss": 4.159889221191406, + "step": 3290 + }, + { + "epoch": 1.763505602943636, + "grad_norm": 19.220576359739102, + "learning_rate": 4.340035381261484e-07, + "logits/chosen": -0.15308910608291626, + "logits/rejected": -0.07327961176633835, + "logps/chosen": -4.265786647796631, + "logps/rejected": -5.289236068725586, + "loss": 0.5383, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.265786647796631, + "rewards/margins": 1.023449182510376, + "rewards/rejected": -5.289236068725586, + "sft_loss": 4.193299770355225, + "step": 3295 + }, + { + "epoch": 1.7661816357250375, + "grad_norm": 20.46061855557512, + "learning_rate": 4.324600843552104e-07, + "logits/chosen": -0.20100100338459015, + "logits/rejected": -0.08928118646144867, + "logps/chosen": -4.368012428283691, + "logps/rejected": -5.361285209655762, + "loss": 0.5534, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.368012428283691, + "rewards/margins": 0.9932721853256226, + "rewards/rejected": -5.361285209655762, + "sft_loss": 4.412505149841309, + "step": 3300 + }, + { + "epoch": 1.7688576685064392, + "grad_norm": 21.193931033259123, + "learning_rate": 4.309172858568302e-07, + "logits/chosen": -0.2275840938091278, + "logits/rejected": -0.08153820037841797, + "logps/chosen": -4.363230228424072, + "logps/rejected": -5.38161563873291, + "loss": 0.5327, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.363230228424072, + "rewards/margins": 1.0183861255645752, + "rewards/rejected": -5.38161563873291, + "sft_loss": 4.3429274559021, + "step": 3305 + }, + { + "epoch": 1.771533701287841, + "grad_norm": 22.862480463300454, + "learning_rate": 4.293751575992455e-07, + "logits/chosen": -0.08414838463068008, + "logits/rejected": -0.03929399698972702, + "logps/chosen": -4.420595645904541, + "logps/rejected": -5.371197700500488, + "loss": 0.5192, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.420595645904541, + "rewards/margins": 0.9506022334098816, + "rewards/rejected": -5.371197700500488, + "sft_loss": 4.414549827575684, + "step": 3310 + }, + { + "epoch": 1.7742097340692422, + "grad_norm": 29.09289354380726, + "learning_rate": 4.278337145441916e-07, + "logits/chosen": -0.16041435301303864, + "logits/rejected": -0.0331757515668869, + "logps/chosen": -4.369056701660156, + "logps/rejected": -5.273859977722168, + "loss": 0.5428, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.369056701660156, + "rewards/margins": 0.9048035740852356, + "rewards/rejected": -5.273859977722168, + "sft_loss": 4.352242946624756, + "step": 3315 + }, + { + "epoch": 1.776885766850644, + "grad_norm": 14.15545369611903, + "learning_rate": 4.262929716467556e-07, + "logits/chosen": -0.13385829329490662, + "logits/rejected": 0.025810521095991135, + "logps/chosen": -4.274466514587402, + "logps/rejected": -5.444076061248779, + "loss": 0.5061, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.274466514587402, + "rewards/margins": 1.1696093082427979, + "rewards/rejected": -5.444076061248779, + "sft_loss": 4.29061222076416, + "step": 3320 + }, + { + "epoch": 1.7795617996320456, + "grad_norm": 17.259932516111828, + "learning_rate": 4.247529438552321e-07, + "logits/chosen": -0.14594252407550812, + "logits/rejected": -0.017102601006627083, + "logps/chosen": -4.262748718261719, + "logps/rejected": -5.186878204345703, + "loss": 0.559, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.262748718261719, + "rewards/margins": 0.9241297841072083, + "rewards/rejected": -5.186878204345703, + "sft_loss": 4.339690685272217, + "step": 3325 + }, + { + "epoch": 1.782237832413447, + "grad_norm": 19.17161374595759, + "learning_rate": 4.232136461109773e-07, + "logits/chosen": -0.12570686638355255, + "logits/rejected": -0.044585347175598145, + "logps/chosen": -4.1848015785217285, + "logps/rejected": -5.268584251403809, + "loss": 0.5124, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.1848015785217285, + "rewards/margins": 1.0837829113006592, + "rewards/rejected": -5.268584251403809, + "sft_loss": 4.249958515167236, + "step": 3330 + }, + { + "epoch": 1.7849138651948486, + "grad_norm": 23.22537086801625, + "learning_rate": 4.216750933482646e-07, + "logits/chosen": -0.155071422457695, + "logits/rejected": -0.020992886275053024, + "logps/chosen": -4.429194450378418, + "logps/rejected": -5.286048889160156, + "loss": 0.5677, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.429194450378418, + "rewards/margins": 0.8568543195724487, + "rewards/rejected": -5.286048889160156, + "sft_loss": 4.3752641677856445, + "step": 3335 + }, + { + "epoch": 1.7875898979762503, + "grad_norm": 39.54595794572819, + "learning_rate": 4.2013730049413986e-07, + "logits/chosen": -0.14056840538978577, + "logits/rejected": -0.041243575513362885, + "logps/chosen": -4.156649112701416, + "logps/rejected": -5.233036994934082, + "loss": 0.5217, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.156649112701416, + "rewards/margins": 1.0763883590698242, + "rewards/rejected": -5.233036994934082, + "sft_loss": 4.208165645599365, + "step": 3340 + }, + { + "epoch": 1.7902659307576518, + "grad_norm": 15.607691512733764, + "learning_rate": 4.1860028246827594e-07, + "logits/chosen": -0.1249227300286293, + "logits/rejected": 0.013692038133740425, + "logps/chosen": -4.091392993927002, + "logps/rejected": -5.043036460876465, + "loss": 0.5274, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.091392993927002, + "rewards/margins": 0.9516437649726868, + "rewards/rejected": -5.043036460876465, + "sft_loss": 4.152836799621582, + "step": 3345 + }, + { + "epoch": 1.7929419635390533, + "grad_norm": 17.824995315805673, + "learning_rate": 4.170640541828285e-07, + "logits/chosen": -0.2082880288362503, + "logits/rejected": -0.11089960485696793, + "logps/chosen": -4.208667755126953, + "logps/rejected": -5.084951877593994, + "loss": 0.5505, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.208667755126953, + "rewards/margins": 0.8762838244438171, + "rewards/rejected": -5.084951877593994, + "sft_loss": 4.241647243499756, + "step": 3350 + }, + { + "epoch": 1.795617996320455, + "grad_norm": 22.978605746214726, + "learning_rate": 4.1552863054229116e-07, + "logits/chosen": -0.06792887300252914, + "logits/rejected": -0.021539511159062386, + "logps/chosen": -4.2966156005859375, + "logps/rejected": -5.186400413513184, + "loss": 0.5906, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.2966156005859375, + "rewards/margins": 0.8897849917411804, + "rewards/rejected": -5.186400413513184, + "sft_loss": 4.250180244445801, + "step": 3355 + }, + { + "epoch": 1.7982940291018565, + "grad_norm": 17.4446702090662, + "learning_rate": 4.139940264433508e-07, + "logits/chosen": -0.17651358246803284, + "logits/rejected": -0.05707705765962601, + "logps/chosen": -3.9511427879333496, + "logps/rejected": -5.0004706382751465, + "loss": 0.5096, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.9511427879333496, + "rewards/margins": 1.0493286848068237, + "rewards/rejected": -5.0004706382751465, + "sft_loss": 3.935319185256958, + "step": 3360 + }, + { + "epoch": 1.800970061883258, + "grad_norm": 15.88468673012, + "learning_rate": 4.1246025677474303e-07, + "logits/chosen": -0.17244072258472443, + "logits/rejected": -0.03228010609745979, + "logps/chosen": -4.157092094421387, + "logps/rejected": -4.983794212341309, + "loss": 0.5546, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.157092094421387, + "rewards/margins": 0.8267024159431458, + "rewards/rejected": -4.983794212341309, + "sft_loss": 4.224639415740967, + "step": 3365 + }, + { + "epoch": 1.8036460946646597, + "grad_norm": 19.192023331560634, + "learning_rate": 4.10927336417108e-07, + "logits/chosen": -0.16004905104637146, + "logits/rejected": -0.058913685381412506, + "logps/chosen": -4.1700592041015625, + "logps/rejected": -4.868165493011475, + "loss": 0.6268, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.1700592041015625, + "rewards/margins": 0.6981061697006226, + "rewards/rejected": -4.868165493011475, + "sft_loss": 4.16310977935791, + "step": 3370 + }, + { + "epoch": 1.8063221274460612, + "grad_norm": 20.894293928782755, + "learning_rate": 4.093952802428457e-07, + "logits/chosen": -0.06739739328622818, + "logits/rejected": -0.001216635457240045, + "logps/chosen": -4.290366172790527, + "logps/rejected": -5.070502281188965, + "loss": 0.6411, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.290366172790527, + "rewards/margins": 0.7801357507705688, + "rewards/rejected": -5.070502281188965, + "sft_loss": 4.278425693511963, + "step": 3375 + }, + { + "epoch": 1.8089981602274627, + "grad_norm": 15.495752358141823, + "learning_rate": 4.0786410311597184e-07, + "logits/chosen": -0.200698584318161, + "logits/rejected": -0.05970926955342293, + "logps/chosen": -4.080262660980225, + "logps/rejected": -4.968288421630859, + "loss": 0.555, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.080262660980225, + "rewards/margins": 0.888026237487793, + "rewards/rejected": -4.968288421630859, + "sft_loss": 3.9992785453796387, + "step": 3380 + }, + { + "epoch": 1.8116741930088645, + "grad_norm": 14.9598019996281, + "learning_rate": 4.063338198919737e-07, + "logits/chosen": -0.16302593052387238, + "logits/rejected": -0.14525838196277618, + "logps/chosen": -4.063436985015869, + "logps/rejected": -4.868227005004883, + "loss": 0.5737, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.063436985015869, + "rewards/margins": 0.8047906160354614, + "rewards/rejected": -4.868227005004883, + "sft_loss": 4.049282550811768, + "step": 3385 + }, + { + "epoch": 1.814350225790266, + "grad_norm": 24.824050343951967, + "learning_rate": 4.0480444541766575e-07, + "logits/chosen": -0.1567583680152893, + "logits/rejected": -0.05262039229273796, + "logps/chosen": -4.254889488220215, + "logps/rejected": -5.024016380310059, + "loss": 0.6236, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.254889488220215, + "rewards/margins": 0.7691268920898438, + "rewards/rejected": -5.024016380310059, + "sft_loss": 4.200462818145752, + "step": 3390 + }, + { + "epoch": 1.8170262585716674, + "grad_norm": 14.986770011000203, + "learning_rate": 4.0327599453104606e-07, + "logits/chosen": -0.20013757050037384, + "logits/rejected": -0.08344296365976334, + "logps/chosen": -4.004967212677002, + "logps/rejected": -5.059557914733887, + "loss": 0.49, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.004967212677002, + "rewards/margins": 1.0545909404754639, + "rewards/rejected": -5.059557914733887, + "sft_loss": 4.046624183654785, + "step": 3395 + }, + { + "epoch": 1.8197022913530692, + "grad_norm": 17.686613090286425, + "learning_rate": 4.017484820611514e-07, + "logits/chosen": -0.15689019858837128, + "logits/rejected": -0.055449776351451874, + "logps/chosen": -4.107612133026123, + "logps/rejected": -5.058051109313965, + "loss": 0.5275, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.107612133026123, + "rewards/margins": 0.9504392743110657, + "rewards/rejected": -5.058051109313965, + "sft_loss": 4.0827226638793945, + "step": 3400 + }, + { + "epoch": 1.8223783241344707, + "grad_norm": 20.619625811533965, + "learning_rate": 4.002219228279148e-07, + "logits/chosen": -0.16872528195381165, + "logits/rejected": -0.0544901080429554, + "logps/chosen": -4.091769695281982, + "logps/rejected": -4.962288856506348, + "loss": 0.5235, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.091769695281982, + "rewards/margins": 0.8705183863639832, + "rewards/rejected": -4.962288856506348, + "sft_loss": 4.082983493804932, + "step": 3405 + }, + { + "epoch": 1.8250543569158721, + "grad_norm": 19.70934234192068, + "learning_rate": 3.9869633164202045e-07, + "logits/chosen": -0.13666436076164246, + "logits/rejected": 0.00520123029127717, + "logps/chosen": -4.1989336013793945, + "logps/rejected": -5.079361438751221, + "loss": 0.5343, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.1989336013793945, + "rewards/margins": 0.880427360534668, + "rewards/rejected": -5.079361438751221, + "sft_loss": 4.141883850097656, + "step": 3410 + }, + { + "epoch": 1.8277303896972739, + "grad_norm": 24.19864006279397, + "learning_rate": 3.9717172330476077e-07, + "logits/chosen": -0.1475699245929718, + "logits/rejected": -0.05400124937295914, + "logps/chosen": -4.168356895446777, + "logps/rejected": -5.142523765563965, + "loss": 0.553, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.168356895446777, + "rewards/margins": 0.9741671681404114, + "rewards/rejected": -5.142523765563965, + "sft_loss": 4.242750644683838, + "step": 3415 + }, + { + "epoch": 1.8304064224786754, + "grad_norm": 17.59114134719486, + "learning_rate": 3.956481126078927e-07, + "logits/chosen": -0.05430952459573746, + "logits/rejected": 0.03733807057142258, + "logps/chosen": -4.2969841957092285, + "logps/rejected": -5.275949478149414, + "loss": 0.5866, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.2969841957092285, + "rewards/margins": 0.9789649844169617, + "rewards/rejected": -5.275949478149414, + "sft_loss": 4.335600852966309, + "step": 3420 + }, + { + "epoch": 1.8330824552600768, + "grad_norm": 17.66037474695137, + "learning_rate": 3.941255143334937e-07, + "logits/chosen": -0.15076126158237457, + "logits/rejected": -0.10337958484888077, + "logps/chosen": -4.24198055267334, + "logps/rejected": -5.192962169647217, + "loss": 0.5445, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.24198055267334, + "rewards/margins": 0.9509812593460083, + "rewards/rejected": -5.192962169647217, + "sft_loss": 4.194026470184326, + "step": 3425 + }, + { + "epoch": 1.8357584880414786, + "grad_norm": 17.990633898393206, + "learning_rate": 3.9260394325381895e-07, + "logits/chosen": -0.12183426320552826, + "logits/rejected": -0.02019437588751316, + "logps/chosen": -4.105195045471191, + "logps/rejected": -5.314507961273193, + "loss": 0.4936, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.105195045471191, + "rewards/margins": 1.209313154220581, + "rewards/rejected": -5.314507961273193, + "sft_loss": 4.080663681030273, + "step": 3430 + }, + { + "epoch": 1.83843452082288, + "grad_norm": 20.832243914758877, + "learning_rate": 3.9108341413115784e-07, + "logits/chosen": -0.12777313590049744, + "logits/rejected": -0.029532218351960182, + "logps/chosen": -4.139020919799805, + "logps/rejected": -5.124658107757568, + "loss": 0.4992, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.139020919799805, + "rewards/margins": 0.9856374859809875, + "rewards/rejected": -5.124658107757568, + "sft_loss": 4.1206889152526855, + "step": 3435 + }, + { + "epoch": 1.8411105536042816, + "grad_norm": 19.749623232301467, + "learning_rate": 3.895639417176905e-07, + "logits/chosen": -0.15492835640907288, + "logits/rejected": -0.07339377701282501, + "logps/chosen": -4.259757041931152, + "logps/rejected": -5.20042085647583, + "loss": 0.5948, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.259757041931152, + "rewards/margins": 0.9406633377075195, + "rewards/rejected": -5.20042085647583, + "sft_loss": 4.314055442810059, + "step": 3440 + }, + { + "epoch": 1.8437865863856833, + "grad_norm": 21.03359954772715, + "learning_rate": 3.8804554075534497e-07, + "logits/chosen": -0.1447829157114029, + "logits/rejected": 0.0038142502307891846, + "logps/chosen": -4.160075664520264, + "logps/rejected": -5.173608779907227, + "loss": 0.533, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.160075664520264, + "rewards/margins": 1.013533353805542, + "rewards/rejected": -5.173608779907227, + "sft_loss": 4.226978302001953, + "step": 3445 + }, + { + "epoch": 1.8464626191670848, + "grad_norm": 16.25438738433027, + "learning_rate": 3.8652822597565403e-07, + "logits/chosen": -0.20611099898815155, + "logits/rejected": -0.06895715743303299, + "logps/chosen": -4.167816162109375, + "logps/rejected": -5.257494926452637, + "loss": 0.5047, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.167816162109375, + "rewards/margins": 1.089678406715393, + "rewards/rejected": -5.257494926452637, + "sft_loss": 4.187711715698242, + "step": 3450 + }, + { + "epoch": 1.8491386519484863, + "grad_norm": 21.90203147752327, + "learning_rate": 3.850120120996123e-07, + "logits/chosen": -0.12661200761795044, + "logits/rejected": 0.0346415713429451, + "logps/chosen": -4.3217668533325195, + "logps/rejected": -5.258645057678223, + "loss": 0.5709, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.3217668533325195, + "rewards/margins": 0.9368780255317688, + "rewards/rejected": -5.258645057678223, + "sft_loss": 4.307793140411377, + "step": 3455 + }, + { + "epoch": 1.851814684729888, + "grad_norm": 15.496954998388917, + "learning_rate": 3.8349691383753356e-07, + "logits/chosen": -0.04699116200208664, + "logits/rejected": 0.04914768785238266, + "logps/chosen": -4.097477912902832, + "logps/rejected": -5.124687194824219, + "loss": 0.526, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.097477912902832, + "rewards/margins": 1.0272096395492554, + "rewards/rejected": -5.124687194824219, + "sft_loss": 4.059412956237793, + "step": 3460 + }, + { + "epoch": 1.8544907175112895, + "grad_norm": 17.05974309769058, + "learning_rate": 3.819829458889078e-07, + "logits/chosen": -0.11163085699081421, + "logits/rejected": -0.026383381336927414, + "logps/chosen": -4.076812744140625, + "logps/rejected": -5.011106967926025, + "loss": 0.5498, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.076812744140625, + "rewards/margins": 0.9342945218086243, + "rewards/rejected": -5.011106967926025, + "sft_loss": 4.031030178070068, + "step": 3465 + }, + { + "epoch": 1.857166750292691, + "grad_norm": 17.020735837715733, + "learning_rate": 3.804701229422585e-07, + "logits/chosen": -0.16300682723522186, + "logits/rejected": -0.07162132114171982, + "logps/chosen": -4.137753009796143, + "logps/rejected": -5.187546730041504, + "loss": 0.5061, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.137753009796143, + "rewards/margins": 1.0497941970825195, + "rewards/rejected": -5.187546730041504, + "sft_loss": 4.171750545501709, + "step": 3470 + }, + { + "epoch": 1.8598427830740927, + "grad_norm": 20.478259568185432, + "learning_rate": 3.789584596750007e-07, + "logits/chosen": -0.12015841901302338, + "logits/rejected": -0.07270057499408722, + "logps/chosen": -4.1296234130859375, + "logps/rejected": -5.0713019371032715, + "loss": 0.5379, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.1296234130859375, + "rewards/margins": 0.9416790008544922, + "rewards/rejected": -5.0713019371032715, + "sft_loss": 4.139330863952637, + "step": 3475 + }, + { + "epoch": 1.8625188158554944, + "grad_norm": 19.231642583061976, + "learning_rate": 3.77447970753298e-07, + "logits/chosen": -0.0636029839515686, + "logits/rejected": -0.03613414242863655, + "logps/chosen": -4.1893086433410645, + "logps/rejected": -5.160582065582275, + "loss": 0.5445, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.1893086433410645, + "rewards/margins": 0.9712736010551453, + "rewards/rejected": -5.160582065582275, + "sft_loss": 4.223686695098877, + "step": 3480 + }, + { + "epoch": 1.8651948486368957, + "grad_norm": 20.455838066662558, + "learning_rate": 3.7593867083192057e-07, + "logits/chosen": -0.1425260305404663, + "logits/rejected": -0.016420168802142143, + "logps/chosen": -4.095873832702637, + "logps/rejected": -5.037788391113281, + "loss": 0.5451, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.095873832702637, + "rewards/margins": 0.9419152140617371, + "rewards/rejected": -5.037788391113281, + "sft_loss": 4.152099609375, + "step": 3485 + }, + { + "epoch": 1.8678708814182974, + "grad_norm": 20.59747876700388, + "learning_rate": 3.7443057455410276e-07, + "logits/chosen": -0.08216296136379242, + "logits/rejected": -0.01140972413122654, + "logps/chosen": -3.9835262298583984, + "logps/rejected": -5.026278018951416, + "loss": 0.4817, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.9835262298583984, + "rewards/margins": 1.0427509546279907, + "rewards/rejected": -5.026278018951416, + "sft_loss": 4.098433017730713, + "step": 3490 + }, + { + "epoch": 1.870546914199699, + "grad_norm": 15.707515872721908, + "learning_rate": 3.7292369655140145e-07, + "logits/chosen": -0.15804629027843475, + "logits/rejected": -0.008402202278375626, + "logps/chosen": -4.166210651397705, + "logps/rejected": -5.008130073547363, + "loss": 0.5235, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.166210651397705, + "rewards/margins": 0.8419189453125, + "rewards/rejected": -5.008130073547363, + "sft_loss": 4.215886116027832, + "step": 3495 + }, + { + "epoch": 1.8732229469811004, + "grad_norm": 17.958433949746656, + "learning_rate": 3.714180514435534e-07, + "logits/chosen": -0.08625032007694244, + "logits/rejected": 0.042555466294288635, + "logps/chosen": -4.064800262451172, + "logps/rejected": -5.0926899909973145, + "loss": 0.53, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.064800262451172, + "rewards/margins": 1.0278890132904053, + "rewards/rejected": -5.0926899909973145, + "sft_loss": 4.08921480178833, + "step": 3500 + }, + { + "epoch": 1.875898979762502, + "grad_norm": 23.990799056307704, + "learning_rate": 3.6991365383833426e-07, + "logits/chosen": -0.13713189959526062, + "logits/rejected": -0.024845337495207787, + "logps/chosen": -4.0670576095581055, + "logps/rejected": -5.089943885803223, + "loss": 0.5002, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.0670576095581055, + "rewards/margins": 1.0228854417800903, + "rewards/rejected": -5.089943885803223, + "sft_loss": 4.173640727996826, + "step": 3505 + }, + { + "epoch": 1.8785750125439038, + "grad_norm": 24.049200595342356, + "learning_rate": 3.684105183314162e-07, + "logits/chosen": -0.13475707173347473, + "logits/rejected": -0.054293811321258545, + "logps/chosen": -3.9485535621643066, + "logps/rejected": -4.891864776611328, + "loss": 0.5098, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.9485535621643066, + "rewards/margins": 0.9433116912841797, + "rewards/rejected": -4.891864776611328, + "sft_loss": 4.00014591217041, + "step": 3510 + }, + { + "epoch": 1.881251045325305, + "grad_norm": 27.11973407946868, + "learning_rate": 3.669086595062263e-07, + "logits/chosen": -0.13104240596294403, + "logits/rejected": 0.018518714234232903, + "logps/chosen": -4.24746561050415, + "logps/rejected": -5.167351245880127, + "loss": 0.5381, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.24746561050415, + "rewards/margins": 0.9198853373527527, + "rewards/rejected": -5.167351245880127, + "sft_loss": 4.26193380355835, + "step": 3515 + }, + { + "epoch": 1.8839270781067068, + "grad_norm": 19.658703047840522, + "learning_rate": 3.654080919338056e-07, + "logits/chosen": -0.15275278687477112, + "logits/rejected": -0.042724233120679855, + "logps/chosen": -4.083222389221191, + "logps/rejected": -5.0945048332214355, + "loss": 0.5277, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.083222389221191, + "rewards/margins": 1.01128351688385, + "rewards/rejected": -5.0945048332214355, + "sft_loss": 4.171942234039307, + "step": 3520 + }, + { + "epoch": 1.8866031108881085, + "grad_norm": 19.505670229478117, + "learning_rate": 3.639088301726673e-07, + "logits/chosen": -0.11327368021011353, + "logits/rejected": 0.04385864734649658, + "logps/chosen": -4.1887898445129395, + "logps/rejected": -5.175089359283447, + "loss": 0.538, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.1887898445129395, + "rewards/margins": 0.9862992167472839, + "rewards/rejected": -5.175089359283447, + "sft_loss": 4.245810031890869, + "step": 3525 + }, + { + "epoch": 1.88927914366951, + "grad_norm": 21.70040417144261, + "learning_rate": 3.624108887686556e-07, + "logits/chosen": -0.09647150337696075, + "logits/rejected": -0.030451273545622826, + "logps/chosen": -4.218400955200195, + "logps/rejected": -5.127978324890137, + "loss": 0.5215, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.218400955200195, + "rewards/margins": 0.9095777273178101, + "rewards/rejected": -5.127978324890137, + "sft_loss": 4.29921817779541, + "step": 3530 + }, + { + "epoch": 1.8919551764509115, + "grad_norm": 14.186563976709435, + "learning_rate": 3.6091428225480433e-07, + "logits/chosen": -0.15827597677707672, + "logits/rejected": -0.047300536185503006, + "logps/chosen": -4.203457355499268, + "logps/rejected": -5.193899631500244, + "loss": 0.5356, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.203457355499268, + "rewards/margins": 0.9904430508613586, + "rewards/rejected": -5.193899631500244, + "sft_loss": 4.272356986999512, + "step": 3535 + }, + { + "epoch": 1.8946312092323132, + "grad_norm": 23.872995455089058, + "learning_rate": 3.5941902515119674e-07, + "logits/chosen": -0.17173142731189728, + "logits/rejected": -0.015093426220119, + "logps/chosen": -4.212442874908447, + "logps/rejected": -5.019708633422852, + "loss": 0.5935, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.212442874908447, + "rewards/margins": 0.8072662353515625, + "rewards/rejected": -5.019708633422852, + "sft_loss": 4.270554065704346, + "step": 3540 + }, + { + "epoch": 1.8973072420137147, + "grad_norm": 21.46918673011061, + "learning_rate": 3.5792513196482373e-07, + "logits/chosen": -0.25270357728004456, + "logits/rejected": -0.05684171989560127, + "logps/chosen": -4.086533546447754, + "logps/rejected": -5.053985595703125, + "loss": 0.493, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.086533546447754, + "rewards/margins": 0.9674515724182129, + "rewards/rejected": -5.053985595703125, + "sft_loss": 4.029145240783691, + "step": 3545 + }, + { + "epoch": 1.8999832747951162, + "grad_norm": 22.333149628635958, + "learning_rate": 3.5643261718944346e-07, + "logits/chosen": -0.10256993770599365, + "logits/rejected": -0.02860853634774685, + "logps/chosen": -4.1633830070495605, + "logps/rejected": -4.950932502746582, + "loss": 0.598, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.1633830070495605, + "rewards/margins": 0.7875494360923767, + "rewards/rejected": -4.950932502746582, + "sft_loss": 4.042609691619873, + "step": 3550 + }, + { + "epoch": 1.902659307576518, + "grad_norm": 15.37692037295358, + "learning_rate": 3.5494149530544087e-07, + "logits/chosen": -0.185903400182724, + "logits/rejected": -0.11153104156255722, + "logps/chosen": -4.057262420654297, + "logps/rejected": -5.0266618728637695, + "loss": 0.5535, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.057262420654297, + "rewards/margins": 0.9693989753723145, + "rewards/rejected": -5.0266618728637695, + "sft_loss": 4.037737846374512, + "step": 3555 + }, + { + "epoch": 1.9053353403579194, + "grad_norm": 20.81202001686929, + "learning_rate": 3.534517807796871e-07, + "logits/chosen": -0.16729159653186798, + "logits/rejected": -0.08528953790664673, + "logps/chosen": -4.080422878265381, + "logps/rejected": -4.926032066345215, + "loss": 0.5463, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.080422878265381, + "rewards/margins": 0.8456095457077026, + "rewards/rejected": -4.926032066345215, + "sft_loss": 4.083813667297363, + "step": 3560 + }, + { + "epoch": 1.908011373139321, + "grad_norm": 15.231184553571023, + "learning_rate": 3.519634880653988e-07, + "logits/chosen": -0.1267562061548233, + "logits/rejected": -0.03941858932375908, + "logps/chosen": -4.1407952308654785, + "logps/rejected": -5.257883548736572, + "loss": 0.4893, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.1407952308654785, + "rewards/margins": 1.1170886754989624, + "rewards/rejected": -5.257883548736572, + "sft_loss": 4.170060634613037, + "step": 3565 + }, + { + "epoch": 1.9106874059207226, + "grad_norm": 16.097207489875522, + "learning_rate": 3.504766316019987e-07, + "logits/chosen": -0.1753147691488266, + "logits/rejected": -0.03046388551592827, + "logps/chosen": -3.917523145675659, + "logps/rejected": -4.898660182952881, + "loss": 0.501, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.917523145675659, + "rewards/margins": 0.9811370968818665, + "rewards/rejected": -4.898660182952881, + "sft_loss": 3.8685176372528076, + "step": 3570 + }, + { + "epoch": 1.913363438702124, + "grad_norm": 15.107710878871499, + "learning_rate": 3.489912258149745e-07, + "logits/chosen": -0.09951204061508179, + "logits/rejected": -0.008878534659743309, + "logps/chosen": -3.9246585369110107, + "logps/rejected": -4.985711097717285, + "loss": 0.5155, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.9246585369110107, + "rewards/margins": 1.0610524415969849, + "rewards/rejected": -4.985711097717285, + "sft_loss": 3.904801845550537, + "step": 3575 + }, + { + "epoch": 1.9160394714835256, + "grad_norm": 15.097741117008953, + "learning_rate": 3.475072851157397e-07, + "logits/chosen": -0.1479146033525467, + "logits/rejected": -0.09697943180799484, + "logps/chosen": -3.9680991172790527, + "logps/rejected": -4.9519877433776855, + "loss": 0.5071, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.9680991172790527, + "rewards/margins": 0.983887791633606, + "rewards/rejected": -4.9519877433776855, + "sft_loss": 4.009753704071045, + "step": 3580 + }, + { + "epoch": 1.9187155042649273, + "grad_norm": 16.027082366258657, + "learning_rate": 3.460248239014936e-07, + "logits/chosen": -0.09173516184091568, + "logits/rejected": -0.032550834119319916, + "logps/chosen": -4.174387454986572, + "logps/rejected": -5.171200752258301, + "loss": 0.5149, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.174387454986572, + "rewards/margins": 0.9968129992485046, + "rewards/rejected": -5.171200752258301, + "sft_loss": 4.2629289627075195, + "step": 3585 + }, + { + "epoch": 1.9213915370463288, + "grad_norm": 19.16035789184787, + "learning_rate": 3.4454385655508134e-07, + "logits/chosen": -0.09119559824466705, + "logits/rejected": -0.04381603002548218, + "logps/chosen": -4.111305236816406, + "logps/rejected": -4.907462120056152, + "loss": 0.5905, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.111305236816406, + "rewards/margins": 0.79615718126297, + "rewards/rejected": -4.907462120056152, + "sft_loss": 4.138059139251709, + "step": 3590 + }, + { + "epoch": 1.9240675698277303, + "grad_norm": 13.659664145902656, + "learning_rate": 3.4306439744485447e-07, + "logits/chosen": -0.1878870278596878, + "logits/rejected": -0.03675522282719612, + "logps/chosen": -4.168497562408447, + "logps/rejected": -5.069736480712891, + "loss": 0.5518, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.168497562408447, + "rewards/margins": 0.901239275932312, + "rewards/rejected": -5.069736480712891, + "sft_loss": 4.146317958831787, + "step": 3595 + }, + { + "epoch": 1.926743602609132, + "grad_norm": 19.47720756254695, + "learning_rate": 3.415864609245322e-07, + "logits/chosen": -0.11716008186340332, + "logits/rejected": 0.011091604828834534, + "logps/chosen": -4.178882122039795, + "logps/rejected": -5.08506965637207, + "loss": 0.5856, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.178882122039795, + "rewards/margins": 0.9061868786811829, + "rewards/rejected": -5.08506965637207, + "sft_loss": 4.272181510925293, + "step": 3600 + }, + { + "epoch": 1.926743602609132, + "eval_logits/chosen": 0.08892910927534103, + "eval_logits/rejected": 0.16326628625392914, + "eval_logps/chosen": -4.089351177215576, + "eval_logps/rejected": -4.974943161010742, + "eval_loss": 0.5677979588508606, + "eval_rewards/accuracies": 0.7336795330047607, + "eval_rewards/chosen": -4.089351177215576, + "eval_rewards/margins": 0.885591983795166, + "eval_rewards/rejected": -4.974943161010742, + "eval_runtime": 43.2056, + "eval_samples_per_second": 31.13, + "eval_sft_loss": 4.112879276275635, + "eval_steps_per_second": 7.8, + "step": 3600 + }, + { + "epoch": 1.9294196353905335, + "grad_norm": 18.243432365338094, + "learning_rate": 3.401100613330605e-07, + "logits/chosen": -0.15508165955543518, + "logits/rejected": -0.1456497758626938, + "logps/chosen": -3.9924635887145996, + "logps/rejected": -4.866345405578613, + "loss": 0.543, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.9924635887145996, + "rewards/margins": 0.8738818168640137, + "rewards/rejected": -4.866345405578613, + "sft_loss": 4.0659685134887695, + "step": 3605 + }, + { + "epoch": 1.932095668171935, + "grad_norm": 15.494806094440678, + "learning_rate": 3.3863521299447514e-07, + "logits/chosen": -0.1938861459493637, + "logits/rejected": -0.08331102132797241, + "logps/chosen": -3.9935669898986816, + "logps/rejected": -4.946460723876953, + "loss": 0.4851, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.9935669898986816, + "rewards/margins": 0.9528937339782715, + "rewards/rejected": -4.946460723876953, + "sft_loss": 4.025681972503662, + "step": 3610 + }, + { + "epoch": 1.9347717009533367, + "grad_norm": 16.30131991335173, + "learning_rate": 3.371619302177609e-07, + "logits/chosen": -0.13050523400306702, + "logits/rejected": -0.03008384443819523, + "logps/chosen": -4.111358642578125, + "logps/rejected": -5.042483329772949, + "loss": 0.5328, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.111358642578125, + "rewards/margins": 0.9311251640319824, + "rewards/rejected": -5.042483329772949, + "sft_loss": 4.114981651306152, + "step": 3615 + }, + { + "epoch": 1.9374477337347382, + "grad_norm": 21.56330380356665, + "learning_rate": 3.3569022729671393e-07, + "logits/chosen": -0.1629718393087387, + "logits/rejected": -0.08070691674947739, + "logps/chosen": -4.215979099273682, + "logps/rejected": -5.047354221343994, + "loss": 0.567, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.215979099273682, + "rewards/margins": 0.8313754200935364, + "rewards/rejected": -5.047354221343994, + "sft_loss": 4.292203426361084, + "step": 3620 + }, + { + "epoch": 1.9401237665161397, + "grad_norm": 16.224892327360532, + "learning_rate": 3.342201185098024e-07, + "logits/chosen": -0.1000644713640213, + "logits/rejected": -0.08347281068563461, + "logps/chosen": -4.007589340209961, + "logps/rejected": -4.895975589752197, + "loss": 0.5239, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.007589340209961, + "rewards/margins": 0.8883861303329468, + "rewards/rejected": -4.895975589752197, + "sft_loss": 4.015894412994385, + "step": 3625 + }, + { + "epoch": 1.9427997992975414, + "grad_norm": 18.77742441655429, + "learning_rate": 3.3275161812002807e-07, + "logits/chosen": -0.15146948397159576, + "logits/rejected": -0.12039715051651001, + "logps/chosen": -4.100152492523193, + "logps/rejected": -5.13730001449585, + "loss": 0.5434, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.100152492523193, + "rewards/margins": 1.0371477603912354, + "rewards/rejected": -5.13730001449585, + "sft_loss": 4.252596378326416, + "step": 3630 + }, + { + "epoch": 1.945475832078943, + "grad_norm": 16.75135604546139, + "learning_rate": 3.312847403747883e-07, + "logits/chosen": -0.21641497313976288, + "logits/rejected": -0.11199178546667099, + "logps/chosen": -4.030362129211426, + "logps/rejected": -5.064505577087402, + "loss": 0.5093, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.030362129211426, + "rewards/margins": 1.0341436862945557, + "rewards/rejected": -5.064505577087402, + "sft_loss": 4.088961124420166, + "step": 3635 + }, + { + "epoch": 1.9481518648603444, + "grad_norm": 16.977008961327975, + "learning_rate": 3.2981949950573733e-07, + "logits/chosen": -0.19566890597343445, + "logits/rejected": -0.12340422719717026, + "logps/chosen": -4.214592933654785, + "logps/rejected": -5.046109676361084, + "loss": 0.5407, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.214592933654785, + "rewards/margins": 0.831516444683075, + "rewards/rejected": -5.046109676361084, + "sft_loss": 4.267856597900391, + "step": 3640 + }, + { + "epoch": 1.9508278976417461, + "grad_norm": 15.751632591599389, + "learning_rate": 3.283559097286486e-07, + "logits/chosen": -0.17926757037639618, + "logits/rejected": -0.0764254480600357, + "logps/chosen": -4.195579528808594, + "logps/rejected": -4.90426778793335, + "loss": 0.578, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.195579528808594, + "rewards/margins": 0.7086880803108215, + "rewards/rejected": -4.90426778793335, + "sft_loss": 4.237638473510742, + "step": 3645 + }, + { + "epoch": 1.9535039304231478, + "grad_norm": 17.69253712852231, + "learning_rate": 3.268939852432765e-07, + "logits/chosen": -0.23232333362102509, + "logits/rejected": -0.16036547720432281, + "logps/chosen": -4.313710689544678, + "logps/rejected": -5.077432632446289, + "loss": 0.5754, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.313710689544678, + "rewards/margins": 0.7637217044830322, + "rewards/rejected": -5.077432632446289, + "sft_loss": 4.4273176193237305, + "step": 3650 + }, + { + "epoch": 1.9561799632045491, + "grad_norm": 17.61474823443831, + "learning_rate": 3.254337402332187e-07, + "logits/chosen": -0.12839333713054657, + "logits/rejected": -0.04960453137755394, + "logps/chosen": -4.242055892944336, + "logps/rejected": -5.083164691925049, + "loss": 0.5837, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.242055892944336, + "rewards/margins": 0.8411084413528442, + "rewards/rejected": -5.083164691925049, + "sft_loss": 4.243819236755371, + "step": 3655 + }, + { + "epoch": 1.9588559959859508, + "grad_norm": 18.573663769325606, + "learning_rate": 3.239751888657788e-07, + "logits/chosen": -0.16934773325920105, + "logits/rejected": -0.07936234027147293, + "logps/chosen": -4.232779026031494, + "logps/rejected": -5.081835746765137, + "loss": 0.5695, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.232779026031494, + "rewards/margins": 0.8490568399429321, + "rewards/rejected": -5.081835746765137, + "sft_loss": 4.286954879760742, + "step": 3660 + }, + { + "epoch": 1.9615320287673526, + "grad_norm": 17.840037475857958, + "learning_rate": 3.2251834529182856e-07, + "logits/chosen": -0.16358472406864166, + "logits/rejected": -0.09386870265007019, + "logps/chosen": -4.062561511993408, + "logps/rejected": -5.115314960479736, + "loss": 0.5287, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.062561511993408, + "rewards/margins": 1.052753210067749, + "rewards/rejected": -5.115314960479736, + "sft_loss": 4.042852878570557, + "step": 3665 + }, + { + "epoch": 1.9642080615487538, + "grad_norm": 16.441670874267846, + "learning_rate": 3.2106322364567075e-07, + "logits/chosen": -0.17569497227668762, + "logits/rejected": -0.08433757722377777, + "logps/chosen": -4.120484352111816, + "logps/rejected": -5.193161964416504, + "loss": 0.4853, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.120484352111816, + "rewards/margins": 1.0726778507232666, + "rewards/rejected": -5.193161964416504, + "sft_loss": 4.253143787384033, + "step": 3670 + }, + { + "epoch": 1.9668840943301555, + "grad_norm": 17.437470155409795, + "learning_rate": 3.1960983804490183e-07, + "logits/chosen": -0.2014768421649933, + "logits/rejected": -0.08412306755781174, + "logps/chosen": -4.331996440887451, + "logps/rejected": -5.299182415008545, + "loss": 0.5817, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.331996440887451, + "rewards/margins": 0.9671859741210938, + "rewards/rejected": -5.299182415008545, + "sft_loss": 4.386224269866943, + "step": 3675 + }, + { + "epoch": 1.9695601271115573, + "grad_norm": 16.337209237194713, + "learning_rate": 3.1815820259027537e-07, + "logits/chosen": -0.16797594726085663, + "logits/rejected": -0.0618429072201252, + "logps/chosen": -3.9733901023864746, + "logps/rejected": -4.945980072021484, + "loss": 0.493, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.9733901023864746, + "rewards/margins": 0.9725903272628784, + "rewards/rejected": -4.945980072021484, + "sft_loss": 4.024728298187256, + "step": 3680 + }, + { + "epoch": 1.9722361598929585, + "grad_norm": 22.587274153214803, + "learning_rate": 3.16708331365565e-07, + "logits/chosen": -0.19083619117736816, + "logits/rejected": -0.11363337188959122, + "logps/chosen": -4.246734142303467, + "logps/rejected": -5.188324928283691, + "loss": 0.5522, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.246734142303467, + "rewards/margins": 0.9415907859802246, + "rewards/rejected": -5.188324928283691, + "sft_loss": 4.359339237213135, + "step": 3685 + }, + { + "epoch": 1.9749121926743602, + "grad_norm": 17.87721609193545, + "learning_rate": 3.152602384374275e-07, + "logits/chosen": -0.15688160061836243, + "logits/rejected": -0.02689887024462223, + "logps/chosen": -4.333113193511963, + "logps/rejected": -5.23205041885376, + "loss": 0.5568, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.333113193511963, + "rewards/margins": 0.8989373445510864, + "rewards/rejected": -5.23205041885376, + "sft_loss": 4.344038963317871, + "step": 3690 + }, + { + "epoch": 1.977588225455762, + "grad_norm": 15.53445740689773, + "learning_rate": 3.1381393785526697e-07, + "logits/chosen": -0.15794029831886292, + "logits/rejected": -0.0844997763633728, + "logps/chosen": -4.22260856628418, + "logps/rejected": -5.181676387786865, + "loss": 0.519, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.22260856628418, + "rewards/margins": 0.9590682983398438, + "rewards/rejected": -5.181676387786865, + "sft_loss": 4.298532962799072, + "step": 3695 + }, + { + "epoch": 1.9802642582371635, + "grad_norm": 15.416147657161021, + "learning_rate": 3.123694436510979e-07, + "logits/chosen": -0.14069563150405884, + "logits/rejected": -0.03486606106162071, + "logps/chosen": -4.119704246520996, + "logps/rejected": -5.0963358879089355, + "loss": 0.5201, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.119704246520996, + "rewards/margins": 0.9766316413879395, + "rewards/rejected": -5.0963358879089355, + "sft_loss": 4.224505424499512, + "step": 3700 + }, + { + "epoch": 1.982940291018565, + "grad_norm": 18.161096420990678, + "learning_rate": 3.1092676983940946e-07, + "logits/chosen": -0.13422636687755585, + "logits/rejected": -0.07517583668231964, + "logps/chosen": -4.114509105682373, + "logps/rejected": -5.2362751960754395, + "loss": 0.4915, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.114509105682373, + "rewards/margins": 1.1217658519744873, + "rewards/rejected": -5.2362751960754395, + "sft_loss": 4.160386085510254, + "step": 3705 + }, + { + "epoch": 1.9856163237999667, + "grad_norm": 18.852324743480146, + "learning_rate": 3.094859304170293e-07, + "logits/chosen": -0.05760626867413521, + "logits/rejected": -0.020621730014681816, + "logps/chosen": -4.1999030113220215, + "logps/rejected": -5.0476579666137695, + "loss": 0.5822, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.1999030113220215, + "rewards/margins": 0.847754955291748, + "rewards/rejected": -5.0476579666137695, + "sft_loss": 4.281563758850098, + "step": 3710 + }, + { + "epoch": 1.9882923565813682, + "grad_norm": 17.976192486737958, + "learning_rate": 3.0804693936298795e-07, + "logits/chosen": -0.1256621778011322, + "logits/rejected": -0.07518371194601059, + "logps/chosen": -4.190843105316162, + "logps/rejected": -5.2432451248168945, + "loss": 0.5056, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.190843105316162, + "rewards/margins": 1.0524019002914429, + "rewards/rejected": -5.2432451248168945, + "sft_loss": 4.277374267578125, + "step": 3715 + }, + { + "epoch": 1.9909683893627697, + "grad_norm": 17.99746447214315, + "learning_rate": 3.066098106383826e-07, + "logits/chosen": -0.1489638090133667, + "logits/rejected": -0.07571511715650558, + "logps/chosen": -4.129316329956055, + "logps/rejected": -5.018464088439941, + "loss": 0.5469, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.129316329956055, + "rewards/margins": 0.8891475796699524, + "rewards/rejected": -5.018464088439941, + "sft_loss": 4.103287696838379, + "step": 3720 + }, + { + "epoch": 1.9936444221441714, + "grad_norm": 16.142726001792713, + "learning_rate": 3.0517455818624263e-07, + "logits/chosen": -0.18727155029773712, + "logits/rejected": -0.0963721051812172, + "logps/chosen": -4.147437572479248, + "logps/rejected": -5.175294399261475, + "loss": 0.4879, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.147437572479248, + "rewards/margins": 1.0278565883636475, + "rewards/rejected": -5.175294399261475, + "sft_loss": 4.290009021759033, + "step": 3725 + }, + { + "epoch": 1.9963204549255729, + "grad_norm": 17.54951804309544, + "learning_rate": 3.037411959313936e-07, + "logits/chosen": -0.1390126645565033, + "logits/rejected": -0.03979702293872833, + "logps/chosen": -4.171058654785156, + "logps/rejected": -5.114262104034424, + "loss": 0.5063, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.171058654785156, + "rewards/margins": 0.9432040452957153, + "rewards/rejected": -5.114262104034424, + "sft_loss": 4.278780937194824, + "step": 3730 + }, + { + "epoch": 1.9989964877069744, + "grad_norm": 24.23113549285882, + "learning_rate": 3.023097377803224e-07, + "logits/chosen": -0.1034446507692337, + "logits/rejected": -0.03407427668571472, + "logps/chosen": -4.303669452667236, + "logps/rejected": -5.211419105529785, + "loss": 0.5779, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.303669452667236, + "rewards/margins": 0.9077495336532593, + "rewards/rejected": -5.211419105529785, + "sft_loss": 4.328175067901611, + "step": 3735 + }, + { + "epoch": 2.001672520488376, + "grad_norm": 19.12008536365398, + "learning_rate": 3.008801976210423e-07, + "logits/chosen": -0.0796082392334938, + "logits/rejected": -0.0246458537876606, + "logps/chosen": -4.290400505065918, + "logps/rejected": -5.158785820007324, + "loss": 0.5272, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.290400505065918, + "rewards/margins": 0.8683861494064331, + "rewards/rejected": -5.158785820007324, + "sft_loss": 4.287832736968994, + "step": 3740 + }, + { + "epoch": 2.0043485532697773, + "grad_norm": 16.47165153331192, + "learning_rate": 2.994525893229581e-07, + "logits/chosen": -0.12000022828578949, + "logits/rejected": -0.058578480035066605, + "logps/chosen": -4.189848899841309, + "logps/rejected": -5.388810157775879, + "loss": 0.4366, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.189848899841309, + "rewards/margins": 1.198961615562439, + "rewards/rejected": -5.388810157775879, + "sft_loss": 4.208093643188477, + "step": 3745 + }, + { + "epoch": 2.007024586051179, + "grad_norm": 15.261664502257158, + "learning_rate": 2.98026926736732e-07, + "logits/chosen": -0.18162012100219727, + "logits/rejected": -0.09842869639396667, + "logps/chosen": -4.083524703979492, + "logps/rejected": -5.226982593536377, + "loss": 0.4649, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.083524703979492, + "rewards/margins": 1.1434574127197266, + "rewards/rejected": -5.226982593536377, + "sft_loss": 4.203307151794434, + "step": 3750 + }, + { + "epoch": 2.0097006188325808, + "grad_norm": 13.915547135421344, + "learning_rate": 2.9660322369414846e-07, + "logits/chosen": -0.1771707683801651, + "logits/rejected": -0.054735828191041946, + "logps/chosen": -4.246321201324463, + "logps/rejected": -5.439396858215332, + "loss": 0.4377, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.246321201324463, + "rewards/margins": 1.1930756568908691, + "rewards/rejected": -5.439396858215332, + "sft_loss": 4.391330242156982, + "step": 3755 + }, + { + "epoch": 2.0123766516139825, + "grad_norm": 13.860572104307709, + "learning_rate": 2.9518149400798063e-07, + "logits/chosen": -0.1839606910943985, + "logits/rejected": -0.12588202953338623, + "logps/chosen": -4.3138556480407715, + "logps/rejected": -5.5978102684021, + "loss": 0.4409, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.3138556480407715, + "rewards/margins": 1.2839547395706177, + "rewards/rejected": -5.5978102684021, + "sft_loss": 4.4018049240112305, + "step": 3760 + }, + { + "epoch": 2.0150526843953838, + "grad_norm": 20.756274256793866, + "learning_rate": 2.9376175147185633e-07, + "logits/chosen": -0.14228351414203644, + "logits/rejected": 0.011768890544772148, + "logps/chosen": -4.53744649887085, + "logps/rejected": -5.743522644042969, + "loss": 0.4872, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.53744649887085, + "rewards/margins": 1.20607590675354, + "rewards/rejected": -5.743522644042969, + "sft_loss": 4.558747291564941, + "step": 3765 + }, + { + "epoch": 2.0177287171767855, + "grad_norm": 23.859097029142497, + "learning_rate": 2.9234400986012376e-07, + "logits/chosen": -0.22040753066539764, + "logits/rejected": -0.08732731640338898, + "logps/chosen": -4.480866432189941, + "logps/rejected": -5.846318244934082, + "loss": 0.4618, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.480866432189941, + "rewards/margins": 1.365451455116272, + "rewards/rejected": -5.846318244934082, + "sft_loss": 4.562276840209961, + "step": 3770 + }, + { + "epoch": 2.020404749958187, + "grad_norm": 23.37780742989256, + "learning_rate": 2.9092828292771817e-07, + "logits/chosen": -0.10928299278020859, + "logits/rejected": -0.07061926275491714, + "logps/chosen": -4.536766052246094, + "logps/rejected": -5.758551597595215, + "loss": 0.4649, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.536766052246094, + "rewards/margins": 1.2217856645584106, + "rewards/rejected": -5.758551597595215, + "sft_loss": 4.541500091552734, + "step": 3775 + }, + { + "epoch": 2.0230807827395885, + "grad_norm": 16.277017836639978, + "learning_rate": 2.8951458441002875e-07, + "logits/chosen": -0.15758976340293884, + "logits/rejected": -0.11704935133457184, + "logps/chosen": -4.346386909484863, + "logps/rejected": -5.59005880355835, + "loss": 0.4565, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.346386909484863, + "rewards/margins": 1.2436727285385132, + "rewards/rejected": -5.59005880355835, + "sft_loss": 4.381421089172363, + "step": 3780 + }, + { + "epoch": 2.02575681552099, + "grad_norm": 15.324472681567837, + "learning_rate": 2.881029280227643e-07, + "logits/chosen": -0.18570008873939514, + "logits/rejected": -0.06011120602488518, + "logps/chosen": -4.408538341522217, + "logps/rejected": -5.611115455627441, + "loss": 0.476, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.408538341522217, + "rewards/margins": 1.2025763988494873, + "rewards/rejected": -5.611115455627441, + "sft_loss": 4.448096752166748, + "step": 3785 + }, + { + "epoch": 2.028432848302392, + "grad_norm": 13.550043619241059, + "learning_rate": 2.8669332746182177e-07, + "logits/chosen": -0.20547811686992645, + "logits/rejected": -0.07174699753522873, + "logps/chosen": -4.346865177154541, + "logps/rejected": -5.615252494812012, + "loss": 0.4546, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.346865177154541, + "rewards/margins": 1.268387794494629, + "rewards/rejected": -5.615252494812012, + "sft_loss": 4.4165849685668945, + "step": 3790 + }, + { + "epoch": 2.031108881083793, + "grad_norm": 18.022751885568077, + "learning_rate": 2.8528579640315156e-07, + "logits/chosen": -0.11128351837396622, + "logits/rejected": -0.06616206467151642, + "logps/chosen": -4.275948524475098, + "logps/rejected": -5.372910976409912, + "loss": 0.4918, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.275948524475098, + "rewards/margins": 1.096962571144104, + "rewards/rejected": -5.372910976409912, + "sft_loss": 4.3458147048950195, + "step": 3795 + }, + { + "epoch": 2.033784913865195, + "grad_norm": 22.99000433508882, + "learning_rate": 2.8388034850262646e-07, + "logits/chosen": -0.1808985471725464, + "logits/rejected": -0.0658467561006546, + "logps/chosen": -4.3522162437438965, + "logps/rejected": -5.611857891082764, + "loss": 0.4523, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.3522162437438965, + "rewards/margins": 1.2596412897109985, + "rewards/rejected": -5.611857891082764, + "sft_loss": 4.430613040924072, + "step": 3800 + }, + { + "epoch": 2.0364609466465966, + "grad_norm": 24.666474774126513, + "learning_rate": 2.824769973959079e-07, + "logits/chosen": -0.13231855630874634, + "logits/rejected": -0.009453452192246914, + "logps/chosen": -4.427382469177246, + "logps/rejected": -5.60190486907959, + "loss": 0.4588, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.427382469177246, + "rewards/margins": 1.1745221614837646, + "rewards/rejected": -5.60190486907959, + "sft_loss": 4.450659275054932, + "step": 3805 + }, + { + "epoch": 2.039136979427998, + "grad_norm": 16.447166238503364, + "learning_rate": 2.81075756698315e-07, + "logits/chosen": -0.09605690091848373, + "logits/rejected": 0.001893743872642517, + "logps/chosen": -4.376564025878906, + "logps/rejected": -5.6669602394104, + "loss": 0.4185, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.376564025878906, + "rewards/margins": 1.290395975112915, + "rewards/rejected": -5.6669602394104, + "sft_loss": 4.277453422546387, + "step": 3810 + }, + { + "epoch": 2.0418130122093996, + "grad_norm": 17.66655526695026, + "learning_rate": 2.7967664000469035e-07, + "logits/chosen": -0.1871604323387146, + "logits/rejected": -0.09622781723737717, + "logps/chosen": -4.417603492736816, + "logps/rejected": -5.642507076263428, + "loss": 0.4296, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.417603492736816, + "rewards/margins": 1.2249035835266113, + "rewards/rejected": -5.642507076263428, + "sft_loss": 4.344175815582275, + "step": 3815 + }, + { + "epoch": 2.0444890449908013, + "grad_norm": 17.318780900757023, + "learning_rate": 2.7827966088927095e-07, + "logits/chosen": -0.22635912895202637, + "logits/rejected": -0.02497316151857376, + "logps/chosen": -4.55142879486084, + "logps/rejected": -5.799140930175781, + "loss": 0.4524, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.55142879486084, + "rewards/margins": 1.2477116584777832, + "rewards/rejected": -5.799140930175781, + "sft_loss": 4.594216823577881, + "step": 3820 + }, + { + "epoch": 2.0471650777722026, + "grad_norm": 17.86834053588433, + "learning_rate": 2.768848329055538e-07, + "logits/chosen": -0.14959703385829926, + "logits/rejected": -0.09499450027942657, + "logps/chosen": -4.484490394592285, + "logps/rejected": -5.76265811920166, + "loss": 0.4261, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.484490394592285, + "rewards/margins": 1.2781678438186646, + "rewards/rejected": -5.76265811920166, + "sft_loss": 4.595266342163086, + "step": 3825 + }, + { + "epoch": 2.0498411105536043, + "grad_norm": 22.682792372551617, + "learning_rate": 2.7549216958616657e-07, + "logits/chosen": -0.2396484911441803, + "logits/rejected": -0.10936751216650009, + "logps/chosen": -4.647583961486816, + "logps/rejected": -6.019547462463379, + "loss": 0.4402, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.647583961486816, + "rewards/margins": 1.3719635009765625, + "rewards/rejected": -6.019547462463379, + "sft_loss": 4.67317008972168, + "step": 3830 + }, + { + "epoch": 2.052517143335006, + "grad_norm": 14.747593491123746, + "learning_rate": 2.741016844427344e-07, + "logits/chosen": -0.17778167128562927, + "logits/rejected": -0.03496559336781502, + "logps/chosen": -4.572454452514648, + "logps/rejected": -5.8602800369262695, + "loss": 0.4438, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.572454452514648, + "rewards/margins": 1.287825107574463, + "rewards/rejected": -5.8602800369262695, + "sft_loss": 4.662432670593262, + "step": 3835 + }, + { + "epoch": 2.0551931761164073, + "grad_norm": 17.514615706709293, + "learning_rate": 2.7271339096575073e-07, + "logits/chosen": -0.12904831767082214, + "logits/rejected": -0.016960179433226585, + "logps/chosen": -4.486534118652344, + "logps/rejected": -5.739012241363525, + "loss": 0.4474, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.486534118652344, + "rewards/margins": 1.2524782419204712, + "rewards/rejected": -5.739012241363525, + "sft_loss": 4.5322442054748535, + "step": 3840 + }, + { + "epoch": 2.057869208897809, + "grad_norm": 14.513457288617543, + "learning_rate": 2.713273026244446e-07, + "logits/chosen": -0.20153093338012695, + "logits/rejected": -0.033189572393894196, + "logps/chosen": -4.680139064788818, + "logps/rejected": -6.050618648529053, + "loss": 0.4093, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.680139064788818, + "rewards/margins": 1.3704793453216553, + "rewards/rejected": -6.050618648529053, + "sft_loss": 4.678463935852051, + "step": 3845 + }, + { + "epoch": 2.0605452416792107, + "grad_norm": 18.202688604141244, + "learning_rate": 2.6994343286665156e-07, + "logits/chosen": -0.16980481147766113, + "logits/rejected": -0.024973779916763306, + "logps/chosen": -4.6503071784973145, + "logps/rejected": -5.726879119873047, + "loss": 0.4963, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.6503071784973145, + "rewards/margins": 1.0765719413757324, + "rewards/rejected": -5.726879119873047, + "sft_loss": 4.77303409576416, + "step": 3850 + }, + { + "epoch": 2.063221274460612, + "grad_norm": 22.41727403559302, + "learning_rate": 2.6856179511868156e-07, + "logits/chosen": -0.12804751098155975, + "logits/rejected": 0.015957217663526535, + "logps/chosen": -4.61462926864624, + "logps/rejected": -6.049999237060547, + "loss": 0.4642, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.61462926864624, + "rewards/margins": 1.4353702068328857, + "rewards/rejected": -6.049999237060547, + "sft_loss": 4.627757549285889, + "step": 3855 + }, + { + "epoch": 2.0658973072420137, + "grad_norm": 23.925564315420097, + "learning_rate": 2.6718240278519056e-07, + "logits/chosen": -0.1258188784122467, + "logits/rejected": -0.03270752355456352, + "logps/chosen": -4.608656883239746, + "logps/rejected": -5.895892143249512, + "loss": 0.456, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.608656883239746, + "rewards/margins": 1.287234902381897, + "rewards/rejected": -5.895892143249512, + "sft_loss": 4.617652893066406, + "step": 3860 + }, + { + "epoch": 2.0685733400234154, + "grad_norm": 18.932471018151297, + "learning_rate": 2.6580526924904866e-07, + "logits/chosen": -0.23533248901367188, + "logits/rejected": -0.11765459924936295, + "logps/chosen": -4.57756233215332, + "logps/rejected": -5.859441757202148, + "loss": 0.4244, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.57756233215332, + "rewards/margins": 1.2818796634674072, + "rewards/rejected": -5.859441757202148, + "sft_loss": 4.577887058258057, + "step": 3865 + }, + { + "epoch": 2.0712493728048167, + "grad_norm": 21.18272309195914, + "learning_rate": 2.6443040787121186e-07, + "logits/chosen": -0.2512609362602234, + "logits/rejected": -0.16672050952911377, + "logps/chosen": -4.517947673797607, + "logps/rejected": -5.76108980178833, + "loss": 0.463, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.517947673797607, + "rewards/margins": 1.2431416511535645, + "rewards/rejected": -5.76108980178833, + "sft_loss": 4.614432334899902, + "step": 3870 + }, + { + "epoch": 2.0739254055862184, + "grad_norm": 19.19306771230631, + "learning_rate": 2.6305783199059084e-07, + "logits/chosen": -0.16765734553337097, + "logits/rejected": -0.06805787980556488, + "logps/chosen": -4.5621490478515625, + "logps/rejected": -5.844240188598633, + "loss": 0.4722, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.5621490478515625, + "rewards/margins": 1.2820910215377808, + "rewards/rejected": -5.844240188598633, + "sft_loss": 4.612701416015625, + "step": 3875 + }, + { + "epoch": 2.07660143836762, + "grad_norm": 20.707099640973883, + "learning_rate": 2.6168755492392324e-07, + "logits/chosen": -0.14436683058738708, + "logits/rejected": -0.009703554213047028, + "logps/chosen": -4.375124931335449, + "logps/rejected": -5.790663719177246, + "loss": 0.408, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.375124931335449, + "rewards/margins": 1.4155378341674805, + "rewards/rejected": -5.790663719177246, + "sft_loss": 4.3841071128845215, + "step": 3880 + }, + { + "epoch": 2.0792774711490214, + "grad_norm": 21.6609372072673, + "learning_rate": 2.6031958996564274e-07, + "logits/chosen": -0.13693545758724213, + "logits/rejected": -0.043102920055389404, + "logps/chosen": -4.388991832733154, + "logps/rejected": -5.805684566497803, + "loss": 0.4339, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.388991832733154, + "rewards/margins": 1.416691541671753, + "rewards/rejected": -5.805684566497803, + "sft_loss": 4.460447788238525, + "step": 3885 + }, + { + "epoch": 2.081953503930423, + "grad_norm": 25.05993756394716, + "learning_rate": 2.589539503877518e-07, + "logits/chosen": -0.11307881772518158, + "logits/rejected": -0.025426015257835388, + "logps/chosen": -4.619956970214844, + "logps/rejected": -5.880894184112549, + "loss": 0.4679, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.619956970214844, + "rewards/margins": 1.2609364986419678, + "rewards/rejected": -5.880894184112549, + "sft_loss": 4.613383769989014, + "step": 3890 + }, + { + "epoch": 2.084629536711825, + "grad_norm": 15.459973618296203, + "learning_rate": 2.5759064943969125e-07, + "logits/chosen": -0.19261129200458527, + "logits/rejected": -0.010503212921321392, + "logps/chosen": -4.513919353485107, + "logps/rejected": -5.790491580963135, + "loss": 0.4555, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.513919353485107, + "rewards/margins": 1.27657151222229, + "rewards/rejected": -5.790491580963135, + "sft_loss": 4.4933366775512695, + "step": 3895 + }, + { + "epoch": 2.087305569493226, + "grad_norm": 21.559862109780585, + "learning_rate": 2.562297003482131e-07, + "logits/chosen": -0.10432785749435425, + "logits/rejected": -0.0616467110812664, + "logps/chosen": -4.550601005554199, + "logps/rejected": -5.83347225189209, + "loss": 0.451, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.550601005554199, + "rewards/margins": 1.282871127128601, + "rewards/rejected": -5.83347225189209, + "sft_loss": 4.568717002868652, + "step": 3900 + }, + { + "epoch": 2.089981602274628, + "grad_norm": 16.97024088090951, + "learning_rate": 2.548711163172512e-07, + "logits/chosen": -0.11373122781515121, + "logits/rejected": -0.02093740925192833, + "logps/chosen": -4.710618495941162, + "logps/rejected": -5.993484020233154, + "loss": 0.4828, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.710618495941162, + "rewards/margins": 1.2828651666641235, + "rewards/rejected": -5.993484020233154, + "sft_loss": 4.71906042098999, + "step": 3905 + }, + { + "epoch": 2.0926576350560295, + "grad_norm": 18.231620168854963, + "learning_rate": 2.53514910527794e-07, + "logits/chosen": -0.12412917613983154, + "logits/rejected": -0.02643829584121704, + "logps/chosen": -4.418890476226807, + "logps/rejected": -5.6583147048950195, + "loss": 0.4517, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.418890476226807, + "rewards/margins": 1.2394243478775024, + "rewards/rejected": -5.6583147048950195, + "sft_loss": 4.488974571228027, + "step": 3910 + }, + { + "epoch": 2.095333667837431, + "grad_norm": 22.63151693266803, + "learning_rate": 2.5216109613775573e-07, + "logits/chosen": -0.1703699827194214, + "logits/rejected": -0.04086681082844734, + "logps/chosen": -4.718385219573975, + "logps/rejected": -5.87748384475708, + "loss": 0.4958, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.718385219573975, + "rewards/margins": 1.1590980291366577, + "rewards/rejected": -5.87748384475708, + "sft_loss": 4.733608722686768, + "step": 3915 + }, + { + "epoch": 2.0980097006188325, + "grad_norm": 19.158970635544666, + "learning_rate": 2.5080968628184993e-07, + "logits/chosen": -0.19361944496631622, + "logits/rejected": -0.0514645092189312, + "logps/chosen": -4.5244221687316895, + "logps/rejected": -6.010369300842285, + "loss": 0.4163, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.5244221687316895, + "rewards/margins": 1.4859468936920166, + "rewards/rejected": -6.010369300842285, + "sft_loss": 4.477473258972168, + "step": 3920 + }, + { + "epoch": 2.1006857334002342, + "grad_norm": 14.404258310123362, + "learning_rate": 2.494606940714605e-07, + "logits/chosen": -0.13890644907951355, + "logits/rejected": -0.06422574818134308, + "logps/chosen": -4.478633880615234, + "logps/rejected": -5.896307945251465, + "loss": 0.4169, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.478633880615234, + "rewards/margins": 1.4176738262176514, + "rewards/rejected": -5.896307945251465, + "sft_loss": 4.554999351501465, + "step": 3925 + }, + { + "epoch": 2.103361766181636, + "grad_norm": 14.82955366511182, + "learning_rate": 2.4811413259451625e-07, + "logits/chosen": -0.19433699548244476, + "logits/rejected": -0.041183482855558395, + "logps/chosen": -4.494211673736572, + "logps/rejected": -5.957503795623779, + "loss": 0.4309, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.494211673736572, + "rewards/margins": 1.4632922410964966, + "rewards/rejected": -5.957503795623779, + "sft_loss": 4.525946617126465, + "step": 3930 + }, + { + "epoch": 2.106037798963037, + "grad_norm": 19.36649542789264, + "learning_rate": 2.46770014915362e-07, + "logits/chosen": -0.1541624218225479, + "logits/rejected": -0.05928944796323776, + "logps/chosen": -4.555628776550293, + "logps/rejected": -5.938043594360352, + "loss": 0.4473, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.555628776550293, + "rewards/margins": 1.38241446018219, + "rewards/rejected": -5.938043594360352, + "sft_loss": 4.585000991821289, + "step": 3935 + }, + { + "epoch": 2.108713831744439, + "grad_norm": 22.316750205317803, + "learning_rate": 2.45428354074634e-07, + "logits/chosen": -0.16120049357414246, + "logits/rejected": -0.09427209198474884, + "logps/chosen": -4.504244804382324, + "logps/rejected": -5.9012041091918945, + "loss": 0.4642, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.504244804382324, + "rewards/margins": 1.3969593048095703, + "rewards/rejected": -5.9012041091918945, + "sft_loss": 4.468905925750732, + "step": 3940 + }, + { + "epoch": 2.1113898645258407, + "grad_norm": 22.710270989091576, + "learning_rate": 2.4408916308913105e-07, + "logits/chosen": -0.2358735054731369, + "logits/rejected": -0.10350503772497177, + "logps/chosen": -4.722151279449463, + "logps/rejected": -5.764187812805176, + "loss": 0.5286, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.722151279449463, + "rewards/margins": 1.0420368909835815, + "rewards/rejected": -5.764187812805176, + "sft_loss": 4.828040599822998, + "step": 3945 + }, + { + "epoch": 2.114065897307242, + "grad_norm": 31.008446115751816, + "learning_rate": 2.4275245495169025e-07, + "logits/chosen": -0.127239391207695, + "logits/rejected": 0.017133042216300964, + "logps/chosen": -4.513893127441406, + "logps/rejected": -5.921486854553223, + "loss": 0.4369, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.513893127441406, + "rewards/margins": 1.407593846321106, + "rewards/rejected": -5.921486854553223, + "sft_loss": 4.540274620056152, + "step": 3950 + }, + { + "epoch": 2.1167419300886436, + "grad_norm": 22.636623943914152, + "learning_rate": 2.414182426310597e-07, + "logits/chosen": -0.18237945437431335, + "logits/rejected": -0.11283756792545319, + "logps/chosen": -4.60231876373291, + "logps/rejected": -6.050238609313965, + "loss": 0.4553, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.60231876373291, + "rewards/margins": 1.4479202032089233, + "rewards/rejected": -6.050238609313965, + "sft_loss": 4.625402927398682, + "step": 3955 + }, + { + "epoch": 2.1194179628700454, + "grad_norm": 12.845987005897252, + "learning_rate": 2.400865390717734e-07, + "logits/chosen": -0.13675661385059357, + "logits/rejected": -0.032407719641923904, + "logps/chosen": -4.471343040466309, + "logps/rejected": -6.141912460327148, + "loss": 0.3834, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.471343040466309, + "rewards/margins": 1.670569658279419, + "rewards/rejected": -6.141912460327148, + "sft_loss": 4.563107967376709, + "step": 3960 + }, + { + "epoch": 2.1220939956514466, + "grad_norm": 22.196012568551822, + "learning_rate": 2.3875735719402475e-07, + "logits/chosen": -0.15787668526172638, + "logits/rejected": -0.027476048097014427, + "logps/chosen": -4.659394264221191, + "logps/rejected": -6.09100341796875, + "loss": 0.4359, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.659394264221191, + "rewards/margins": 1.43160879611969, + "rewards/rejected": -6.09100341796875, + "sft_loss": 4.784261226654053, + "step": 3965 + }, + { + "epoch": 2.1247700284328483, + "grad_norm": 16.456475986320505, + "learning_rate": 2.3743070989354258e-07, + "logits/chosen": -0.1013205274939537, + "logits/rejected": -0.004166866652667522, + "logps/chosen": -4.576447010040283, + "logps/rejected": -6.026632308959961, + "loss": 0.4768, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.576447010040283, + "rewards/margins": 1.4501855373382568, + "rewards/rejected": -6.026632308959961, + "sft_loss": 4.765728950500488, + "step": 3970 + }, + { + "epoch": 2.12744606121425, + "grad_norm": 20.356120149703255, + "learning_rate": 2.3610661004146454e-07, + "logits/chosen": -0.09850762784481049, + "logits/rejected": 0.010118888691067696, + "logps/chosen": -4.386921405792236, + "logps/rejected": -5.722302436828613, + "loss": 0.4143, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.386921405792236, + "rewards/margins": 1.3353809118270874, + "rewards/rejected": -5.722302436828613, + "sft_loss": 4.426734447479248, + "step": 3975 + }, + { + "epoch": 2.1301220939956513, + "grad_norm": 16.710172332706804, + "learning_rate": 2.3478507048421314e-07, + "logits/chosen": -0.1513948142528534, + "logits/rejected": -0.08292113244533539, + "logps/chosen": -4.454089164733887, + "logps/rejected": -5.841625213623047, + "loss": 0.4568, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.454089164733887, + "rewards/margins": 1.3875354528427124, + "rewards/rejected": -5.841625213623047, + "sft_loss": 4.680628776550293, + "step": 3980 + }, + { + "epoch": 2.132798126777053, + "grad_norm": 25.683784114058085, + "learning_rate": 2.334661040433713e-07, + "logits/chosen": -0.1821201890707016, + "logits/rejected": -0.07850952446460724, + "logps/chosen": -4.567347526550293, + "logps/rejected": -5.961523056030273, + "loss": 0.4302, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.567347526550293, + "rewards/margins": 1.3941757678985596, + "rewards/rejected": -5.961523056030273, + "sft_loss": 4.69355583190918, + "step": 3985 + }, + { + "epoch": 2.1354741595584548, + "grad_norm": 17.18321446845876, + "learning_rate": 2.321497235155568e-07, + "logits/chosen": -0.21645669639110565, + "logits/rejected": -0.10848214477300644, + "logps/chosen": -4.355871677398682, + "logps/rejected": -5.764237403869629, + "loss": 0.409, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.355871677398682, + "rewards/margins": 1.4083654880523682, + "rewards/rejected": -5.764237403869629, + "sft_loss": 4.4289021492004395, + "step": 3990 + }, + { + "epoch": 2.138150192339856, + "grad_norm": 24.6685682814048, + "learning_rate": 2.3083594167229965e-07, + "logits/chosen": -0.21960720419883728, + "logits/rejected": -0.020451117306947708, + "logps/chosen": -4.579361915588379, + "logps/rejected": -5.956488132476807, + "loss": 0.4663, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.579361915588379, + "rewards/margins": 1.3771260976791382, + "rewards/rejected": -5.956488132476807, + "sft_loss": 4.609208106994629, + "step": 3995 + }, + { + "epoch": 2.1408262251212578, + "grad_norm": 24.724242452029113, + "learning_rate": 2.295247712599167e-07, + "logits/chosen": -0.16371068358421326, + "logits/rejected": -0.09416019916534424, + "logps/chosen": -4.51413631439209, + "logps/rejected": -5.884792327880859, + "loss": 0.4692, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.51413631439209, + "rewards/margins": 1.3706560134887695, + "rewards/rejected": -5.884792327880859, + "sft_loss": 4.492763996124268, + "step": 4000 + }, + { + "epoch": 2.1408262251212578, + "eval_logits/chosen": 0.07501842081546783, + "eval_logits/rejected": 0.15692532062530518, + "eval_logps/chosen": -4.701979637145996, + "eval_logps/rejected": -5.741509437561035, + "eval_loss": 0.5829208493232727, + "eval_rewards/accuracies": 0.7299703359603882, + "eval_rewards/chosen": -4.701979637145996, + "eval_rewards/margins": 1.0395296812057495, + "eval_rewards/rejected": -5.741509437561035, + "eval_runtime": 43.1772, + "eval_samples_per_second": 31.151, + "eval_sft_loss": 4.6998491287231445, + "eval_steps_per_second": 7.805, + "step": 4000 + }, + { + "epoch": 2.1435022579026595, + "grad_norm": 18.413899749315984, + "learning_rate": 2.2821622499938948e-07, + "logits/chosen": -0.11675839126110077, + "logits/rejected": 0.04759988561272621, + "logps/chosen": -4.67446231842041, + "logps/rejected": -5.886545658111572, + "loss": 0.4805, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.67446231842041, + "rewards/margins": 1.212082862854004, + "rewards/rejected": -5.886545658111572, + "sft_loss": 4.659073352813721, + "step": 4005 + }, + { + "epoch": 2.1461782906840607, + "grad_norm": 22.822288857524793, + "learning_rate": 2.269103155862391e-07, + "logits/chosen": -0.17632398009300232, + "logits/rejected": -0.07813229411840439, + "logps/chosen": -4.495655536651611, + "logps/rejected": -5.676532745361328, + "loss": 0.4882, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.495655536651611, + "rewards/margins": 1.1808770895004272, + "rewards/rejected": -5.676532745361328, + "sft_loss": 4.489293098449707, + "step": 4010 + }, + { + "epoch": 2.1488543234654625, + "grad_norm": 21.430619108993792, + "learning_rate": 2.2560705569040483e-07, + "logits/chosen": -0.18073612451553345, + "logits/rejected": 0.013807791285216808, + "logps/chosen": -4.501663684844971, + "logps/rejected": -5.728493690490723, + "loss": 0.4803, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.501663684844971, + "rewards/margins": 1.2268303632736206, + "rewards/rejected": -5.728493690490723, + "sft_loss": 4.570557594299316, + "step": 4015 + }, + { + "epoch": 2.151530356246864, + "grad_norm": 16.281583131218813, + "learning_rate": 2.2430645795611963e-07, + "logits/chosen": -0.2418348342180252, + "logits/rejected": -0.10622209310531616, + "logps/chosen": -4.473881721496582, + "logps/rejected": -5.790538311004639, + "loss": 0.4472, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.473881721496582, + "rewards/margins": 1.3166567087173462, + "rewards/rejected": -5.790538311004639, + "sft_loss": 4.559691429138184, + "step": 4020 + }, + { + "epoch": 2.1542063890282654, + "grad_norm": 25.919500533512633, + "learning_rate": 2.230085350017884e-07, + "logits/chosen": -0.12916305661201477, + "logits/rejected": -0.04332312196493149, + "logps/chosen": -4.4670305252075195, + "logps/rejected": -5.607809543609619, + "loss": 0.5011, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.4670305252075195, + "rewards/margins": 1.1407783031463623, + "rewards/rejected": -5.607809543609619, + "sft_loss": 4.542896270751953, + "step": 4025 + }, + { + "epoch": 2.156882421809667, + "grad_norm": 17.465509733604573, + "learning_rate": 2.2171329941986554e-07, + "logits/chosen": -0.20880010724067688, + "logits/rejected": -0.10321755707263947, + "logps/chosen": -4.294320106506348, + "logps/rejected": -5.752496719360352, + "loss": 0.3927, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.294320106506348, + "rewards/margins": 1.4581772089004517, + "rewards/rejected": -5.752496719360352, + "sft_loss": 4.305559158325195, + "step": 4030 + }, + { + "epoch": 2.159558454591069, + "grad_norm": 15.467504253265371, + "learning_rate": 2.2042076377673202e-07, + "logits/chosen": -0.1964549869298935, + "logits/rejected": -0.17378666996955872, + "logps/chosen": -4.268540859222412, + "logps/rejected": -5.400685787200928, + "loss": 0.4746, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.268540859222412, + "rewards/margins": 1.132144808769226, + "rewards/rejected": -5.400685787200928, + "sft_loss": 4.281256675720215, + "step": 4035 + }, + { + "epoch": 2.16223448737247, + "grad_norm": 19.621379459284213, + "learning_rate": 2.1913094061257476e-07, + "logits/chosen": -0.17080268263816833, + "logits/rejected": -0.16464689373970032, + "logps/chosen": -4.361598491668701, + "logps/rejected": -5.593031883239746, + "loss": 0.4473, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.361598491668701, + "rewards/margins": 1.231433391571045, + "rewards/rejected": -5.593031883239746, + "sft_loss": 4.3913750648498535, + "step": 4040 + }, + { + "epoch": 2.164910520153872, + "grad_norm": 23.171660283421172, + "learning_rate": 2.178438424412633e-07, + "logits/chosen": -0.1492532193660736, + "logits/rejected": -0.03700689598917961, + "logps/chosen": -4.333517551422119, + "logps/rejected": -5.488014221191406, + "loss": 0.4813, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.333517551422119, + "rewards/margins": 1.1544969081878662, + "rewards/rejected": -5.488014221191406, + "sft_loss": 4.4080305099487305, + "step": 4045 + }, + { + "epoch": 2.1675865529352736, + "grad_norm": 27.373463948574507, + "learning_rate": 2.165594817502302e-07, + "logits/chosen": -0.20800991356372833, + "logits/rejected": -0.10373286157846451, + "logps/chosen": -4.645865440368652, + "logps/rejected": -5.66098165512085, + "loss": 0.5345, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.645865440368652, + "rewards/margins": 1.0151169300079346, + "rewards/rejected": -5.66098165512085, + "sft_loss": 4.785841941833496, + "step": 4050 + }, + { + "epoch": 2.170262585716675, + "grad_norm": 20.341752564006676, + "learning_rate": 2.1527787100034806e-07, + "logits/chosen": -0.16706502437591553, + "logits/rejected": -0.09402771294116974, + "logps/chosen": -4.453884601593018, + "logps/rejected": -5.4885149002075195, + "loss": 0.49, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.453884601593018, + "rewards/margins": 1.0346308946609497, + "rewards/rejected": -5.4885149002075195, + "sft_loss": 4.488232612609863, + "step": 4055 + }, + { + "epoch": 2.1729386184980766, + "grad_norm": 17.91385130623742, + "learning_rate": 2.1399902262581037e-07, + "logits/chosen": -0.09191139042377472, + "logits/rejected": 0.025123313069343567, + "logps/chosen": -4.451213836669922, + "logps/rejected": -5.648017883300781, + "loss": 0.4838, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.451213836669922, + "rewards/margins": 1.196804165840149, + "rewards/rejected": -5.648017883300781, + "sft_loss": 4.565667629241943, + "step": 4060 + }, + { + "epoch": 2.1756146512794783, + "grad_norm": 20.33735857354431, + "learning_rate": 2.127229490340094e-07, + "logits/chosen": -0.23012490570545197, + "logits/rejected": -0.1606210172176361, + "logps/chosen": -4.371445655822754, + "logps/rejected": -5.787907123565674, + "loss": 0.4296, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.371445655822754, + "rewards/margins": 1.4164615869522095, + "rewards/rejected": -5.787907123565674, + "sft_loss": 4.472637176513672, + "step": 4065 + }, + { + "epoch": 2.1782906840608796, + "grad_norm": 22.32451002093342, + "learning_rate": 2.1144966260541698e-07, + "logits/chosen": -0.14594785869121552, + "logits/rejected": -0.002728702500462532, + "logps/chosen": -4.419862747192383, + "logps/rejected": -5.816904067993164, + "loss": 0.4756, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.419862747192383, + "rewards/margins": 1.3970410823822021, + "rewards/rejected": -5.816904067993164, + "sft_loss": 4.503393650054932, + "step": 4070 + }, + { + "epoch": 2.1809667168422813, + "grad_norm": 16.866151143936293, + "learning_rate": 2.1017917569346332e-07, + "logits/chosen": -0.20224475860595703, + "logits/rejected": -0.0694878101348877, + "logps/chosen": -4.323914051055908, + "logps/rejected": -5.589296340942383, + "loss": 0.4463, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.323914051055908, + "rewards/margins": 1.2653824090957642, + "rewards/rejected": -5.589296340942383, + "sft_loss": 4.306329727172852, + "step": 4075 + }, + { + "epoch": 2.183642749623683, + "grad_norm": 17.495137641658133, + "learning_rate": 2.0891150062441837e-07, + "logits/chosen": -0.20146453380584717, + "logits/rejected": -0.09485888481140137, + "logps/chosen": -4.480501174926758, + "logps/rejected": -5.844509601593018, + "loss": 0.4583, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.480501174926758, + "rewards/margins": 1.3640084266662598, + "rewards/rejected": -5.844509601593018, + "sft_loss": 4.449831008911133, + "step": 4080 + }, + { + "epoch": 2.1863187824050843, + "grad_norm": 19.05490248938216, + "learning_rate": 2.0764664969727086e-07, + "logits/chosen": -0.1656581461429596, + "logits/rejected": -0.09022749960422516, + "logps/chosen": -4.398980617523193, + "logps/rejected": -5.686980247497559, + "loss": 0.4286, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.398980617523193, + "rewards/margins": 1.2879998683929443, + "rewards/rejected": -5.686980247497559, + "sft_loss": 4.383788585662842, + "step": 4085 + }, + { + "epoch": 2.188994815186486, + "grad_norm": 17.55373016826154, + "learning_rate": 2.0638463518361033e-07, + "logits/chosen": -0.2608991265296936, + "logits/rejected": -0.11179272085428238, + "logps/chosen": -4.350119590759277, + "logps/rejected": -5.653628826141357, + "loss": 0.4321, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.350119590759277, + "rewards/margins": 1.303508996963501, + "rewards/rejected": -5.653628826141357, + "sft_loss": 4.3665666580200195, + "step": 4090 + }, + { + "epoch": 2.1916708479678877, + "grad_norm": 23.91389177081075, + "learning_rate": 2.0512546932750702e-07, + "logits/chosen": -0.2029423713684082, + "logits/rejected": -0.11459051072597504, + "logps/chosen": -4.565677642822266, + "logps/rejected": -5.735536575317383, + "loss": 0.4509, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.565677642822266, + "rewards/margins": 1.1698591709136963, + "rewards/rejected": -5.735536575317383, + "sft_loss": 4.558390140533447, + "step": 4095 + }, + { + "epoch": 2.194346880749289, + "grad_norm": 22.697274610095754, + "learning_rate": 2.0386916434539343e-07, + "logits/chosen": -0.14836429059505463, + "logits/rejected": -0.023218411952257156, + "logps/chosen": -4.4364728927612305, + "logps/rejected": -5.839684009552002, + "loss": 0.4072, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.4364728927612305, + "rewards/margins": 1.4032106399536133, + "rewards/rejected": -5.839684009552002, + "sft_loss": 4.519501686096191, + "step": 4100 + }, + { + "epoch": 2.1970229135306907, + "grad_norm": 22.896311624054537, + "learning_rate": 2.0261573242594627e-07, + "logits/chosen": -0.1370099037885666, + "logits/rejected": 0.014382824301719666, + "logps/chosen": -4.753531455993652, + "logps/rejected": -6.031103134155273, + "loss": 0.4672, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.753531455993652, + "rewards/margins": 1.2775717973709106, + "rewards/rejected": -6.031103134155273, + "sft_loss": 4.664409637451172, + "step": 4105 + }, + { + "epoch": 2.1996989463120924, + "grad_norm": 29.55597936412726, + "learning_rate": 2.0136518572996724e-07, + "logits/chosen": -0.16040173172950745, + "logits/rejected": 0.008886401541531086, + "logps/chosen": -4.453537940979004, + "logps/rejected": -5.899094104766846, + "loss": 0.4303, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.453537940979004, + "rewards/margins": 1.4455565214157104, + "rewards/rejected": -5.899094104766846, + "sft_loss": 4.534873962402344, + "step": 4110 + }, + { + "epoch": 2.202374979093494, + "grad_norm": 20.09663557576658, + "learning_rate": 2.0011753639026617e-07, + "logits/chosen": -0.12617313861846924, + "logits/rejected": -0.03719883784651756, + "logps/chosen": -4.5755133628845215, + "logps/rejected": -5.795089244842529, + "loss": 0.4644, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.5755133628845215, + "rewards/margins": 1.219576120376587, + "rewards/rejected": -5.795089244842529, + "sft_loss": 4.645591735839844, + "step": 4115 + }, + { + "epoch": 2.2050510118748954, + "grad_norm": 22.72420330168612, + "learning_rate": 1.988727965115421e-07, + "logits/chosen": -0.14607496559619904, + "logits/rejected": -0.07823510468006134, + "logps/chosen": -4.544275283813477, + "logps/rejected": -5.835035800933838, + "loss": 0.4423, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.544275283813477, + "rewards/margins": 1.2907607555389404, + "rewards/rejected": -5.835035800933838, + "sft_loss": 4.6323394775390625, + "step": 4120 + }, + { + "epoch": 2.207727044656297, + "grad_norm": 17.82196448474604, + "learning_rate": 1.9763097817026713e-07, + "logits/chosen": -0.19549353420734406, + "logits/rejected": -0.05719345808029175, + "logps/chosen": -4.4385786056518555, + "logps/rejected": -5.997511863708496, + "loss": 0.3972, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.4385786056518555, + "rewards/margins": 1.5589336156845093, + "rewards/rejected": -5.997511863708496, + "sft_loss": 4.5320329666137695, + "step": 4125 + }, + { + "epoch": 2.210403077437699, + "grad_norm": 16.45368871148943, + "learning_rate": 1.9639209341456796e-07, + "logits/chosen": -0.12866944074630737, + "logits/rejected": -0.03789632394909859, + "logps/chosen": -4.623317241668701, + "logps/rejected": -5.921300411224365, + "loss": 0.4717, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.623317241668701, + "rewards/margins": 1.297982931137085, + "rewards/rejected": -5.921300411224365, + "sft_loss": 4.7191081047058105, + "step": 4130 + }, + { + "epoch": 2.2130791102191, + "grad_norm": 17.071193680660553, + "learning_rate": 1.951561542641102e-07, + "logits/chosen": -0.09355298429727554, + "logits/rejected": -0.07170907407999039, + "logps/chosen": -4.602305889129639, + "logps/rejected": -5.94113826751709, + "loss": 0.487, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.602305889129639, + "rewards/margins": 1.3388313055038452, + "rewards/rejected": -5.94113826751709, + "sft_loss": 4.62455940246582, + "step": 4135 + }, + { + "epoch": 2.215755143000502, + "grad_norm": 20.772452374202366, + "learning_rate": 1.939231727099806e-07, + "logits/chosen": -0.2547750473022461, + "logits/rejected": -0.19936183094978333, + "logps/chosen": -4.52876615524292, + "logps/rejected": -5.710700988769531, + "loss": 0.4924, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.52876615524292, + "rewards/margins": 1.1819345951080322, + "rewards/rejected": -5.710700988769531, + "sft_loss": 4.539527416229248, + "step": 4140 + }, + { + "epoch": 2.2184311757819035, + "grad_norm": 20.46616081123614, + "learning_rate": 1.926931607145719e-07, + "logits/chosen": -0.10002921521663666, + "logits/rejected": 0.00514222402125597, + "logps/chosen": -4.70742130279541, + "logps/rejected": -5.912622928619385, + "loss": 0.4682, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.70742130279541, + "rewards/margins": 1.205202341079712, + "rewards/rejected": -5.912622928619385, + "sft_loss": 4.747741222381592, + "step": 4145 + }, + { + "epoch": 2.221107208563305, + "grad_norm": 16.643090117581625, + "learning_rate": 1.9146613021146564e-07, + "logits/chosen": -0.1663173884153366, + "logits/rejected": -0.08629349619150162, + "logps/chosen": -4.3680830001831055, + "logps/rejected": -5.625277042388916, + "loss": 0.4588, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.3680830001831055, + "rewards/margins": 1.2571938037872314, + "rewards/rejected": -5.625277042388916, + "sft_loss": 4.439862251281738, + "step": 4150 + }, + { + "epoch": 2.2237832413447065, + "grad_norm": 19.103701178808972, + "learning_rate": 1.9024209310531736e-07, + "logits/chosen": -0.13490690290927887, + "logits/rejected": -0.11030188947916031, + "logps/chosen": -4.417179107666016, + "logps/rejected": -5.694499969482422, + "loss": 0.4652, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.417179107666016, + "rewards/margins": 1.2773202657699585, + "rewards/rejected": -5.694499969482422, + "sft_loss": 4.395775318145752, + "step": 4155 + }, + { + "epoch": 2.2264592741261082, + "grad_norm": 18.744252464756403, + "learning_rate": 1.890210612717401e-07, + "logits/chosen": -0.1814108043909073, + "logits/rejected": -0.06273964792490005, + "logps/chosen": -4.500998497009277, + "logps/rejected": -5.8500285148620605, + "loss": 0.4362, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.500998497009277, + "rewards/margins": 1.3490300178527832, + "rewards/rejected": -5.8500285148620605, + "sft_loss": 4.531396389007568, + "step": 4160 + }, + { + "epoch": 2.2291353069075095, + "grad_norm": 23.09120396450256, + "learning_rate": 1.8780304655719054e-07, + "logits/chosen": -0.1780509650707245, + "logits/rejected": -0.0687517300248146, + "logps/chosen": -4.513307094573975, + "logps/rejected": -5.991667747497559, + "loss": 0.4279, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.513307094573975, + "rewards/margins": 1.4783604145050049, + "rewards/rejected": -5.991667747497559, + "sft_loss": 4.592909336090088, + "step": 4165 + }, + { + "epoch": 2.231811339688911, + "grad_norm": 29.263796253204035, + "learning_rate": 1.865880607788523e-07, + "logits/chosen": -0.04930751398205757, + "logits/rejected": 0.015498518943786621, + "logps/chosen": -4.494831562042236, + "logps/rejected": -5.8351731300354, + "loss": 0.4457, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.494831562042236, + "rewards/margins": 1.340341567993164, + "rewards/rejected": -5.8351731300354, + "sft_loss": 4.675837516784668, + "step": 4170 + }, + { + "epoch": 2.234487372470313, + "grad_norm": 26.915621545601567, + "learning_rate": 1.8537611572452316e-07, + "logits/chosen": -0.15022674202919006, + "logits/rejected": -0.0800619125366211, + "logps/chosen": -4.476630210876465, + "logps/rejected": -5.6053466796875, + "loss": 0.4733, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.476630210876465, + "rewards/margins": 1.1287164688110352, + "rewards/rejected": -5.6053466796875, + "sft_loss": 4.568302154541016, + "step": 4175 + }, + { + "epoch": 2.237163405251714, + "grad_norm": 19.529674888613087, + "learning_rate": 1.84167223152499e-07, + "logits/chosen": -0.18381746113300323, + "logits/rejected": -0.043142445385456085, + "logps/chosen": -4.506960391998291, + "logps/rejected": -5.8851518630981445, + "loss": 0.428, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.506960391998291, + "rewards/margins": 1.3781912326812744, + "rewards/rejected": -5.8851518630981445, + "sft_loss": 4.607504844665527, + "step": 4180 + }, + { + "epoch": 2.239839438033116, + "grad_norm": 25.227913751490263, + "learning_rate": 1.8296139479146112e-07, + "logits/chosen": -0.21912606060504913, + "logits/rejected": -0.15963666141033173, + "logps/chosen": -4.3517045974731445, + "logps/rejected": -5.7596116065979, + "loss": 0.4374, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.3517045974731445, + "rewards/margins": 1.407906413078308, + "rewards/rejected": -5.7596116065979, + "sft_loss": 4.369559288024902, + "step": 4185 + }, + { + "epoch": 2.2425154708145176, + "grad_norm": 19.094078156725242, + "learning_rate": 1.8175864234036132e-07, + "logits/chosen": -0.09747710078954697, + "logits/rejected": -0.04323968663811684, + "logps/chosen": -4.51051139831543, + "logps/rejected": -5.821291446685791, + "loss": 0.4778, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.51051139831543, + "rewards/margins": 1.3107802867889404, + "rewards/rejected": -5.821291446685791, + "sft_loss": 4.546077251434326, + "step": 4190 + }, + { + "epoch": 2.245191503595919, + "grad_norm": 17.462313735653243, + "learning_rate": 1.805589774683094e-07, + "logits/chosen": -0.2526007294654846, + "logits/rejected": -0.14061518013477325, + "logps/chosen": -4.462028980255127, + "logps/rejected": -5.620845317840576, + "loss": 0.4595, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.462028980255127, + "rewards/margins": 1.158816933631897, + "rewards/rejected": -5.620845317840576, + "sft_loss": 4.531667232513428, + "step": 4195 + }, + { + "epoch": 2.2478675363773206, + "grad_norm": 21.330268716607183, + "learning_rate": 1.79362411814459e-07, + "logits/chosen": -0.09234540909528732, + "logits/rejected": -0.11624608933925629, + "logps/chosen": -4.66225528717041, + "logps/rejected": -5.735785961151123, + "loss": 0.5142, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.66225528717041, + "rewards/margins": 1.0735304355621338, + "rewards/rejected": -5.735785961151123, + "sft_loss": 4.689001560211182, + "step": 4200 + }, + { + "epoch": 2.2505435691587223, + "grad_norm": 18.736173702460214, + "learning_rate": 1.7816895698789552e-07, + "logits/chosen": -0.230657696723938, + "logits/rejected": -0.13760152459144592, + "logps/chosen": -4.450052261352539, + "logps/rejected": -5.639276504516602, + "loss": 0.4645, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.450052261352539, + "rewards/margins": 1.1892242431640625, + "rewards/rejected": -5.639276504516602, + "sft_loss": 4.513615608215332, + "step": 4205 + }, + { + "epoch": 2.2532196019401236, + "grad_norm": 22.746472066636194, + "learning_rate": 1.7697862456752271e-07, + "logits/chosen": -0.2065122127532959, + "logits/rejected": -0.09461875259876251, + "logps/chosen": -4.567070960998535, + "logps/rejected": -6.106113433837891, + "loss": 0.4233, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.567070960998535, + "rewards/margins": 1.539042592048645, + "rewards/rejected": -6.106113433837891, + "sft_loss": 4.609532356262207, + "step": 4210 + }, + { + "epoch": 2.2558956347215253, + "grad_norm": 21.822458722543207, + "learning_rate": 1.7579142610195124e-07, + "logits/chosen": -0.17818565666675568, + "logits/rejected": -0.043703652918338776, + "logps/chosen": -4.625606536865234, + "logps/rejected": -5.955820560455322, + "loss": 0.463, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.625606536865234, + "rewards/margins": 1.3302134275436401, + "rewards/rejected": -5.955820560455322, + "sft_loss": 4.593493461608887, + "step": 4215 + }, + { + "epoch": 2.258571667502927, + "grad_norm": 25.2782048393693, + "learning_rate": 1.7460737310938568e-07, + "logits/chosen": -0.1916409730911255, + "logits/rejected": -0.017262551933526993, + "logps/chosen": -4.424893379211426, + "logps/rejected": -5.842418670654297, + "loss": 0.4256, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.424893379211426, + "rewards/margins": 1.417525291442871, + "rewards/rejected": -5.842418670654297, + "sft_loss": 4.535680770874023, + "step": 4220 + }, + { + "epoch": 2.2612477002843283, + "grad_norm": 19.26626818673393, + "learning_rate": 1.734264770775133e-07, + "logits/chosen": -0.20236441493034363, + "logits/rejected": -0.03545919433236122, + "logps/chosen": -4.511693477630615, + "logps/rejected": -5.835480213165283, + "loss": 0.4593, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.511693477630615, + "rewards/margins": 1.3237863779067993, + "rewards/rejected": -5.835480213165283, + "sft_loss": 4.541731834411621, + "step": 4225 + }, + { + "epoch": 2.26392373306573, + "grad_norm": 23.875300074782405, + "learning_rate": 1.7224874946339241e-07, + "logits/chosen": -0.1965745985507965, + "logits/rejected": -0.142036572098732, + "logps/chosen": -4.52227783203125, + "logps/rejected": -5.851922035217285, + "loss": 0.4728, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.52227783203125, + "rewards/margins": 1.3296445608139038, + "rewards/rejected": -5.851922035217285, + "sft_loss": 4.446340560913086, + "step": 4230 + }, + { + "epoch": 2.2665997658471317, + "grad_norm": 16.56329987288688, + "learning_rate": 1.7107420169334186e-07, + "logits/chosen": -0.1566283404827118, + "logits/rejected": -0.08194781839847565, + "logps/chosen": -4.629245758056641, + "logps/rejected": -5.892148971557617, + "loss": 0.4698, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.629245758056641, + "rewards/margins": 1.2629032135009766, + "rewards/rejected": -5.892148971557617, + "sft_loss": 4.6787190437316895, + "step": 4235 + }, + { + "epoch": 2.269275798628533, + "grad_norm": 18.07301865977798, + "learning_rate": 1.6990284516282893e-07, + "logits/chosen": -0.1772286742925644, + "logits/rejected": -0.08047084510326385, + "logps/chosen": -4.423053741455078, + "logps/rejected": -5.777717590332031, + "loss": 0.4354, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.423053741455078, + "rewards/margins": 1.3546632528305054, + "rewards/rejected": -5.777717590332031, + "sft_loss": 4.448493003845215, + "step": 4240 + }, + { + "epoch": 2.2719518314099347, + "grad_norm": 22.619828464116434, + "learning_rate": 1.687346912363602e-07, + "logits/chosen": -0.18942368030548096, + "logits/rejected": -0.08072742819786072, + "logps/chosen": -4.497479438781738, + "logps/rejected": -5.813804626464844, + "loss": 0.4447, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.497479438781738, + "rewards/margins": 1.3163255453109741, + "rewards/rejected": -5.813804626464844, + "sft_loss": 4.514906406402588, + "step": 4245 + }, + { + "epoch": 2.2746278641913364, + "grad_norm": 16.656153461239374, + "learning_rate": 1.675697512473697e-07, + "logits/chosen": -0.20577137172222137, + "logits/rejected": -0.05031289905309677, + "logps/chosen": -4.567205905914307, + "logps/rejected": -5.930844306945801, + "loss": 0.414, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.567205905914307, + "rewards/margins": 1.3636387586593628, + "rewards/rejected": -5.930844306945801, + "sft_loss": 4.56335973739624, + "step": 4250 + }, + { + "epoch": 2.2773038969727377, + "grad_norm": 22.169750281058562, + "learning_rate": 1.6640803649811087e-07, + "logits/chosen": -0.18401619791984558, + "logits/rejected": -0.0024409503675997257, + "logps/chosen": -4.638046741485596, + "logps/rejected": -6.121177673339844, + "loss": 0.4219, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.638046741485596, + "rewards/margins": 1.483130931854248, + "rewards/rejected": -6.121177673339844, + "sft_loss": 4.603224277496338, + "step": 4255 + }, + { + "epoch": 2.2799799297541394, + "grad_norm": 25.450642911331816, + "learning_rate": 1.6524955825954472e-07, + "logits/chosen": -0.13436515629291534, + "logits/rejected": -0.054591696709394455, + "logps/chosen": -4.657012939453125, + "logps/rejected": -5.924901485443115, + "loss": 0.4568, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.657012939453125, + "rewards/margins": 1.2678885459899902, + "rewards/rejected": -5.924901485443115, + "sft_loss": 4.595439434051514, + "step": 4260 + }, + { + "epoch": 2.282655962535541, + "grad_norm": 15.873932160962498, + "learning_rate": 1.6409432777123277e-07, + "logits/chosen": -0.20072786509990692, + "logits/rejected": -0.07589218020439148, + "logps/chosen": -4.668331623077393, + "logps/rejected": -6.191826820373535, + "loss": 0.4371, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.668331623077393, + "rewards/margins": 1.5234956741333008, + "rewards/rejected": -6.191826820373535, + "sft_loss": 4.7383832931518555, + "step": 4265 + }, + { + "epoch": 2.285331995316943, + "grad_norm": 19.993588406867985, + "learning_rate": 1.6294235624122577e-07, + "logits/chosen": -0.12873494625091553, + "logits/rejected": 0.03537944331765175, + "logps/chosen": -4.6926655769348145, + "logps/rejected": -6.041623115539551, + "loss": 0.4639, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.6926655769348145, + "rewards/margins": 1.3489577770233154, + "rewards/rejected": -6.041623115539551, + "sft_loss": 4.652834415435791, + "step": 4270 + }, + { + "epoch": 2.288008028098344, + "grad_norm": 19.275439911848228, + "learning_rate": 1.6179365484595697e-07, + "logits/chosen": -0.18791356682777405, + "logits/rejected": -0.08327902108430862, + "logps/chosen": -4.731060028076172, + "logps/rejected": -6.04015588760376, + "loss": 0.4714, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.731060028076172, + "rewards/margins": 1.3090959787368774, + "rewards/rejected": -6.04015588760376, + "sft_loss": 4.791726112365723, + "step": 4275 + }, + { + "epoch": 2.290684060879746, + "grad_norm": 21.308120586026025, + "learning_rate": 1.60648234730132e-07, + "logits/chosen": -0.1869659125804901, + "logits/rejected": -0.11178477108478546, + "logps/chosen": -4.531894683837891, + "logps/rejected": -5.969760417938232, + "loss": 0.4136, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.531894683837891, + "rewards/margins": 1.437865972518921, + "rewards/rejected": -5.969760417938232, + "sft_loss": 4.582188606262207, + "step": 4280 + }, + { + "epoch": 2.293360093661147, + "grad_norm": 21.74581959109812, + "learning_rate": 1.595061070066222e-07, + "logits/chosen": -0.1504012644290924, + "logits/rejected": -0.1195465475320816, + "logps/chosen": -4.655195236206055, + "logps/rejected": -6.071804046630859, + "loss": 0.418, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.655195236206055, + "rewards/margins": 1.416608452796936, + "rewards/rejected": -6.071804046630859, + "sft_loss": 4.6643500328063965, + "step": 4285 + }, + { + "epoch": 2.296036126442549, + "grad_norm": 32.26578170381865, + "learning_rate": 1.5836728275635542e-07, + "logits/chosen": -0.220480278134346, + "logits/rejected": -0.10281282663345337, + "logps/chosen": -4.7849626541137695, + "logps/rejected": -6.028841495513916, + "loss": 0.4864, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.7849626541137695, + "rewards/margins": 1.2438790798187256, + "rewards/rejected": -6.028841495513916, + "sft_loss": 4.771364212036133, + "step": 4290 + }, + { + "epoch": 2.2987121592239506, + "grad_norm": 19.27299726224707, + "learning_rate": 1.5723177302820984e-07, + "logits/chosen": -0.19079093635082245, + "logits/rejected": -0.11726488918066025, + "logps/chosen": -4.7149457931518555, + "logps/rejected": -5.869764804840088, + "loss": 0.4676, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.7149457931518555, + "rewards/margins": 1.1548184156417847, + "rewards/rejected": -5.869764804840088, + "sft_loss": 4.712212562561035, + "step": 4295 + }, + { + "epoch": 2.3013881920053523, + "grad_norm": 19.365753855559927, + "learning_rate": 1.5609958883890544e-07, + "logits/chosen": -0.14444835484027863, + "logits/rejected": -0.035151056945323944, + "logps/chosen": -4.563656806945801, + "logps/rejected": -5.817429065704346, + "loss": 0.438, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.563656806945801, + "rewards/margins": 1.2537721395492554, + "rewards/rejected": -5.817429065704346, + "sft_loss": 4.519850254058838, + "step": 4300 + }, + { + "epoch": 2.3040642247867535, + "grad_norm": 23.970306343855015, + "learning_rate": 1.5497074117289865e-07, + "logits/chosen": -0.24623043835163116, + "logits/rejected": -0.14050698280334473, + "logps/chosen": -4.51409387588501, + "logps/rejected": -5.959392070770264, + "loss": 0.431, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.51409387588501, + "rewards/margins": 1.4452987909317017, + "rewards/rejected": -5.959392070770264, + "sft_loss": 4.649843692779541, + "step": 4305 + }, + { + "epoch": 2.3067402575681553, + "grad_norm": 19.254904924716207, + "learning_rate": 1.5384524098227402e-07, + "logits/chosen": -0.22397704422473907, + "logits/rejected": -0.08937518298625946, + "logps/chosen": -4.677164554595947, + "logps/rejected": -6.227524757385254, + "loss": 0.3844, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.677164554595947, + "rewards/margins": 1.5503594875335693, + "rewards/rejected": -6.227524757385254, + "sft_loss": 4.709841251373291, + "step": 4310 + }, + { + "epoch": 2.3094162903495565, + "grad_norm": 24.604712868309882, + "learning_rate": 1.5272309918663974e-07, + "logits/chosen": -0.19657690823078156, + "logits/rejected": -0.07223434746265411, + "logps/chosen": -4.633805274963379, + "logps/rejected": -5.77435827255249, + "loss": 0.5182, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.633805274963379, + "rewards/margins": 1.1405527591705322, + "rewards/rejected": -5.77435827255249, + "sft_loss": 4.767401695251465, + "step": 4315 + }, + { + "epoch": 2.3120923231309582, + "grad_norm": 19.346911823012697, + "learning_rate": 1.516043266730201e-07, + "logits/chosen": -0.2063641995191574, + "logits/rejected": -0.09147273004055023, + "logps/chosen": -4.624153137207031, + "logps/rejected": -5.970458030700684, + "loss": 0.429, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.624153137207031, + "rewards/margins": 1.3463048934936523, + "rewards/rejected": -5.970458030700684, + "sft_loss": 4.563480377197266, + "step": 4320 + }, + { + "epoch": 2.31476835591236, + "grad_norm": 29.70356859619958, + "learning_rate": 1.504889342957512e-07, + "logits/chosen": -0.18906475603580475, + "logits/rejected": -0.07552754133939743, + "logps/chosen": -4.602474689483643, + "logps/rejected": -5.924898624420166, + "loss": 0.498, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.602474689483643, + "rewards/margins": 1.3224231004714966, + "rewards/rejected": -5.924898624420166, + "sft_loss": 4.635772705078125, + "step": 4325 + }, + { + "epoch": 2.3174443886937617, + "grad_norm": 20.36618070960126, + "learning_rate": 1.4937693287637453e-07, + "logits/chosen": -0.18279999494552612, + "logits/rejected": -0.06919268518686295, + "logps/chosen": -4.600367546081543, + "logps/rejected": -5.84440803527832, + "loss": 0.4585, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.600367546081543, + "rewards/margins": 1.2440404891967773, + "rewards/rejected": -5.84440803527832, + "sft_loss": 4.5165205001831055, + "step": 4330 + }, + { + "epoch": 2.320120421475163, + "grad_norm": 21.237739192090743, + "learning_rate": 1.4826833320353305e-07, + "logits/chosen": -0.1806211918592453, + "logits/rejected": -0.10852392017841339, + "logps/chosen": -4.569619655609131, + "logps/rejected": -5.892071723937988, + "loss": 0.438, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.569619655609131, + "rewards/margins": 1.3224519491195679, + "rewards/rejected": -5.892071723937988, + "sft_loss": 4.4747748374938965, + "step": 4335 + }, + { + "epoch": 2.3227964542565647, + "grad_norm": 21.58619347687164, + "learning_rate": 1.4716314603286528e-07, + "logits/chosen": -0.21733203530311584, + "logits/rejected": -0.08858387172222137, + "logps/chosen": -4.599829196929932, + "logps/rejected": -6.069632053375244, + "loss": 0.4195, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.599829196929932, + "rewards/margins": 1.4698026180267334, + "rewards/rejected": -6.069632053375244, + "sft_loss": 4.639847278594971, + "step": 4340 + }, + { + "epoch": 2.3254724870379664, + "grad_norm": 40.36680316737448, + "learning_rate": 1.4606138208690233e-07, + "logits/chosen": -0.20439603924751282, + "logits/rejected": -0.11707202345132828, + "logps/chosen": -4.695111274719238, + "logps/rejected": -5.897182464599609, + "loss": 0.5233, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.695111274719238, + "rewards/margins": 1.2020713090896606, + "rewards/rejected": -5.897182464599609, + "sft_loss": 4.595641136169434, + "step": 4345 + }, + { + "epoch": 2.3281485198193677, + "grad_norm": 18.09753407104881, + "learning_rate": 1.4496305205496251e-07, + "logits/chosen": -0.17222526669502258, + "logits/rejected": -0.0842718854546547, + "logps/chosen": -4.666041374206543, + "logps/rejected": -6.027068614959717, + "loss": 0.4485, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.666041374206543, + "rewards/margins": 1.361027479171753, + "rewards/rejected": -6.027068614959717, + "sft_loss": 4.675548553466797, + "step": 4350 + }, + { + "epoch": 2.3308245526007694, + "grad_norm": 16.052846367611284, + "learning_rate": 1.4386816659304895e-07, + "logits/chosen": -0.2250470370054245, + "logits/rejected": -0.11605800688266754, + "logps/chosen": -4.556109428405762, + "logps/rejected": -5.891709327697754, + "loss": 0.4295, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.556109428405762, + "rewards/margins": 1.33560049533844, + "rewards/rejected": -5.891709327697754, + "sft_loss": 4.656196594238281, + "step": 4355 + }, + { + "epoch": 2.333500585382171, + "grad_norm": 21.862687536384428, + "learning_rate": 1.4277673632374492e-07, + "logits/chosen": -0.25336316227912903, + "logits/rejected": -0.09333156794309616, + "logps/chosen": -4.592108726501465, + "logps/rejected": -5.923739433288574, + "loss": 0.4437, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.592108726501465, + "rewards/margins": 1.3316295146942139, + "rewards/rejected": -5.923739433288574, + "sft_loss": 4.607272148132324, + "step": 4360 + }, + { + "epoch": 2.3361766181635724, + "grad_norm": 19.63578607815804, + "learning_rate": 1.416887718361119e-07, + "logits/chosen": -0.13836193084716797, + "logits/rejected": -0.10894973576068878, + "logps/chosen": -4.5957818031311035, + "logps/rejected": -5.848198890686035, + "loss": 0.4655, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.5957818031311035, + "rewards/margins": 1.2524168491363525, + "rewards/rejected": -5.848198890686035, + "sft_loss": 4.647026062011719, + "step": 4365 + }, + { + "epoch": 2.338852650944974, + "grad_norm": 27.572479225734174, + "learning_rate": 1.406042836855859e-07, + "logits/chosen": -0.12179931253194809, + "logits/rejected": -0.0321027971804142, + "logps/chosen": -4.529701232910156, + "logps/rejected": -6.020511627197266, + "loss": 0.4097, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.529701232910156, + "rewards/margins": 1.4908112287521362, + "rewards/rejected": -6.020511627197266, + "sft_loss": 4.580036163330078, + "step": 4370 + }, + { + "epoch": 2.341528683726376, + "grad_norm": 24.36795285820144, + "learning_rate": 1.3952328239387595e-07, + "logits/chosen": -0.26527541875839233, + "logits/rejected": -0.08893623948097229, + "logps/chosen": -4.502415657043457, + "logps/rejected": -5.973431587219238, + "loss": 0.4135, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.502415657043457, + "rewards/margins": 1.4710155725479126, + "rewards/rejected": -5.973431587219238, + "sft_loss": 4.644287109375, + "step": 4375 + }, + { + "epoch": 2.344204716507777, + "grad_norm": 20.54890891777198, + "learning_rate": 1.3844577844886109e-07, + "logits/chosen": -0.20819087326526642, + "logits/rejected": -0.03349591791629791, + "logps/chosen": -4.512115478515625, + "logps/rejected": -5.900969982147217, + "loss": 0.453, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.512115478515625, + "rewards/margins": 1.3888546228408813, + "rewards/rejected": -5.900969982147217, + "sft_loss": 4.571647644042969, + "step": 4380 + }, + { + "epoch": 2.346880749289179, + "grad_norm": 23.865875017997862, + "learning_rate": 1.3737178230448955e-07, + "logits/chosen": -0.24579854309558868, + "logits/rejected": -0.1403156965970993, + "logps/chosen": -4.625274658203125, + "logps/rejected": -5.83488655090332, + "loss": 0.4773, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.625274658203125, + "rewards/margins": 1.2096123695373535, + "rewards/rejected": -5.83488655090332, + "sft_loss": 4.556929111480713, + "step": 4385 + }, + { + "epoch": 2.3495567820705805, + "grad_norm": 16.577622925228273, + "learning_rate": 1.363013043806764e-07, + "logits/chosen": -0.19563975930213928, + "logits/rejected": -0.08206583559513092, + "logps/chosen": -4.448563575744629, + "logps/rejected": -5.7410783767700195, + "loss": 0.44, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.448563575744629, + "rewards/margins": 1.2925150394439697, + "rewards/rejected": -5.7410783767700195, + "sft_loss": 4.540297031402588, + "step": 4390 + }, + { + "epoch": 2.3522328148519818, + "grad_norm": 19.63967959827264, + "learning_rate": 1.352343550632034e-07, + "logits/chosen": -0.17274947464466095, + "logits/rejected": -0.05672701448202133, + "logps/chosen": -4.5420379638671875, + "logps/rejected": -6.012210845947266, + "loss": 0.4476, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.5420379638671875, + "rewards/margins": 1.4701731204986572, + "rewards/rejected": -6.012210845947266, + "sft_loss": 4.543997764587402, + "step": 4395 + }, + { + "epoch": 2.3549088476333835, + "grad_norm": 18.225681233142744, + "learning_rate": 1.3417094470361722e-07, + "logits/chosen": -0.19474905729293823, + "logits/rejected": -0.0927257388830185, + "logps/chosen": -4.614598274230957, + "logps/rejected": -5.783566474914551, + "loss": 0.4844, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.614598274230957, + "rewards/margins": 1.1689684391021729, + "rewards/rejected": -5.783566474914551, + "sft_loss": 4.706032752990723, + "step": 4400 + }, + { + "epoch": 2.3549088476333835, + "eval_logits/chosen": 0.06405708938837051, + "eval_logits/rejected": 0.1450890153646469, + "eval_logps/chosen": -4.723513126373291, + "eval_logps/rejected": -5.77617883682251, + "eval_loss": 0.5826747417449951, + "eval_rewards/accuracies": 0.7314540147781372, + "eval_rewards/chosen": -4.723513126373291, + "eval_rewards/margins": 1.0526658296585083, + "eval_rewards/rejected": -5.77617883682251, + "eval_runtime": 43.1706, + "eval_samples_per_second": 31.155, + "eval_sft_loss": 4.6692376136779785, + "eval_steps_per_second": 7.806, + "step": 4400 + }, + { + "epoch": 2.357584880414785, + "grad_norm": 21.315599162846883, + "learning_rate": 1.3311108361913015e-07, + "logits/chosen": -0.2303951233625412, + "logits/rejected": -0.16923975944519043, + "logps/chosen": -4.504847049713135, + "logps/rejected": -5.79945182800293, + "loss": 0.4302, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.504847049713135, + "rewards/margins": 1.2946048974990845, + "rewards/rejected": -5.79945182800293, + "sft_loss": 4.5163960456848145, + "step": 4405 + }, + { + "epoch": 2.3602609131961865, + "grad_norm": 17.0429736757707, + "learning_rate": 1.3205478209251874e-07, + "logits/chosen": -0.15470165014266968, + "logits/rejected": -0.044807516038417816, + "logps/chosen": -4.7473931312561035, + "logps/rejected": -6.170556545257568, + "loss": 0.4241, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.7473931312561035, + "rewards/margins": 1.423163890838623, + "rewards/rejected": -6.170556545257568, + "sft_loss": 4.7713727951049805, + "step": 4410 + }, + { + "epoch": 2.362936945977588, + "grad_norm": 19.303030539992708, + "learning_rate": 1.310020503720254e-07, + "logits/chosen": -0.1884719729423523, + "logits/rejected": -0.06602375209331512, + "logps/chosen": -4.579176425933838, + "logps/rejected": -5.957730293273926, + "loss": 0.4416, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.579176425933838, + "rewards/margins": 1.3785539865493774, + "rewards/rejected": -5.957730293273926, + "sft_loss": 4.538491249084473, + "step": 4415 + }, + { + "epoch": 2.36561297875899, + "grad_norm": 25.84773812381602, + "learning_rate": 1.2995289867125752e-07, + "logits/chosen": -0.17406593263149261, + "logits/rejected": -0.09588642418384552, + "logps/chosen": -4.64821720123291, + "logps/rejected": -5.7471160888671875, + "loss": 0.4927, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.64821720123291, + "rewards/margins": 1.0988987684249878, + "rewards/rejected": -5.7471160888671875, + "sft_loss": 4.654828071594238, + "step": 4420 + }, + { + "epoch": 2.368289011540391, + "grad_norm": 15.604304686195245, + "learning_rate": 1.2890733716908986e-07, + "logits/chosen": -0.1851363182067871, + "logits/rejected": -0.08538545668125153, + "logps/chosen": -4.357602596282959, + "logps/rejected": -5.7189836502075195, + "loss": 0.3819, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.357602596282959, + "rewards/margins": 1.361380934715271, + "rewards/rejected": -5.7189836502075195, + "sft_loss": 4.434847354888916, + "step": 4425 + }, + { + "epoch": 2.370965044321793, + "grad_norm": 22.95940029833988, + "learning_rate": 1.2786537600956454e-07, + "logits/chosen": -0.21504803001880646, + "logits/rejected": -0.09006574004888535, + "logps/chosen": -4.535211563110352, + "logps/rejected": -5.892853736877441, + "loss": 0.4556, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.535211563110352, + "rewards/margins": 1.3576418161392212, + "rewards/rejected": -5.892853736877441, + "sft_loss": 4.4517669677734375, + "step": 4430 + }, + { + "epoch": 2.3736410771031946, + "grad_norm": 16.778468087674916, + "learning_rate": 1.268270253017933e-07, + "logits/chosen": -0.2044660598039627, + "logits/rejected": -0.057089339941740036, + "logps/chosen": -4.5894551277160645, + "logps/rejected": -5.9368085861206055, + "loss": 0.4431, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.5894551277160645, + "rewards/margins": 1.3473538160324097, + "rewards/rejected": -5.9368085861206055, + "sft_loss": 4.624999046325684, + "step": 4435 + }, + { + "epoch": 2.376317109884596, + "grad_norm": 18.588050354393058, + "learning_rate": 1.257922951198591e-07, + "logits/chosen": -0.2678273022174835, + "logits/rejected": -0.06287500262260437, + "logps/chosen": -4.446804523468018, + "logps/rejected": -5.7586870193481445, + "loss": 0.4534, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.446804523468018, + "rewards/margins": 1.3118834495544434, + "rewards/rejected": -5.7586870193481445, + "sft_loss": 4.468583583831787, + "step": 4440 + }, + { + "epoch": 2.3789931426659976, + "grad_norm": 22.527000565403007, + "learning_rate": 1.24761195502719e-07, + "logits/chosen": -0.21276633441448212, + "logits/rejected": -0.05496319383382797, + "logps/chosen": -4.646910667419434, + "logps/rejected": -5.7714104652404785, + "loss": 0.5162, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.646910667419434, + "rewards/margins": 1.124500036239624, + "rewards/rejected": -5.7714104652404785, + "sft_loss": 4.6487135887146, + "step": 4445 + }, + { + "epoch": 2.3816691754473993, + "grad_norm": 28.147353828645194, + "learning_rate": 1.2373373645410573e-07, + "logits/chosen": -0.1935139149427414, + "logits/rejected": -0.07589694857597351, + "logps/chosen": -4.688645362854004, + "logps/rejected": -6.109697341918945, + "loss": 0.4635, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.688645362854004, + "rewards/margins": 1.4210526943206787, + "rewards/rejected": -6.109697341918945, + "sft_loss": 4.691089630126953, + "step": 4450 + }, + { + "epoch": 2.384345208228801, + "grad_norm": 20.696087353859202, + "learning_rate": 1.2270992794243175e-07, + "logits/chosen": -0.2511764168739319, + "logits/rejected": -0.1509692221879959, + "logps/chosen": -4.562219619750977, + "logps/rejected": -5.868638038635254, + "loss": 0.4661, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.562219619750977, + "rewards/margins": 1.3064180612564087, + "rewards/rejected": -5.868638038635254, + "sft_loss": 4.586970329284668, + "step": 4455 + }, + { + "epoch": 2.3870212410102023, + "grad_norm": 15.486007590775456, + "learning_rate": 1.2168977990069147e-07, + "logits/chosen": -0.23120412230491638, + "logits/rejected": -0.0854119285941124, + "logps/chosen": -4.435896396636963, + "logps/rejected": -5.71600341796875, + "loss": 0.4603, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.435896396636963, + "rewards/margins": 1.2801072597503662, + "rewards/rejected": -5.71600341796875, + "sft_loss": 4.478748321533203, + "step": 4460 + }, + { + "epoch": 2.389697273791604, + "grad_norm": 20.515836091470668, + "learning_rate": 1.206733022263659e-07, + "logits/chosen": -0.2171090543270111, + "logits/rejected": -0.044152624905109406, + "logps/chosen": -4.647055149078369, + "logps/rejected": -5.933724880218506, + "loss": 0.4803, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.647055149078369, + "rewards/margins": 1.2866696119308472, + "rewards/rejected": -5.933724880218506, + "sft_loss": 4.59279727935791, + "step": 4465 + }, + { + "epoch": 2.3923733065730053, + "grad_norm": 20.383111324167274, + "learning_rate": 1.1966050478132572e-07, + "logits/chosen": -0.15860562026500702, + "logits/rejected": -0.07073559612035751, + "logps/chosen": -4.495818138122559, + "logps/rejected": -5.804797649383545, + "loss": 0.4818, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.495818138122559, + "rewards/margins": 1.3089797496795654, + "rewards/rejected": -5.804797649383545, + "sft_loss": 4.603878974914551, + "step": 4470 + }, + { + "epoch": 2.395049339354407, + "grad_norm": 20.539838960506497, + "learning_rate": 1.1865139739173635e-07, + "logits/chosen": -0.21058157086372375, + "logits/rejected": -0.04740305244922638, + "logps/chosen": -4.5614752769470215, + "logps/rejected": -5.795866966247559, + "loss": 0.4481, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.5614752769470215, + "rewards/margins": 1.234391450881958, + "rewards/rejected": -5.795866966247559, + "sft_loss": 4.5146684646606445, + "step": 4475 + }, + { + "epoch": 2.3977253721358087, + "grad_norm": 24.234417451454213, + "learning_rate": 1.1764598984796187e-07, + "logits/chosen": -0.21933972835540771, + "logits/rejected": -0.10762651264667511, + "logps/chosen": -4.50011682510376, + "logps/rejected": -5.7032012939453125, + "loss": 0.4548, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.50011682510376, + "rewards/margins": 1.2030847072601318, + "rewards/rejected": -5.7032012939453125, + "sft_loss": 4.520796775817871, + "step": 4480 + }, + { + "epoch": 2.4004014049172104, + "grad_norm": 25.196603293763232, + "learning_rate": 1.1664429190447095e-07, + "logits/chosen": -0.18634046614170074, + "logits/rejected": -0.10818295180797577, + "logps/chosen": -4.60623836517334, + "logps/rejected": -5.995389938354492, + "loss": 0.4354, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.60623836517334, + "rewards/margins": 1.389150619506836, + "rewards/rejected": -5.995389938354492, + "sft_loss": 4.585078239440918, + "step": 4485 + }, + { + "epoch": 2.4030774376986117, + "grad_norm": 29.397343565816843, + "learning_rate": 1.1564631327974122e-07, + "logits/chosen": -0.20830197632312775, + "logits/rejected": -0.05398184061050415, + "logps/chosen": -4.6167826652526855, + "logps/rejected": -5.977439880371094, + "loss": 0.4562, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.6167826652526855, + "rewards/margins": 1.3606570959091187, + "rewards/rejected": -5.977439880371094, + "sft_loss": 4.660190582275391, + "step": 4490 + }, + { + "epoch": 2.4057534704800134, + "grad_norm": 20.32175487272762, + "learning_rate": 1.1465206365616587e-07, + "logits/chosen": -0.29008620977401733, + "logits/rejected": -0.134123295545578, + "logps/chosen": -4.646373271942139, + "logps/rejected": -5.834511756896973, + "loss": 0.4876, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.646373271942139, + "rewards/margins": 1.1881383657455444, + "rewards/rejected": -5.834511756896973, + "sft_loss": 4.637969017028809, + "step": 4495 + }, + { + "epoch": 2.408429503261415, + "grad_norm": 20.418388075845467, + "learning_rate": 1.1366155267995887e-07, + "logits/chosen": -0.15654954314231873, + "logits/rejected": -0.11684336513280869, + "logps/chosen": -4.565447807312012, + "logps/rejected": -5.900856971740723, + "loss": 0.4304, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.565447807312012, + "rewards/margins": 1.3354084491729736, + "rewards/rejected": -5.900856971740723, + "sft_loss": 4.608338356018066, + "step": 4500 + }, + { + "epoch": 2.4111055360428164, + "grad_norm": 19.032283203730458, + "learning_rate": 1.1267478996106228e-07, + "logits/chosen": -0.17595365643501282, + "logits/rejected": -0.025563359260559082, + "logps/chosen": -4.589142322540283, + "logps/rejected": -5.8382744789123535, + "loss": 0.4695, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.589142322540283, + "rewards/margins": 1.2491323947906494, + "rewards/rejected": -5.8382744789123535, + "sft_loss": 4.566729545593262, + "step": 4505 + }, + { + "epoch": 2.413781568824218, + "grad_norm": 19.68571244852352, + "learning_rate": 1.116917850730521e-07, + "logits/chosen": -0.207304909825325, + "logits/rejected": -0.10162524878978729, + "logps/chosen": -4.6496076583862305, + "logps/rejected": -5.775534629821777, + "loss": 0.5329, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.6496076583862305, + "rewards/margins": 1.1259268522262573, + "rewards/rejected": -5.775534629821777, + "sft_loss": 4.658164024353027, + "step": 4510 + }, + { + "epoch": 2.41645760160562, + "grad_norm": 19.34129821855337, + "learning_rate": 1.1071254755304637e-07, + "logits/chosen": -0.21180382370948792, + "logits/rejected": -0.15992474555969238, + "logps/chosen": -4.42584228515625, + "logps/rejected": -5.605154514312744, + "loss": 0.4819, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.42584228515625, + "rewards/margins": 1.1793123483657837, + "rewards/rejected": -5.605154514312744, + "sft_loss": 4.382297515869141, + "step": 4515 + }, + { + "epoch": 2.419133634387021, + "grad_norm": 19.812438204851706, + "learning_rate": 1.0973708690161143e-07, + "logits/chosen": -0.19067828357219696, + "logits/rejected": -0.10383911430835724, + "logps/chosen": -4.532150745391846, + "logps/rejected": -5.899567127227783, + "loss": 0.4297, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.532150745391846, + "rewards/margins": 1.3674169778823853, + "rewards/rejected": -5.899567127227783, + "sft_loss": 4.567543029785156, + "step": 4520 + }, + { + "epoch": 2.421809667168423, + "grad_norm": 29.735821894641667, + "learning_rate": 1.0876541258267119e-07, + "logits/chosen": -0.2346051186323166, + "logits/rejected": -0.08108378946781158, + "logps/chosen": -4.545472145080566, + "logps/rejected": -5.927277565002441, + "loss": 0.4372, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.545472145080566, + "rewards/margins": 1.3818058967590332, + "rewards/rejected": -5.927277565002441, + "sft_loss": 4.522289752960205, + "step": 4525 + }, + { + "epoch": 2.4244856999498245, + "grad_norm": 23.66699357162162, + "learning_rate": 1.0779753402341379e-07, + "logits/chosen": -0.21056047081947327, + "logits/rejected": -0.13503794372081757, + "logps/chosen": -4.474379539489746, + "logps/rejected": -5.554043769836426, + "loss": 0.5149, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.474379539489746, + "rewards/margins": 1.079664707183838, + "rewards/rejected": -5.554043769836426, + "sft_loss": 4.390741348266602, + "step": 4530 + }, + { + "epoch": 2.427161732731226, + "grad_norm": 21.118495725284937, + "learning_rate": 1.0683346061420157e-07, + "logits/chosen": -0.11235556751489639, + "logits/rejected": -0.03817780688405037, + "logps/chosen": -4.439657688140869, + "logps/rejected": -5.766303062438965, + "loss": 0.4661, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.439657688140869, + "rewards/margins": 1.3266453742980957, + "rewards/rejected": -5.766303062438965, + "sft_loss": 4.571423053741455, + "step": 4535 + }, + { + "epoch": 2.4298377655126275, + "grad_norm": 19.0949745109115, + "learning_rate": 1.0587320170847874e-07, + "logits/chosen": -0.1611403375864029, + "logits/rejected": -0.05527140945196152, + "logps/chosen": -4.475289821624756, + "logps/rejected": -5.60997200012207, + "loss": 0.5012, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.475289821624756, + "rewards/margins": 1.1346817016601562, + "rewards/rejected": -5.60997200012207, + "sft_loss": 4.55513858795166, + "step": 4540 + }, + { + "epoch": 2.4325137982940293, + "grad_norm": 17.935024919965333, + "learning_rate": 1.0491676662268156e-07, + "logits/chosen": -0.1411706805229187, + "logits/rejected": -0.05848114565014839, + "logps/chosen": -4.385329246520996, + "logps/rejected": -5.652647972106934, + "loss": 0.4731, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.385329246520996, + "rewards/margins": 1.2673189640045166, + "rewards/rejected": -5.652647972106934, + "sft_loss": 4.418089389801025, + "step": 4545 + }, + { + "epoch": 2.4351898310754305, + "grad_norm": 23.491598615059704, + "learning_rate": 1.0396416463614732e-07, + "logits/chosen": -0.23309126496315002, + "logits/rejected": -0.13317319750785828, + "logps/chosen": -4.3950018882751465, + "logps/rejected": -5.676304817199707, + "loss": 0.4687, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.3950018882751465, + "rewards/margins": 1.2813031673431396, + "rewards/rejected": -5.676304817199707, + "sft_loss": 4.40733528137207, + "step": 4550 + }, + { + "epoch": 2.4378658638568322, + "grad_norm": 19.125890362345572, + "learning_rate": 1.0301540499102479e-07, + "logits/chosen": -0.1768358051776886, + "logits/rejected": -0.07692781090736389, + "logps/chosen": -4.521668434143066, + "logps/rejected": -5.560128211975098, + "loss": 0.5188, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.521668434143066, + "rewards/margins": 1.0384600162506104, + "rewards/rejected": -5.560128211975098, + "sft_loss": 4.5766682624816895, + "step": 4555 + }, + { + "epoch": 2.440541896638234, + "grad_norm": 26.133069340360283, + "learning_rate": 1.0207049689218405e-07, + "logits/chosen": -0.1954447627067566, + "logits/rejected": -0.040132030844688416, + "logps/chosen": -4.481562614440918, + "logps/rejected": -5.8571977615356445, + "loss": 0.4486, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.481562614440918, + "rewards/margins": 1.3756355047225952, + "rewards/rejected": -5.8571977615356445, + "sft_loss": 4.460504055023193, + "step": 4560 + }, + { + "epoch": 2.4432179294196352, + "grad_norm": 17.47989034183169, + "learning_rate": 1.0112944950712782e-07, + "logits/chosen": -0.19489526748657227, + "logits/rejected": -0.08836908638477325, + "logps/chosen": -4.388577461242676, + "logps/rejected": -5.794994354248047, + "loss": 0.4129, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.388577461242676, + "rewards/margins": 1.4064165353775024, + "rewards/rejected": -5.794994354248047, + "sft_loss": 4.320601463317871, + "step": 4565 + }, + { + "epoch": 2.445893962201037, + "grad_norm": 20.420147657931476, + "learning_rate": 1.0019227196590174e-07, + "logits/chosen": -0.14283771812915802, + "logits/rejected": -0.031055014580488205, + "logps/chosen": -4.619053363800049, + "logps/rejected": -5.802786827087402, + "loss": 0.506, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.619053363800049, + "rewards/margins": 1.1837328672409058, + "rewards/rejected": -5.802786827087402, + "sft_loss": 4.631751537322998, + "step": 4570 + }, + { + "epoch": 2.4485699949824387, + "grad_norm": 16.9951576504403, + "learning_rate": 9.925897336100664e-08, + "logits/chosen": -0.1547360122203827, + "logits/rejected": -0.07837474346160889, + "logps/chosen": -4.421803951263428, + "logps/rejected": -5.884800910949707, + "loss": 0.4068, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.421803951263428, + "rewards/margins": 1.4629971981048584, + "rewards/rejected": -5.884800910949707, + "sft_loss": 4.476590156555176, + "step": 4575 + }, + { + "epoch": 2.45124602776384, + "grad_norm": 24.62143076337965, + "learning_rate": 9.832956274730946e-08, + "logits/chosen": -0.19433817267417908, + "logits/rejected": -0.1463998556137085, + "logps/chosen": -4.4533257484436035, + "logps/rejected": -5.545527935028076, + "loss": 0.509, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.4533257484436035, + "rewards/margins": 1.0922021865844727, + "rewards/rejected": -5.545527935028076, + "sft_loss": 4.443113803863525, + "step": 4580 + }, + { + "epoch": 2.4539220605452416, + "grad_norm": 18.97664228771073, + "learning_rate": 9.740404914195633e-08, + "logits/chosen": -0.20991036295890808, + "logits/rejected": -0.07485126703977585, + "logps/chosen": -4.437937259674072, + "logps/rejected": -5.744898796081543, + "loss": 0.4425, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.437937259674072, + "rewards/margins": 1.3069615364074707, + "rewards/rejected": -5.744898796081543, + "sft_loss": 4.504698276519775, + "step": 4585 + }, + { + "epoch": 2.4565980933266434, + "grad_norm": 15.47708517526208, + "learning_rate": 9.648244152428392e-08, + "logits/chosen": -0.21960651874542236, + "logits/rejected": -0.11510632187128067, + "logps/chosen": -4.449195861816406, + "logps/rejected": -5.644477844238281, + "loss": 0.4695, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.449195861816406, + "rewards/margins": 1.1952823400497437, + "rewards/rejected": -5.644477844238281, + "sft_loss": 4.520184516906738, + "step": 4590 + }, + { + "epoch": 2.4592741261080446, + "grad_norm": 18.900248936085628, + "learning_rate": 9.556474883573379e-08, + "logits/chosen": -0.2260456085205078, + "logits/rejected": -0.11913847923278809, + "logps/chosen": -4.4166579246521, + "logps/rejected": -5.84757137298584, + "loss": 0.4537, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.4166579246521, + "rewards/margins": 1.4309136867523193, + "rewards/rejected": -5.84757137298584, + "sft_loss": 4.432002544403076, + "step": 4595 + }, + { + "epoch": 2.4619501588894463, + "grad_norm": 14.872545795163669, + "learning_rate": 9.465097997976412e-08, + "logits/chosen": -0.19411954283714294, + "logits/rejected": -0.03489295765757561, + "logps/chosen": -4.461108684539795, + "logps/rejected": -5.979926109313965, + "loss": 0.3952, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.461108684539795, + "rewards/margins": 1.5188171863555908, + "rewards/rejected": -5.979926109313965, + "sft_loss": 4.563147068023682, + "step": 4600 + }, + { + "epoch": 2.464626191670848, + "grad_norm": 18.348094072972412, + "learning_rate": 9.374114382176457e-08, + "logits/chosen": -0.1995922327041626, + "logits/rejected": -0.08010158687829971, + "logps/chosen": -4.613968372344971, + "logps/rejected": -5.970013618469238, + "loss": 0.444, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.613968372344971, + "rewards/margins": 1.356046199798584, + "rewards/rejected": -5.970013618469238, + "sft_loss": 4.647368907928467, + "step": 4605 + }, + { + "epoch": 2.46730222445225, + "grad_norm": 21.471179179679307, + "learning_rate": 9.283524918896945e-08, + "logits/chosen": -0.20935878157615662, + "logits/rejected": -0.1149827390909195, + "logps/chosen": -4.587584495544434, + "logps/rejected": -5.908937931060791, + "loss": 0.472, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.587584495544434, + "rewards/margins": 1.321352243423462, + "rewards/rejected": -5.908937931060791, + "sft_loss": 4.5859479904174805, + "step": 4610 + }, + { + "epoch": 2.469978257233651, + "grad_norm": 21.406477229056268, + "learning_rate": 9.193330487037232e-08, + "logits/chosen": -0.14232993125915527, + "logits/rejected": 0.005499015562236309, + "logps/chosen": -4.5797929763793945, + "logps/rejected": -5.923649787902832, + "loss": 0.4475, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.5797929763793945, + "rewards/margins": 1.3438562154769897, + "rewards/rejected": -5.923649787902832, + "sft_loss": 4.64937162399292, + "step": 4615 + }, + { + "epoch": 2.4726542900150528, + "grad_norm": 17.06758141464982, + "learning_rate": 9.103531961664118e-08, + "logits/chosen": -0.17623497545719147, + "logits/rejected": -0.02270549163222313, + "logps/chosen": -4.4073710441589355, + "logps/rejected": -5.731764793395996, + "loss": 0.4124, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.4073710441589355, + "rewards/margins": 1.3243931531906128, + "rewards/rejected": -5.731764793395996, + "sft_loss": 4.500627517700195, + "step": 4620 + }, + { + "epoch": 2.475330322796454, + "grad_norm": 19.144035150054716, + "learning_rate": 9.014130214003269e-08, + "logits/chosen": -0.2248564213514328, + "logits/rejected": -0.2040160447359085, + "logps/chosen": -4.470706939697266, + "logps/rejected": -5.8308563232421875, + "loss": 0.4544, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.470706939697266, + "rewards/margins": 1.3601499795913696, + "rewards/rejected": -5.8308563232421875, + "sft_loss": 4.481385231018066, + "step": 4625 + }, + { + "epoch": 2.4780063555778558, + "grad_norm": 22.95375049487042, + "learning_rate": 8.925126111430848e-08, + "logits/chosen": -0.1677057445049286, + "logits/rejected": -0.08975866436958313, + "logps/chosen": -4.464261531829834, + "logps/rejected": -5.763180732727051, + "loss": 0.4599, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.464261531829834, + "rewards/margins": 1.2989187240600586, + "rewards/rejected": -5.763180732727051, + "sft_loss": 4.50925874710083, + "step": 4630 + }, + { + "epoch": 2.4806823883592575, + "grad_norm": 24.606368604482732, + "learning_rate": 8.83652051746504e-08, + "logits/chosen": -0.08617188036441803, + "logits/rejected": 0.027888232842087746, + "logps/chosen": -4.720548629760742, + "logps/rejected": -6.075860023498535, + "loss": 0.4594, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.720548629760742, + "rewards/margins": 1.3553111553192139, + "rewards/rejected": -6.075860023498535, + "sft_loss": 4.730906009674072, + "step": 4635 + }, + { + "epoch": 2.483358421140659, + "grad_norm": 20.88976247783765, + "learning_rate": 8.748314291757696e-08, + "logits/chosen": -0.17314712703227997, + "logits/rejected": -0.0717170462012291, + "logps/chosen": -4.5507965087890625, + "logps/rejected": -5.720621585845947, + "loss": 0.4765, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.5507965087890625, + "rewards/margins": 1.1698250770568848, + "rewards/rejected": -5.720621585845947, + "sft_loss": 4.556031703948975, + "step": 4640 + }, + { + "epoch": 2.4860344539220605, + "grad_norm": 21.419404863988852, + "learning_rate": 8.660508290086032e-08, + "logits/chosen": -0.1612890064716339, + "logits/rejected": -0.03897259384393692, + "logps/chosen": -4.47631311416626, + "logps/rejected": -5.901510715484619, + "loss": 0.4277, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.47631311416626, + "rewards/margins": 1.4251978397369385, + "rewards/rejected": -5.901510715484619, + "sft_loss": 4.563240051269531, + "step": 4645 + }, + { + "epoch": 2.488710486703462, + "grad_norm": 22.99349099347903, + "learning_rate": 8.573103364344231e-08, + "logits/chosen": -0.19505569338798523, + "logits/rejected": -0.03052748367190361, + "logps/chosen": -4.442262649536133, + "logps/rejected": -5.754072189331055, + "loss": 0.455, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.442262649536133, + "rewards/margins": 1.311809778213501, + "rewards/rejected": -5.754072189331055, + "sft_loss": 4.400628089904785, + "step": 4650 + }, + { + "epoch": 2.4913865194848634, + "grad_norm": 24.39677745821148, + "learning_rate": 8.486100362535292e-08, + "logits/chosen": -0.21414044499397278, + "logits/rejected": -0.08336290717124939, + "logps/chosen": -4.606078624725342, + "logps/rejected": -5.677779197692871, + "loss": 0.5027, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.606078624725342, + "rewards/margins": 1.0717008113861084, + "rewards/rejected": -5.677779197692871, + "sft_loss": 4.675948143005371, + "step": 4655 + }, + { + "epoch": 2.494062552266265, + "grad_norm": 16.67601373235496, + "learning_rate": 8.399500128762693e-08, + "logits/chosen": -0.21918518841266632, + "logits/rejected": -0.11786095798015594, + "logps/chosen": -4.559045314788818, + "logps/rejected": -5.861749172210693, + "loss": 0.4316, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.559045314788818, + "rewards/margins": 1.3027039766311646, + "rewards/rejected": -5.861749172210693, + "sft_loss": 4.51124906539917, + "step": 4660 + }, + { + "epoch": 2.496738585047667, + "grad_norm": 24.482882384709082, + "learning_rate": 8.313303503222313e-08, + "logits/chosen": -0.17604468762874603, + "logits/rejected": -0.08922283351421356, + "logps/chosen": -4.497008800506592, + "logps/rejected": -5.698709487915039, + "loss": 0.4736, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.497008800506592, + "rewards/margins": 1.2017009258270264, + "rewards/rejected": -5.698709487915039, + "sft_loss": 4.484135627746582, + "step": 4665 + }, + { + "epoch": 2.4994146178290686, + "grad_norm": 23.61614221979428, + "learning_rate": 8.227511322194164e-08, + "logits/chosen": -0.18093156814575195, + "logits/rejected": -0.06446141004562378, + "logps/chosen": -4.420372486114502, + "logps/rejected": -5.614178657531738, + "loss": 0.4644, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.420372486114502, + "rewards/margins": 1.1938055753707886, + "rewards/rejected": -5.614178657531738, + "sft_loss": 4.375947952270508, + "step": 4670 + }, + { + "epoch": 2.50209065061047, + "grad_norm": 20.222681675125408, + "learning_rate": 8.142124418034385e-08, + "logits/chosen": -0.14194490015506744, + "logits/rejected": -0.02431192621588707, + "logps/chosen": -4.505153179168701, + "logps/rejected": -5.789482593536377, + "loss": 0.4896, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.505153179168701, + "rewards/margins": 1.284328579902649, + "rewards/rejected": -5.789482593536377, + "sft_loss": 4.513728141784668, + "step": 4675 + }, + { + "epoch": 2.5047666833918716, + "grad_norm": 26.123162472684733, + "learning_rate": 8.057143619167073e-08, + "logits/chosen": -0.13329777121543884, + "logits/rejected": -0.041541434824466705, + "logps/chosen": -4.424647331237793, + "logps/rejected": -5.76423978805542, + "loss": 0.4591, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.424647331237793, + "rewards/margins": 1.339592695236206, + "rewards/rejected": -5.76423978805542, + "sft_loss": 4.433927536010742, + "step": 4680 + }, + { + "epoch": 2.507442716173273, + "grad_norm": 18.289289506883286, + "learning_rate": 7.97256975007633e-08, + "logits/chosen": -0.19680531322956085, + "logits/rejected": -0.023869935423135757, + "logps/chosen": -4.395602226257324, + "logps/rejected": -5.696691036224365, + "loss": 0.4444, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.395602226257324, + "rewards/margins": 1.3010880947113037, + "rewards/rejected": -5.696691036224365, + "sft_loss": 4.393080711364746, + "step": 4685 + }, + { + "epoch": 2.5101187489546746, + "grad_norm": 23.42798024718646, + "learning_rate": 7.888403631298186e-08, + "logits/chosen": -0.16270551085472107, + "logits/rejected": -0.09330668300390244, + "logps/chosen": -4.4305620193481445, + "logps/rejected": -5.681832313537598, + "loss": 0.4913, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.4305620193481445, + "rewards/margins": 1.2512702941894531, + "rewards/rejected": -5.681832313537598, + "sft_loss": 4.389323711395264, + "step": 4690 + }, + { + "epoch": 2.5127947817360763, + "grad_norm": 18.106279307173804, + "learning_rate": 7.804646079412719e-08, + "logits/chosen": -0.12708857655525208, + "logits/rejected": 0.012785923667252064, + "logps/chosen": -4.554836750030518, + "logps/rejected": -5.922824859619141, + "loss": 0.4373, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.554836750030518, + "rewards/margins": 1.367988109588623, + "rewards/rejected": -5.922824859619141, + "sft_loss": 4.59060525894165, + "step": 4695 + }, + { + "epoch": 2.515470814517478, + "grad_norm": 20.937481813773303, + "learning_rate": 7.72129790703604e-08, + "logits/chosen": -0.23682132363319397, + "logits/rejected": -0.12528641521930695, + "logps/chosen": -4.4873552322387695, + "logps/rejected": -5.65595006942749, + "loss": 0.4756, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.4873552322387695, + "rewards/margins": 1.1685948371887207, + "rewards/rejected": -5.65595006942749, + "sft_loss": 4.530056953430176, + "step": 4700 + }, + { + "epoch": 2.5181468472988793, + "grad_norm": 21.90677504227491, + "learning_rate": 7.638359922812504e-08, + "logits/chosen": -0.16787834465503693, + "logits/rejected": -0.11186468601226807, + "logps/chosen": -4.4697723388671875, + "logps/rejected": -5.794044494628906, + "loss": 0.4555, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.4697723388671875, + "rewards/margins": 1.3242720365524292, + "rewards/rejected": -5.794044494628906, + "sft_loss": 4.4074015617370605, + "step": 4705 + }, + { + "epoch": 2.520822880080281, + "grad_norm": 30.185251138033113, + "learning_rate": 7.555832931406774e-08, + "logits/chosen": -0.20095805823802948, + "logits/rejected": -0.054882220923900604, + "logps/chosen": -4.525169372558594, + "logps/rejected": -5.853812217712402, + "loss": 0.4558, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.525169372558594, + "rewards/margins": 1.3286423683166504, + "rewards/rejected": -5.853812217712402, + "sft_loss": 4.510501384735107, + "step": 4710 + }, + { + "epoch": 2.5234989128616827, + "grad_norm": 16.891355001288364, + "learning_rate": 7.47371773349611e-08, + "logits/chosen": -0.12636831402778625, + "logits/rejected": -0.0890660285949707, + "logps/chosen": -4.470862865447998, + "logps/rejected": -5.971408843994141, + "loss": 0.3888, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.470862865447998, + "rewards/margins": 1.5005453824996948, + "rewards/rejected": -5.971408843994141, + "sft_loss": 4.454566955566406, + "step": 4715 + }, + { + "epoch": 2.526174945643084, + "grad_norm": 22.97275851657326, + "learning_rate": 7.392015125762496e-08, + "logits/chosen": -0.2002909630537033, + "logits/rejected": -0.07835756242275238, + "logps/chosen": -4.467009544372559, + "logps/rejected": -5.867058753967285, + "loss": 0.4153, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.467009544372559, + "rewards/margins": 1.4000494480133057, + "rewards/rejected": -5.867058753967285, + "sft_loss": 4.419620513916016, + "step": 4720 + }, + { + "epoch": 2.5288509784244857, + "grad_norm": 21.449232242890638, + "learning_rate": 7.310725900885018e-08, + "logits/chosen": -0.20349101722240448, + "logits/rejected": -0.12118975818157196, + "logps/chosen": -4.590170383453369, + "logps/rejected": -5.913938999176025, + "loss": 0.4874, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.590170383453369, + "rewards/margins": 1.3237687349319458, + "rewards/rejected": -5.913938999176025, + "sft_loss": 4.63556432723999, + "step": 4725 + }, + { + "epoch": 2.5315270112058874, + "grad_norm": 23.121120970429534, + "learning_rate": 7.229850847532076e-08, + "logits/chosen": -0.1534847766160965, + "logits/rejected": -0.010326864197850227, + "logps/chosen": -4.503686428070068, + "logps/rejected": -5.993782043457031, + "loss": 0.3923, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.503686428070068, + "rewards/margins": 1.490094542503357, + "rewards/rejected": -5.993782043457031, + "sft_loss": 4.55232572555542, + "step": 4730 + }, + { + "epoch": 2.5342030439872887, + "grad_norm": 23.65333237348769, + "learning_rate": 7.149390750353779e-08, + "logits/chosen": -0.13787660002708435, + "logits/rejected": -0.1245235949754715, + "logps/chosen": -4.618861198425293, + "logps/rejected": -5.837153434753418, + "loss": 0.4411, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.618861198425293, + "rewards/margins": 1.2182929515838623, + "rewards/rejected": -5.837153434753418, + "sft_loss": 4.580845832824707, + "step": 4735 + }, + { + "epoch": 2.5368790767686904, + "grad_norm": 15.40543868202436, + "learning_rate": 7.069346389974374e-08, + "logits/chosen": -0.16966880857944489, + "logits/rejected": -0.0535128228366375, + "logps/chosen": -4.637502670288086, + "logps/rejected": -5.820466995239258, + "loss": 0.4795, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.637502670288086, + "rewards/margins": 1.1829637289047241, + "rewards/rejected": -5.820466995239258, + "sft_loss": 4.6907525062561035, + "step": 4740 + }, + { + "epoch": 2.539555109550092, + "grad_norm": 21.93460157693937, + "learning_rate": 6.989718542984563e-08, + "logits/chosen": -0.16312888264656067, + "logits/rejected": -0.10609817504882812, + "logps/chosen": -4.652926445007324, + "logps/rejected": -5.991219520568848, + "loss": 0.4601, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.652926445007324, + "rewards/margins": 1.3382928371429443, + "rewards/rejected": -5.991219520568848, + "sft_loss": 4.78752326965332, + "step": 4745 + }, + { + "epoch": 2.5422311423314934, + "grad_norm": 17.835844674143832, + "learning_rate": 6.9105079819341e-08, + "logits/chosen": -0.13755422830581665, + "logits/rejected": 0.015838632360100746, + "logps/chosen": -4.4717793464660645, + "logps/rejected": -6.0261688232421875, + "loss": 0.3881, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.4717793464660645, + "rewards/margins": 1.55439031124115, + "rewards/rejected": -6.0261688232421875, + "sft_loss": 4.499505996704102, + "step": 4750 + }, + { + "epoch": 2.544907175112895, + "grad_norm": 19.09786893711236, + "learning_rate": 6.831715475324163e-08, + "logits/chosen": -0.21794326603412628, + "logits/rejected": -0.10194908082485199, + "logps/chosen": -4.589028358459473, + "logps/rejected": -6.074400424957275, + "loss": 0.4358, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.589028358459473, + "rewards/margins": 1.485371708869934, + "rewards/rejected": -6.074400424957275, + "sft_loss": 4.711187362670898, + "step": 4755 + }, + { + "epoch": 2.547583207894297, + "grad_norm": 18.429959869226373, + "learning_rate": 6.753341787600026e-08, + "logits/chosen": -0.19435083866119385, + "logits/rejected": -0.1180337518453598, + "logps/chosen": -4.5188517570495605, + "logps/rejected": -6.058254718780518, + "loss": 0.3959, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.5188517570495605, + "rewards/margins": 1.5394032001495361, + "rewards/rejected": -6.058254718780518, + "sft_loss": 4.598109722137451, + "step": 4760 + }, + { + "epoch": 2.5502592406756985, + "grad_norm": 23.62838873223034, + "learning_rate": 6.67538767914353e-08, + "logits/chosen": -0.21478557586669922, + "logits/rejected": -0.09002258628606796, + "logps/chosen": -4.732692241668701, + "logps/rejected": -5.903644561767578, + "loss": 0.4967, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.732692241668701, + "rewards/margins": 1.1709522008895874, + "rewards/rejected": -5.903644561767578, + "sft_loss": 4.822795391082764, + "step": 4765 + }, + { + "epoch": 2.5529352734571, + "grad_norm": 24.455111835636977, + "learning_rate": 6.597853906265793e-08, + "logits/chosen": -0.1903330385684967, + "logits/rejected": -0.08132892847061157, + "logps/chosen": -4.575113773345947, + "logps/rejected": -6.173352241516113, + "loss": 0.4173, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.575113773345947, + "rewards/margins": 1.5982379913330078, + "rewards/rejected": -6.173352241516113, + "sft_loss": 4.533818244934082, + "step": 4770 + }, + { + "epoch": 2.5556113062385015, + "grad_norm": 25.654521999410346, + "learning_rate": 6.5207412211998e-08, + "logits/chosen": -0.08815367519855499, + "logits/rejected": -0.01084714476019144, + "logps/chosen": -4.67705774307251, + "logps/rejected": -5.991430282592773, + "loss": 0.5073, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.67705774307251, + "rewards/margins": 1.3143724203109741, + "rewards/rejected": -5.991430282592773, + "sft_loss": 4.634039878845215, + "step": 4775 + }, + { + "epoch": 2.558287339019903, + "grad_norm": 16.36428300774091, + "learning_rate": 6.444050372093186e-08, + "logits/chosen": -0.21527810394763947, + "logits/rejected": -0.10454368591308594, + "logps/chosen": -4.534337043762207, + "logps/rejected": -5.820903778076172, + "loss": 0.4397, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.534337043762207, + "rewards/margins": 1.2865673303604126, + "rewards/rejected": -5.820903778076172, + "sft_loss": 4.576798439025879, + "step": 4780 + }, + { + "epoch": 2.5609633718013045, + "grad_norm": 24.91226273927461, + "learning_rate": 6.367782103000873e-08, + "logits/chosen": -0.16502061486244202, + "logits/rejected": -0.11993427574634552, + "logps/chosen": -4.4962334632873535, + "logps/rejected": -5.591463088989258, + "loss": 0.5122, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.4962334632873535, + "rewards/margins": 1.0952298641204834, + "rewards/rejected": -5.591463088989258, + "sft_loss": 4.489527702331543, + "step": 4785 + }, + { + "epoch": 2.5636394045827062, + "grad_norm": 20.207249255688556, + "learning_rate": 6.29193715387798e-08, + "logits/chosen": -0.22081604599952698, + "logits/rejected": -0.12781080603599548, + "logps/chosen": -4.531004905700684, + "logps/rejected": -5.898108959197998, + "loss": 0.4655, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.531004905700684, + "rewards/margins": 1.3671048879623413, + "rewards/rejected": -5.898108959197998, + "sft_loss": 4.539250373840332, + "step": 4790 + }, + { + "epoch": 2.566315437364108, + "grad_norm": 25.688124298764876, + "learning_rate": 6.216516260572502e-08, + "logits/chosen": -0.15706852078437805, + "logits/rejected": -0.07518056035041809, + "logps/chosen": -4.669719696044922, + "logps/rejected": -5.945387363433838, + "loss": 0.4959, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.669719696044922, + "rewards/margins": 1.275667667388916, + "rewards/rejected": -5.945387363433838, + "sft_loss": 4.66204309463501, + "step": 4795 + }, + { + "epoch": 2.568991470145509, + "grad_norm": 15.85847980111402, + "learning_rate": 6.141520154818297e-08, + "logits/chosen": -0.16903331875801086, + "logits/rejected": -0.08962948620319366, + "logps/chosen": -4.538792610168457, + "logps/rejected": -5.683047771453857, + "loss": 0.488, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.538792610168457, + "rewards/margins": 1.1442553997039795, + "rewards/rejected": -5.683047771453857, + "sft_loss": 4.609192848205566, + "step": 4800 + }, + { + "epoch": 2.568991470145509, + "eval_logits/chosen": 0.04863176867365837, + "eval_logits/rejected": 0.12808875739574432, + "eval_logps/chosen": -4.621333599090576, + "eval_logps/rejected": -5.670347690582275, + "eval_loss": 0.5791551470756531, + "eval_rewards/accuracies": 0.7314540147781372, + "eval_rewards/chosen": -4.621333599090576, + "eval_rewards/margins": 1.0490138530731201, + "eval_rewards/rejected": -5.670347690582275, + "eval_runtime": 43.0522, + "eval_samples_per_second": 31.241, + "eval_sft_loss": 4.5805439949035645, + "eval_steps_per_second": 7.828, + "step": 4800 + }, + { + "epoch": 2.571667502926911, + "grad_norm": 28.40401193081405, + "learning_rate": 6.066949564227897e-08, + "logits/chosen": -0.22222542762756348, + "logits/rejected": -0.12885430455207825, + "logps/chosen": -4.472836494445801, + "logps/rejected": -5.696991920471191, + "loss": 0.5093, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.472836494445801, + "rewards/margins": 1.2241547107696533, + "rewards/rejected": -5.696991920471191, + "sft_loss": 4.47830867767334, + "step": 4805 + }, + { + "epoch": 2.574343535708312, + "grad_norm": 19.21048232317064, + "learning_rate": 5.992805212285523e-08, + "logits/chosen": -0.1699303239583969, + "logits/rejected": -0.09898178279399872, + "logps/chosen": -4.3903422355651855, + "logps/rejected": -5.721817970275879, + "loss": 0.4544, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.3903422355651855, + "rewards/margins": 1.3314763307571411, + "rewards/rejected": -5.721817970275879, + "sft_loss": 4.442036151885986, + "step": 4810 + }, + { + "epoch": 2.577019568489714, + "grad_norm": 23.669309007915814, + "learning_rate": 5.9190878183399684e-08, + "logits/chosen": -0.1649537980556488, + "logits/rejected": -0.08180878311395645, + "logps/chosen": -4.342033863067627, + "logps/rejected": -5.731744766235352, + "loss": 0.4991, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.342033863067627, + "rewards/margins": 1.3897111415863037, + "rewards/rejected": -5.731744766235352, + "sft_loss": 4.478572845458984, + "step": 4815 + }, + { + "epoch": 2.5796956012711156, + "grad_norm": 23.418773392822963, + "learning_rate": 5.845798097597748e-08, + "logits/chosen": -0.15347550809383392, + "logits/rejected": -0.06517787277698517, + "logps/chosen": -4.519292831420898, + "logps/rejected": -5.659243583679199, + "loss": 0.4812, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.519292831420898, + "rewards/margins": 1.1399506330490112, + "rewards/rejected": -5.659243583679199, + "sft_loss": 4.461321830749512, + "step": 4820 + }, + { + "epoch": 2.5823716340525174, + "grad_norm": 21.89645608677314, + "learning_rate": 5.772936761116026e-08, + "logits/chosen": -0.1391296088695526, + "logits/rejected": -0.016931544989347458, + "logps/chosen": -4.406808376312256, + "logps/rejected": -5.737373352050781, + "loss": 0.444, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.406808376312256, + "rewards/margins": 1.3305647373199463, + "rewards/rejected": -5.737373352050781, + "sft_loss": 4.336057662963867, + "step": 4825 + }, + { + "epoch": 2.5850476668339186, + "grad_norm": 26.078054368087265, + "learning_rate": 5.700504515795829e-08, + "logits/chosen": -0.1936092972755432, + "logits/rejected": -0.07124066352844238, + "logps/chosen": -4.561959266662598, + "logps/rejected": -5.777171611785889, + "loss": 0.4782, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.561959266662598, + "rewards/margins": 1.2152132987976074, + "rewards/rejected": -5.777171611785889, + "sft_loss": 4.577217102050781, + "step": 4830 + }, + { + "epoch": 2.5877236996153203, + "grad_norm": 24.452744091437754, + "learning_rate": 5.628502064375101e-08, + "logits/chosen": -0.2836676239967346, + "logits/rejected": -0.13801008462905884, + "logps/chosen": -4.371440410614014, + "logps/rejected": -5.8245134353637695, + "loss": 0.3939, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.371440410614014, + "rewards/margins": 1.4530731439590454, + "rewards/rejected": -5.8245134353637695, + "sft_loss": 4.396610736846924, + "step": 4835 + }, + { + "epoch": 2.5903997323967216, + "grad_norm": 26.04172258859895, + "learning_rate": 5.55693010542197e-08, + "logits/chosen": -0.22704604268074036, + "logits/rejected": -0.06665907800197601, + "logps/chosen": -4.336719989776611, + "logps/rejected": -5.717637538909912, + "loss": 0.4088, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.336719989776611, + "rewards/margins": 1.3809177875518799, + "rewards/rejected": -5.717637538909912, + "sft_loss": 4.287871360778809, + "step": 4840 + }, + { + "epoch": 2.5930757651781233, + "grad_norm": 19.417064570155066, + "learning_rate": 5.485789333327856e-08, + "logits/chosen": -0.17872127890586853, + "logits/rejected": -0.13257905840873718, + "logps/chosen": -4.509469985961914, + "logps/rejected": -5.748656272888184, + "loss": 0.4598, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.509469985961914, + "rewards/margins": 1.2391860485076904, + "rewards/rejected": -5.748656272888184, + "sft_loss": 4.612636566162109, + "step": 4845 + }, + { + "epoch": 2.595751797959525, + "grad_norm": 22.393659481421555, + "learning_rate": 5.4150804383008675e-08, + "logits/chosen": -0.2519914507865906, + "logits/rejected": -0.1284635365009308, + "logps/chosen": -4.571159839630127, + "logps/rejected": -5.952051162719727, + "loss": 0.4556, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.571159839630127, + "rewards/margins": 1.3808910846710205, + "rewards/rejected": -5.952051162719727, + "sft_loss": 4.579138278961182, + "step": 4850 + }, + { + "epoch": 2.5984278307409268, + "grad_norm": 23.395650557521304, + "learning_rate": 5.344804106359002e-08, + "logits/chosen": -0.1402450054883957, + "logits/rejected": -0.042079776525497437, + "logps/chosen": -4.393143177032471, + "logps/rejected": -5.735674858093262, + "loss": 0.4525, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.393143177032471, + "rewards/margins": 1.3425315618515015, + "rewards/rejected": -5.735674858093262, + "sft_loss": 4.468847751617432, + "step": 4855 + }, + { + "epoch": 2.601103863522328, + "grad_norm": 21.597640854860202, + "learning_rate": 5.274961019323559e-08, + "logits/chosen": -0.23359227180480957, + "logits/rejected": -0.16883578896522522, + "logps/chosen": -4.371655464172363, + "logps/rejected": -5.632245063781738, + "loss": 0.4662, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.371655464172363, + "rewards/margins": 1.2605891227722168, + "rewards/rejected": -5.632245063781738, + "sft_loss": 4.476510047912598, + "step": 4860 + }, + { + "epoch": 2.6037798963037297, + "grad_norm": 14.828011791421977, + "learning_rate": 5.205551854812451e-08, + "logits/chosen": -0.231710746884346, + "logits/rejected": -0.15900401771068573, + "logps/chosen": -4.545974254608154, + "logps/rejected": -5.904170989990234, + "loss": 0.4425, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.545974254608154, + "rewards/margins": 1.3581969738006592, + "rewards/rejected": -5.904170989990234, + "sft_loss": 4.569455146789551, + "step": 4865 + }, + { + "epoch": 2.606455929085131, + "grad_norm": 17.522975343074165, + "learning_rate": 5.1365772862337177e-08, + "logits/chosen": -0.17051482200622559, + "logits/rejected": -0.05159657076001167, + "logps/chosen": -4.31076717376709, + "logps/rejected": -5.912814140319824, + "loss": 0.3718, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.31076717376709, + "rewards/margins": 1.6020472049713135, + "rewards/rejected": -5.912814140319824, + "sft_loss": 4.290585517883301, + "step": 4870 + }, + { + "epoch": 2.6091319618665327, + "grad_norm": 23.416699694984377, + "learning_rate": 5.068037982778905e-08, + "logits/chosen": -0.15978005528450012, + "logits/rejected": -0.07179448008537292, + "logps/chosen": -4.384526252746582, + "logps/rejected": -5.747738838195801, + "loss": 0.4704, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.384526252746582, + "rewards/margins": 1.3632128238677979, + "rewards/rejected": -5.747738838195801, + "sft_loss": 4.449900150299072, + "step": 4875 + }, + { + "epoch": 2.6118079946479344, + "grad_norm": 13.224366039507197, + "learning_rate": 4.999934609416656e-08, + "logits/chosen": -0.1055552214384079, + "logits/rejected": -0.021091172471642494, + "logps/chosen": -4.444762229919434, + "logps/rejected": -5.985116958618164, + "loss": 0.4185, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.444762229919434, + "rewards/margins": 1.5403542518615723, + "rewards/rejected": -5.985116958618164, + "sft_loss": 4.531604290008545, + "step": 4880 + }, + { + "epoch": 2.614484027429336, + "grad_norm": 18.263200480543908, + "learning_rate": 4.932267826886183e-08, + "logits/chosen": -0.14596322178840637, + "logits/rejected": -0.07446818053722382, + "logps/chosen": -4.471606254577637, + "logps/rejected": -5.95240592956543, + "loss": 0.4304, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.471606254577637, + "rewards/margins": 1.4807993173599243, + "rewards/rejected": -5.95240592956543, + "sft_loss": 4.543080806732178, + "step": 4885 + }, + { + "epoch": 2.6171600602107374, + "grad_norm": 24.880001903936733, + "learning_rate": 4.8650382916909206e-08, + "logits/chosen": -0.24391219019889832, + "logits/rejected": -0.12067997455596924, + "logps/chosen": -4.523651599884033, + "logps/rejected": -5.8638715744018555, + "loss": 0.4841, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.523651599884033, + "rewards/margins": 1.3402198553085327, + "rewards/rejected": -5.8638715744018555, + "sft_loss": 4.605062007904053, + "step": 4890 + }, + { + "epoch": 2.619836092992139, + "grad_norm": 16.600718727582695, + "learning_rate": 4.7982466560920976e-08, + "logits/chosen": -0.17819949984550476, + "logits/rejected": -0.09157485514879227, + "logps/chosen": -4.5048604011535645, + "logps/rejected": -5.635102272033691, + "loss": 0.4862, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.5048604011535645, + "rewards/margins": 1.130241870880127, + "rewards/rejected": -5.635102272033691, + "sft_loss": 4.5297441482543945, + "step": 4895 + }, + { + "epoch": 2.622512125773541, + "grad_norm": 20.472988122442064, + "learning_rate": 4.7318935681024685e-08, + "logits/chosen": -0.1733657419681549, + "logits/rejected": -0.02662757597863674, + "logps/chosen": -4.53520393371582, + "logps/rejected": -5.874253273010254, + "loss": 0.4225, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.53520393371582, + "rewards/margins": 1.3390496969223022, + "rewards/rejected": -5.874253273010254, + "sft_loss": 4.551875591278076, + "step": 4900 + }, + { + "epoch": 2.625188158554942, + "grad_norm": 18.341533084524283, + "learning_rate": 4.6659796714799745e-08, + "logits/chosen": -0.19445575773715973, + "logits/rejected": -0.06603357195854187, + "logps/chosen": -4.4857587814331055, + "logps/rejected": -5.981295108795166, + "loss": 0.4073, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.4857587814331055, + "rewards/margins": 1.4955353736877441, + "rewards/rejected": -5.981295108795166, + "sft_loss": 4.578952789306641, + "step": 4905 + }, + { + "epoch": 2.627864191336344, + "grad_norm": 19.871427369730235, + "learning_rate": 4.60050560572155e-08, + "logits/chosen": -0.18480312824249268, + "logits/rejected": -0.17410850524902344, + "logps/chosen": -4.46319580078125, + "logps/rejected": -5.887537956237793, + "loss": 0.4518, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.46319580078125, + "rewards/margins": 1.4243428707122803, + "rewards/rejected": -5.887537956237793, + "sft_loss": 4.509976387023926, + "step": 4910 + }, + { + "epoch": 2.6305402241177456, + "grad_norm": 24.37262155582398, + "learning_rate": 4.535472006056834e-08, + "logits/chosen": -0.1744360476732254, + "logits/rejected": -0.06569372117519379, + "logps/chosen": -4.4764838218688965, + "logps/rejected": -5.778898239135742, + "loss": 0.4683, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.4764838218688965, + "rewards/margins": 1.3024141788482666, + "rewards/rejected": -5.778898239135742, + "sft_loss": 4.4765305519104, + "step": 4915 + }, + { + "epoch": 2.6332162568991473, + "grad_norm": 24.944943946905017, + "learning_rate": 4.470879503442132e-08, + "logits/chosen": -0.19176459312438965, + "logits/rejected": -0.10058772563934326, + "logps/chosen": -4.545384883880615, + "logps/rejected": -5.858805179595947, + "loss": 0.4509, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.545384883880615, + "rewards/margins": 1.313420057296753, + "rewards/rejected": -5.858805179595947, + "sft_loss": 4.5715155601501465, + "step": 4920 + }, + { + "epoch": 2.6358922896805486, + "grad_norm": 21.66738204995071, + "learning_rate": 4.406728724554154e-08, + "logits/chosen": -0.29400959610939026, + "logits/rejected": -0.11239423602819443, + "logps/chosen": -4.527563571929932, + "logps/rejected": -5.918554782867432, + "loss": 0.4511, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.527563571929932, + "rewards/margins": 1.3909912109375, + "rewards/rejected": -5.918554782867432, + "sft_loss": 4.582327365875244, + "step": 4925 + }, + { + "epoch": 2.6385683224619503, + "grad_norm": 18.703531975391247, + "learning_rate": 4.3430202917840664e-08, + "logits/chosen": -0.17439576983451843, + "logits/rejected": -0.0235174261033535, + "logps/chosen": -4.596892356872559, + "logps/rejected": -6.077221870422363, + "loss": 0.4409, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.596892356872559, + "rewards/margins": 1.4803297519683838, + "rewards/rejected": -6.077221870422363, + "sft_loss": 4.584397315979004, + "step": 4930 + }, + { + "epoch": 2.6412443552433515, + "grad_norm": 27.68319198176985, + "learning_rate": 4.279754823231346e-08, + "logits/chosen": -0.22327034175395966, + "logits/rejected": -0.07386596500873566, + "logps/chosen": -4.4573798179626465, + "logps/rejected": -5.789246559143066, + "loss": 0.4672, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.4573798179626465, + "rewards/margins": 1.3318663835525513, + "rewards/rejected": -5.789246559143066, + "sft_loss": 4.468945026397705, + "step": 4935 + }, + { + "epoch": 2.6439203880247533, + "grad_norm": 17.45229984530719, + "learning_rate": 4.216932932697859e-08, + "logits/chosen": -0.206891268491745, + "logits/rejected": -0.12008295953273773, + "logps/chosen": -4.3516435623168945, + "logps/rejected": -5.558261871337891, + "loss": 0.4572, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.3516435623168945, + "rewards/margins": 1.206618309020996, + "rewards/rejected": -5.558261871337891, + "sft_loss": 4.415630340576172, + "step": 4940 + }, + { + "epoch": 2.646596420806155, + "grad_norm": 20.112252743949814, + "learning_rate": 4.154555229681844e-08, + "logits/chosen": -0.17998364567756653, + "logits/rejected": -0.018372971564531326, + "logps/chosen": -4.503326416015625, + "logps/rejected": -5.923501968383789, + "loss": 0.4281, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.503326416015625, + "rewards/margins": 1.4201748371124268, + "rewards/rejected": -5.923501968383789, + "sft_loss": 4.460989952087402, + "step": 4945 + }, + { + "epoch": 2.6492724535875567, + "grad_norm": 20.59258516506547, + "learning_rate": 4.092622319372069e-08, + "logits/chosen": -0.13803830742835999, + "logits/rejected": -0.009364024735987186, + "logps/chosen": -4.485349178314209, + "logps/rejected": -5.7924652099609375, + "loss": 0.4846, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.485349178314209, + "rewards/margins": 1.307115912437439, + "rewards/rejected": -5.7924652099609375, + "sft_loss": 4.460837364196777, + "step": 4950 + }, + { + "epoch": 2.651948486368958, + "grad_norm": 20.11852869952089, + "learning_rate": 4.031134802641889e-08, + "logits/chosen": -0.19584974646568298, + "logits/rejected": -0.13580578565597534, + "logps/chosen": -4.634210586547852, + "logps/rejected": -5.871407985687256, + "loss": 0.4463, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.634210586547852, + "rewards/margins": 1.2371981143951416, + "rewards/rejected": -5.871407985687256, + "sft_loss": 4.665798664093018, + "step": 4955 + }, + { + "epoch": 2.6546245191503597, + "grad_norm": 17.373477823994854, + "learning_rate": 3.970093276043468e-08, + "logits/chosen": -0.1474190056324005, + "logits/rejected": -0.04052499681711197, + "logps/chosen": -4.419074058532715, + "logps/rejected": -5.859983444213867, + "loss": 0.4256, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.419074058532715, + "rewards/margins": 1.4409091472625732, + "rewards/rejected": -5.859983444213867, + "sft_loss": 4.4970293045043945, + "step": 4960 + }, + { + "epoch": 2.657300551931761, + "grad_norm": 29.72961340708789, + "learning_rate": 3.9094983318019584e-08, + "logits/chosen": -0.2138177454471588, + "logits/rejected": -0.10813770443201065, + "logps/chosen": -4.4357476234436035, + "logps/rejected": -5.829280376434326, + "loss": 0.4271, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.4357476234436035, + "rewards/margins": 1.3935325145721436, + "rewards/rejected": -5.829280376434326, + "sft_loss": 4.5532660484313965, + "step": 4965 + }, + { + "epoch": 2.6599765847131627, + "grad_norm": 18.7127890117876, + "learning_rate": 3.849350557809789e-08, + "logits/chosen": -0.13064627349376678, + "logits/rejected": -0.059014834463596344, + "logps/chosen": -4.3094706535339355, + "logps/rejected": -5.6937994956970215, + "loss": 0.4111, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.3094706535339355, + "rewards/margins": 1.3843294382095337, + "rewards/rejected": -5.6937994956970215, + "sft_loss": 4.256752014160156, + "step": 4970 + }, + { + "epoch": 2.6626526174945644, + "grad_norm": 22.453793570838283, + "learning_rate": 3.789650537620903e-08, + "logits/chosen": -0.16990868747234344, + "logits/rejected": -0.12744008004665375, + "logps/chosen": -4.531280040740967, + "logps/rejected": -5.8299784660339355, + "loss": 0.4399, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.531280040740967, + "rewards/margins": 1.2986981868743896, + "rewards/rejected": -5.8299784660339355, + "sft_loss": 4.526133060455322, + "step": 4975 + }, + { + "epoch": 2.665328650275966, + "grad_norm": 20.880815590545996, + "learning_rate": 3.730398850445182e-08, + "logits/chosen": -0.08516103774309158, + "logits/rejected": -0.051754094660282135, + "logps/chosen": -4.615936756134033, + "logps/rejected": -5.832221031188965, + "loss": 0.488, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.615936756134033, + "rewards/margins": 1.216284155845642, + "rewards/rejected": -5.832221031188965, + "sft_loss": 4.542150020599365, + "step": 4980 + }, + { + "epoch": 2.6680046830573674, + "grad_norm": 21.3886728116191, + "learning_rate": 3.671596071142735e-08, + "logits/chosen": -0.15201832354068756, + "logits/rejected": -0.011524543166160583, + "logps/chosen": -4.438300132751465, + "logps/rejected": -5.841962814331055, + "loss": 0.475, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.438300132751465, + "rewards/margins": 1.4036626815795898, + "rewards/rejected": -5.841962814331055, + "sft_loss": 4.412807464599609, + "step": 4985 + }, + { + "epoch": 2.670680715838769, + "grad_norm": 20.236923101464793, + "learning_rate": 3.6132427702183996e-08, + "logits/chosen": -0.22584716975688934, + "logits/rejected": -0.0964241698384285, + "logps/chosen": -4.3834710121154785, + "logps/rejected": -5.847751617431641, + "loss": 0.3923, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.3834710121154785, + "rewards/margins": 1.464280605316162, + "rewards/rejected": -5.847751617431641, + "sft_loss": 4.467441082000732, + "step": 4990 + }, + { + "epoch": 2.6733567486201704, + "grad_norm": 19.207220133232692, + "learning_rate": 3.555339513816147e-08, + "logits/chosen": -0.16925375163555145, + "logits/rejected": -0.14062543213367462, + "logps/chosen": -4.560122489929199, + "logps/rejected": -5.656733512878418, + "loss": 0.5064, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.560122489929199, + "rewards/margins": 1.0966103076934814, + "rewards/rejected": -5.656733512878418, + "sft_loss": 4.595259189605713, + "step": 4995 + }, + { + "epoch": 2.676032781401572, + "grad_norm": 20.535443637891856, + "learning_rate": 3.497886863713639e-08, + "logits/chosen": -0.18426719307899475, + "logits/rejected": -0.13645689189434052, + "logps/chosen": -4.579115867614746, + "logps/rejected": -5.923556327819824, + "loss": 0.4787, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.579115867614746, + "rewards/margins": 1.34443998336792, + "rewards/rejected": -5.923556327819824, + "sft_loss": 4.646383762359619, + "step": 5000 + }, + { + "epoch": 2.678708814182974, + "grad_norm": 28.422069720008892, + "learning_rate": 3.440885377316721e-08, + "logits/chosen": -0.13655778765678406, + "logits/rejected": -0.09209270775318146, + "logps/chosen": -4.541821479797363, + "logps/rejected": -5.684293270111084, + "loss": 0.4885, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.541821479797363, + "rewards/margins": 1.142472505569458, + "rewards/rejected": -5.684293270111084, + "sft_loss": 4.564027786254883, + "step": 5005 + }, + { + "epoch": 2.6813848469643755, + "grad_norm": 22.58653703001671, + "learning_rate": 3.384335607654082e-08, + "logits/chosen": -0.1473383754491806, + "logits/rejected": -0.06587468087673187, + "logps/chosen": -4.53951358795166, + "logps/rejected": -5.852242469787598, + "loss": 0.4315, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.53951358795166, + "rewards/margins": 1.3127288818359375, + "rewards/rejected": -5.852242469787598, + "sft_loss": 4.513645172119141, + "step": 5010 + }, + { + "epoch": 2.684060879745777, + "grad_norm": 20.623675139503522, + "learning_rate": 3.328238103371811e-08, + "logits/chosen": -0.1609375923871994, + "logits/rejected": -0.09451863914728165, + "logps/chosen": -4.543221950531006, + "logps/rejected": -5.874283313751221, + "loss": 0.446, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.543221950531006, + "rewards/margins": 1.3310611248016357, + "rewards/rejected": -5.874283313751221, + "sft_loss": 4.466743469238281, + "step": 5015 + }, + { + "epoch": 2.6867369125271785, + "grad_norm": 27.424395934018733, + "learning_rate": 3.272593408728169e-08, + "logits/chosen": -0.1978655755519867, + "logits/rejected": -0.022298278287053108, + "logps/chosen": -4.504530429840088, + "logps/rejected": -5.7812910079956055, + "loss": 0.4722, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.504530429840088, + "rewards/margins": 1.2767612934112549, + "rewards/rejected": -5.7812910079956055, + "sft_loss": 4.603614330291748, + "step": 5020 + }, + { + "epoch": 2.6894129453085798, + "grad_norm": 18.544009839500788, + "learning_rate": 3.217402063588204e-08, + "logits/chosen": -0.21366408467292786, + "logits/rejected": -0.09033291786909103, + "logps/chosen": -4.505580902099609, + "logps/rejected": -5.765819072723389, + "loss": 0.4661, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.505580902099609, + "rewards/margins": 1.2602382898330688, + "rewards/rejected": -5.765819072723389, + "sft_loss": 4.506991386413574, + "step": 5025 + }, + { + "epoch": 2.6920889780899815, + "grad_norm": 17.208970877155696, + "learning_rate": 3.162664603418608e-08, + "logits/chosen": -0.17819947004318237, + "logits/rejected": -0.10755584388971329, + "logps/chosen": -4.437621593475342, + "logps/rejected": -5.8965911865234375, + "loss": 0.42, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.437621593475342, + "rewards/margins": 1.4589693546295166, + "rewards/rejected": -5.8965911865234375, + "sft_loss": 4.461600303649902, + "step": 5030 + }, + { + "epoch": 2.694765010871383, + "grad_norm": 29.64941568431905, + "learning_rate": 3.1083815592824416e-08, + "logits/chosen": -0.20304307341575623, + "logits/rejected": -0.09568870812654495, + "logps/chosen": -4.634688854217529, + "logps/rejected": -5.94057559967041, + "loss": 0.4707, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.634688854217529, + "rewards/margins": 1.3058862686157227, + "rewards/rejected": -5.94057559967041, + "sft_loss": 4.67722225189209, + "step": 5035 + }, + { + "epoch": 2.697441043652785, + "grad_norm": 21.137086003538187, + "learning_rate": 3.054553457834053e-08, + "logits/chosen": -0.04583621770143509, + "logits/rejected": -0.04740852862596512, + "logps/chosen": -4.654051780700684, + "logps/rejected": -5.890873908996582, + "loss": 0.4632, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.654051780700684, + "rewards/margins": 1.2368220090866089, + "rewards/rejected": -5.890873908996582, + "sft_loss": 4.616780757904053, + "step": 5040 + }, + { + "epoch": 2.700117076434186, + "grad_norm": 23.607549620355137, + "learning_rate": 3.0011808213139036e-08, + "logits/chosen": -0.12733376026153564, + "logits/rejected": -0.1056627482175827, + "logps/chosen": -4.537964820861816, + "logps/rejected": -5.770925998687744, + "loss": 0.4473, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.537964820861816, + "rewards/margins": 1.232961654663086, + "rewards/rejected": -5.770925998687744, + "sft_loss": 4.539140224456787, + "step": 5045 + }, + { + "epoch": 2.702793109215588, + "grad_norm": 20.197067487892724, + "learning_rate": 2.948264167543568e-08, + "logits/chosen": -0.19815947115421295, + "logits/rejected": -0.1313100904226303, + "logps/chosen": -4.479102611541748, + "logps/rejected": -5.704444885253906, + "loss": 0.4442, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.479102611541748, + "rewards/margins": 1.225342035293579, + "rewards/rejected": -5.704444885253906, + "sft_loss": 4.477992534637451, + "step": 5050 + }, + { + "epoch": 2.7054691419969896, + "grad_norm": 20.31153163158568, + "learning_rate": 2.8958040099206216e-08, + "logits/chosen": -0.2582859992980957, + "logits/rejected": -0.16854605078697205, + "logps/chosen": -4.308932304382324, + "logps/rejected": -5.7119317054748535, + "loss": 0.4183, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.308932304382324, + "rewards/margins": 1.4029991626739502, + "rewards/rejected": -5.7119317054748535, + "sft_loss": 4.360212802886963, + "step": 5055 + }, + { + "epoch": 2.708145174778391, + "grad_norm": 27.64314575330156, + "learning_rate": 2.843800857413775e-08, + "logits/chosen": -0.17026914656162262, + "logits/rejected": -0.1082666888833046, + "logps/chosen": -4.433135986328125, + "logps/rejected": -5.6108222007751465, + "loss": 0.508, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.433135986328125, + "rewards/margins": 1.1776866912841797, + "rewards/rejected": -5.6108222007751465, + "sft_loss": 4.465333461761475, + "step": 5060 + }, + { + "epoch": 2.7108212075597926, + "grad_norm": 28.104949777755163, + "learning_rate": 2.7922552145578203e-08, + "logits/chosen": -0.19289958477020264, + "logits/rejected": -0.027002420276403427, + "logps/chosen": -4.320894241333008, + "logps/rejected": -5.665889263153076, + "loss": 0.4547, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.320894241333008, + "rewards/margins": 1.3449946641921997, + "rewards/rejected": -5.665889263153076, + "sft_loss": 4.365025520324707, + "step": 5065 + }, + { + "epoch": 2.7134972403411943, + "grad_norm": 24.584489114151147, + "learning_rate": 2.7411675814488277e-08, + "logits/chosen": -0.11105088144540787, + "logits/rejected": -0.0008586436742916703, + "logps/chosen": -4.451926231384277, + "logps/rejected": -5.640947341918945, + "loss": 0.4576, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.451926231384277, + "rewards/margins": 1.1890199184417725, + "rewards/rejected": -5.640947341918945, + "sft_loss": 4.59604549407959, + "step": 5070 + }, + { + "epoch": 2.7161732731225956, + "grad_norm": 24.071061668209328, + "learning_rate": 2.690538453739216e-08, + "logits/chosen": -0.14649274945259094, + "logits/rejected": -0.08841396123170853, + "logps/chosen": -4.365932941436768, + "logps/rejected": -5.378201961517334, + "loss": 0.5444, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.365932941436768, + "rewards/margins": 1.0122692584991455, + "rewards/rejected": -5.378201961517334, + "sft_loss": 4.456226348876953, + "step": 5075 + }, + { + "epoch": 2.7188493059039973, + "grad_norm": 20.194893373629437, + "learning_rate": 2.6403683226330298e-08, + "logits/chosen": -0.23665407299995422, + "logits/rejected": -0.11111712455749512, + "logps/chosen": -4.488452434539795, + "logps/rejected": -5.783753871917725, + "loss": 0.48, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.488452434539795, + "rewards/margins": 1.2953013181686401, + "rewards/rejected": -5.783753871917725, + "sft_loss": 4.546128273010254, + "step": 5080 + }, + { + "epoch": 2.721525338685399, + "grad_norm": 29.905678180384932, + "learning_rate": 2.5906576748810804e-08, + "logits/chosen": -0.20285113155841827, + "logits/rejected": -0.11245715618133545, + "logps/chosen": -4.482570171356201, + "logps/rejected": -6.00087833404541, + "loss": 0.3989, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.482570171356201, + "rewards/margins": 1.5183079242706299, + "rewards/rejected": -6.00087833404541, + "sft_loss": 4.5170183181762695, + "step": 5085 + }, + { + "epoch": 2.7242013714668003, + "grad_norm": 25.34663503129227, + "learning_rate": 2.5414069927763016e-08, + "logits/chosen": -0.23477861285209656, + "logits/rejected": -0.09957059472799301, + "logps/chosen": -4.484523296356201, + "logps/rejected": -5.851978778839111, + "loss": 0.4357, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.484523296356201, + "rewards/margins": 1.3674547672271729, + "rewards/rejected": -5.851978778839111, + "sft_loss": 4.524469375610352, + "step": 5090 + }, + { + "epoch": 2.726877404248202, + "grad_norm": 20.649410979346033, + "learning_rate": 2.4926167541490185e-08, + "logits/chosen": -0.2946515679359436, + "logits/rejected": -0.1434052437543869, + "logps/chosen": -4.407382965087891, + "logps/rejected": -5.828757286071777, + "loss": 0.4491, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.407382965087891, + "rewards/margins": 1.421373724937439, + "rewards/rejected": -5.828757286071777, + "sft_loss": 4.452073574066162, + "step": 5095 + }, + { + "epoch": 2.7295534370296037, + "grad_norm": 14.846615490916948, + "learning_rate": 2.4442874323623574e-08, + "logits/chosen": -0.12604109942913055, + "logits/rejected": -0.03208146244287491, + "logps/chosen": -4.457130432128906, + "logps/rejected": -5.801394462585449, + "loss": 0.4663, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.457130432128906, + "rewards/margins": 1.3442646265029907, + "rewards/rejected": -5.801394462585449, + "sft_loss": 4.512373447418213, + "step": 5100 + }, + { + "epoch": 2.7322294698110055, + "grad_norm": 23.775898636913556, + "learning_rate": 2.396419496307589e-08, + "logits/chosen": -0.17571856081485748, + "logits/rejected": -0.03184592351317406, + "logps/chosen": -4.659298419952393, + "logps/rejected": -5.972817420959473, + "loss": 0.4619, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.659298419952393, + "rewards/margins": 1.3135192394256592, + "rewards/rejected": -5.972817420959473, + "sft_loss": 4.618160724639893, + "step": 5105 + }, + { + "epoch": 2.7349055025924067, + "grad_norm": 21.048811512128385, + "learning_rate": 2.349013410399653e-08, + "logits/chosen": -0.2019394189119339, + "logits/rejected": -0.10773040354251862, + "logps/chosen": -4.420794486999512, + "logps/rejected": -5.719508171081543, + "loss": 0.4866, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.420794486999512, + "rewards/margins": 1.298714518547058, + "rewards/rejected": -5.719508171081543, + "sft_loss": 4.430346965789795, + "step": 5110 + }, + { + "epoch": 2.7375815353738084, + "grad_norm": 20.902772933512118, + "learning_rate": 2.3020696345725954e-08, + "logits/chosen": -0.2315623015165329, + "logits/rejected": -0.08020760864019394, + "logps/chosen": -4.5212626457214355, + "logps/rejected": -5.924075603485107, + "loss": 0.4086, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.5212626457214355, + "rewards/margins": 1.402813196182251, + "rewards/rejected": -5.924075603485107, + "sft_loss": 4.498833656311035, + "step": 5115 + }, + { + "epoch": 2.7402575681552097, + "grad_norm": 22.033682126642848, + "learning_rate": 2.2555886242751398e-08, + "logits/chosen": -0.18592925369739532, + "logits/rejected": -0.1098877415060997, + "logps/chosen": -4.457711219787598, + "logps/rejected": -5.814774513244629, + "loss": 0.4181, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.457711219787598, + "rewards/margins": 1.3570640087127686, + "rewards/rejected": -5.814774513244629, + "sft_loss": 4.481400966644287, + "step": 5120 + }, + { + "epoch": 2.7429336009366114, + "grad_norm": 26.65306589485683, + "learning_rate": 2.2095708304662453e-08, + "logits/chosen": -0.2802940607070923, + "logits/rejected": -0.09975512325763702, + "logps/chosen": -4.371427536010742, + "logps/rejected": -5.729374885559082, + "loss": 0.4328, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.371427536010742, + "rewards/margins": 1.3579468727111816, + "rewards/rejected": -5.729374885559082, + "sft_loss": 4.468634605407715, + "step": 5125 + }, + { + "epoch": 2.745609633718013, + "grad_norm": 22.924473762441067, + "learning_rate": 2.16401669961076e-08, + "logits/chosen": -0.25745660066604614, + "logits/rejected": -0.11118185520172119, + "logps/chosen": -4.527166366577148, + "logps/rejected": -5.842087268829346, + "loss": 0.4577, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.527166366577148, + "rewards/margins": 1.3149210214614868, + "rewards/rejected": -5.842087268829346, + "sft_loss": 4.613772392272949, + "step": 5130 + }, + { + "epoch": 2.748285666499415, + "grad_norm": 28.080862924064185, + "learning_rate": 2.1189266736750532e-08, + "logits/chosen": -0.10482903569936752, + "logits/rejected": -0.03856141120195389, + "logps/chosen": -4.515239715576172, + "logps/rejected": -5.718465805053711, + "loss": 0.4708, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.515239715576172, + "rewards/margins": 1.2032257318496704, + "rewards/rejected": -5.718465805053711, + "sft_loss": 4.546355247497559, + "step": 5135 + }, + { + "epoch": 2.750961699280816, + "grad_norm": 20.91587610599803, + "learning_rate": 2.0743011901227623e-08, + "logits/chosen": -0.13982251286506653, + "logits/rejected": -0.006131963804364204, + "logps/chosen": -4.486544609069824, + "logps/rejected": -5.8271355628967285, + "loss": 0.4315, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.486544609069824, + "rewards/margins": 1.3405910730361938, + "rewards/rejected": -5.8271355628967285, + "sft_loss": 4.430808067321777, + "step": 5140 + }, + { + "epoch": 2.753637732062218, + "grad_norm": 29.84654159466731, + "learning_rate": 2.030140681910508e-08, + "logits/chosen": -0.15242649614810944, + "logits/rejected": -0.048605434596538544, + "logps/chosen": -4.512834072113037, + "logps/rejected": -5.716804504394531, + "loss": 0.495, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.512834072113037, + "rewards/margins": 1.2039706707000732, + "rewards/rejected": -5.716804504394531, + "sft_loss": 4.546947479248047, + "step": 5145 + }, + { + "epoch": 2.756313764843619, + "grad_norm": 20.74155029479811, + "learning_rate": 1.986445577483753e-08, + "logits/chosen": -0.20727205276489258, + "logits/rejected": -0.11031541973352432, + "logps/chosen": -4.420102596282959, + "logps/rejected": -5.79683256149292, + "loss": 0.455, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.420102596282959, + "rewards/margins": 1.37673020362854, + "rewards/rejected": -5.79683256149292, + "sft_loss": 4.442891597747803, + "step": 5150 + }, + { + "epoch": 2.758989797625021, + "grad_norm": 17.627201565134865, + "learning_rate": 1.9432163007725765e-08, + "logits/chosen": -0.23875252902507782, + "logits/rejected": -0.16499489545822144, + "logps/chosen": -4.42592191696167, + "logps/rejected": -5.7121901512146, + "loss": 0.4581, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.42592191696167, + "rewards/margins": 1.2862684726715088, + "rewards/rejected": -5.7121901512146, + "sft_loss": 4.519508361816406, + "step": 5155 + }, + { + "epoch": 2.7616658304064226, + "grad_norm": 16.664853153492285, + "learning_rate": 1.9004532711876297e-08, + "logits/chosen": -0.20979630947113037, + "logits/rejected": -0.15912654995918274, + "logps/chosen": -4.3233184814453125, + "logps/rejected": -5.714047431945801, + "loss": 0.4377, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.3233184814453125, + "rewards/margins": 1.3907288312911987, + "rewards/rejected": -5.714047431945801, + "sft_loss": 4.3558807373046875, + "step": 5160 + }, + { + "epoch": 2.7643418631878243, + "grad_norm": 23.880807330166387, + "learning_rate": 1.8581569036159928e-08, + "logits/chosen": -0.20513634383678436, + "logits/rejected": -0.0711812898516655, + "logps/chosen": -4.392642021179199, + "logps/rejected": -5.718276500701904, + "loss": 0.4452, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.392642021179199, + "rewards/margins": 1.3256343603134155, + "rewards/rejected": -5.718276500701904, + "sft_loss": 4.4098711013793945, + "step": 5165 + }, + { + "epoch": 2.7670178959692255, + "grad_norm": 19.38565844548832, + "learning_rate": 1.8163276084172285e-08, + "logits/chosen": -0.1626313477754593, + "logits/rejected": -0.0420430526137352, + "logps/chosen": -4.573996067047119, + "logps/rejected": -5.955582141876221, + "loss": 0.4305, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.573996067047119, + "rewards/margins": 1.3815863132476807, + "rewards/rejected": -5.955582141876221, + "sft_loss": 4.616714000701904, + "step": 5170 + }, + { + "epoch": 2.7696939287506273, + "grad_norm": 19.94973916398742, + "learning_rate": 1.7749657914193194e-08, + "logits/chosen": -0.19201573729515076, + "logits/rejected": -0.09315890818834305, + "logps/chosen": -4.613008975982666, + "logps/rejected": -6.132317543029785, + "loss": 0.3862, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.613008975982666, + "rewards/margins": 1.5193077325820923, + "rewards/rejected": -6.132317543029785, + "sft_loss": 4.541656017303467, + "step": 5175 + }, + { + "epoch": 2.7723699615320285, + "grad_norm": 28.348181428348507, + "learning_rate": 1.7340718539148203e-08, + "logits/chosen": -0.15497739613056183, + "logits/rejected": -0.12709686160087585, + "logps/chosen": -4.606642723083496, + "logps/rejected": -5.7925705909729, + "loss": 0.4855, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.606642723083496, + "rewards/margins": 1.1859276294708252, + "rewards/rejected": -5.7925705909729, + "sft_loss": 4.743796348571777, + "step": 5180 + }, + { + "epoch": 2.7750459943134302, + "grad_norm": 19.21117198685213, + "learning_rate": 1.6936461926568724e-08, + "logits/chosen": -0.16036248207092285, + "logits/rejected": -0.06798889487981796, + "logps/chosen": -4.460104942321777, + "logps/rejected": -5.908174991607666, + "loss": 0.4599, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.460104942321777, + "rewards/margins": 1.4480699300765991, + "rewards/rejected": -5.908174991607666, + "sft_loss": 4.530155181884766, + "step": 5185 + }, + { + "epoch": 2.777722027094832, + "grad_norm": 18.868779898400874, + "learning_rate": 1.6536891998554346e-08, + "logits/chosen": -0.22829198837280273, + "logits/rejected": -0.08963672816753387, + "logps/chosen": -4.394246578216553, + "logps/rejected": -5.759983062744141, + "loss": 0.4229, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.394246578216553, + "rewards/margins": 1.365736722946167, + "rewards/rejected": -5.759983062744141, + "sft_loss": 4.44966983795166, + "step": 5190 + }, + { + "epoch": 2.7803980598762337, + "grad_norm": 23.987850542329372, + "learning_rate": 1.6142012631734093e-08, + "logits/chosen": -0.118210569024086, + "logits/rejected": 0.015512818470597267, + "logps/chosen": -4.505238056182861, + "logps/rejected": -5.804290771484375, + "loss": 0.4518, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.505238056182861, + "rewards/margins": 1.299053430557251, + "rewards/rejected": -5.804290771484375, + "sft_loss": 4.5167741775512695, + "step": 5195 + }, + { + "epoch": 2.783074092657635, + "grad_norm": 28.404369786995133, + "learning_rate": 1.575182765722949e-08, + "logits/chosen": -0.23681633174419403, + "logits/rejected": -0.10690312087535858, + "logps/chosen": -4.501443862915039, + "logps/rejected": -5.874948978424072, + "loss": 0.4404, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.501443862915039, + "rewards/margins": 1.3735042810440063, + "rewards/rejected": -5.874948978424072, + "sft_loss": 4.492587089538574, + "step": 5200 + }, + { + "epoch": 2.783074092657635, + "eval_logits/chosen": 0.004443091340363026, + "eval_logits/rejected": 0.08074604719877243, + "eval_logps/chosen": -4.662295341491699, + "eval_logps/rejected": -5.713897705078125, + "eval_loss": 0.5804030895233154, + "eval_rewards/accuracies": 0.7299703359603882, + "eval_rewards/chosen": -4.662295341491699, + "eval_rewards/margins": 1.0516023635864258, + "eval_rewards/rejected": -5.713897705078125, + "eval_runtime": 43.3695, + "eval_samples_per_second": 31.013, + "eval_sft_loss": 4.627921104431152, + "eval_steps_per_second": 7.77, + "step": 5200 + }, + { + "epoch": 2.7857501254390367, + "grad_norm": 14.254841170461553, + "learning_rate": 1.536634086061672e-08, + "logits/chosen": -0.1561775803565979, + "logits/rejected": -0.09926787763834, + "logps/chosen": -4.417360782623291, + "logps/rejected": -5.744927406311035, + "loss": 0.4445, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.417360782623291, + "rewards/margins": 1.327566385269165, + "rewards/rejected": -5.744927406311035, + "sft_loss": 4.419335842132568, + "step": 5205 + }, + { + "epoch": 2.788426158220438, + "grad_norm": 23.786728971169076, + "learning_rate": 1.4985555981890495e-08, + "logits/chosen": -0.16632170975208282, + "logits/rejected": -0.0739097073674202, + "logps/chosen": -4.5204010009765625, + "logps/rejected": -5.843813896179199, + "loss": 0.4654, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.5204010009765625, + "rewards/margins": 1.3234120607376099, + "rewards/rejected": -5.843813896179199, + "sft_loss": 4.5244293212890625, + "step": 5210 + }, + { + "epoch": 2.7911021910018396, + "grad_norm": 19.868166471320215, + "learning_rate": 1.4609476715427226e-08, + "logits/chosen": -0.15556737780570984, + "logits/rejected": -0.08350914716720581, + "logps/chosen": -4.369675636291504, + "logps/rejected": -5.737880229949951, + "loss": 0.4225, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.369675636291504, + "rewards/margins": 1.3682044744491577, + "rewards/rejected": -5.737880229949951, + "sft_loss": 4.400622844696045, + "step": 5215 + }, + { + "epoch": 2.7937782237832414, + "grad_norm": 21.003889345878, + "learning_rate": 1.4238106709949792e-08, + "logits/chosen": -0.22633075714111328, + "logits/rejected": -0.13380743563175201, + "logps/chosen": -4.498518943786621, + "logps/rejected": -6.063634395599365, + "loss": 0.3886, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.498518943786621, + "rewards/margins": 1.5651153326034546, + "rewards/rejected": -6.063634395599365, + "sft_loss": 4.527626991271973, + "step": 5220 + }, + { + "epoch": 2.796454256564643, + "grad_norm": 30.031111476221355, + "learning_rate": 1.3871449568491511e-08, + "logits/chosen": -0.19734984636306763, + "logits/rejected": -0.06000862643122673, + "logps/chosen": -4.527247428894043, + "logps/rejected": -5.8383283615112305, + "loss": 0.4602, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.527247428894043, + "rewards/margins": 1.3110812902450562, + "rewards/rejected": -5.8383283615112305, + "sft_loss": 4.496323108673096, + "step": 5225 + }, + { + "epoch": 2.7991302893460444, + "grad_norm": 13.79924264199127, + "learning_rate": 1.3509508848361606e-08, + "logits/chosen": -0.25380197167396545, + "logits/rejected": -0.14649678766727448, + "logps/chosen": -4.464987277984619, + "logps/rejected": -5.718521595001221, + "loss": 0.4444, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.464987277984619, + "rewards/margins": 1.2535343170166016, + "rewards/rejected": -5.718521595001221, + "sft_loss": 4.407475471496582, + "step": 5230 + }, + { + "epoch": 2.801806322127446, + "grad_norm": 17.07340604138416, + "learning_rate": 1.3152288061110517e-08, + "logits/chosen": -0.23314595222473145, + "logits/rejected": -0.10958156734704971, + "logps/chosen": -4.368558883666992, + "logps/rejected": -5.746652603149414, + "loss": 0.4283, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.368558883666992, + "rewards/margins": 1.3780934810638428, + "rewards/rejected": -5.746652603149414, + "sft_loss": 4.360150337219238, + "step": 5235 + }, + { + "epoch": 2.804482354908848, + "grad_norm": 23.518858818671433, + "learning_rate": 1.2799790672495814e-08, + "logits/chosen": -0.20950329303741455, + "logits/rejected": -0.06466137617826462, + "logps/chosen": -4.563027381896973, + "logps/rejected": -5.900126934051514, + "loss": 0.4538, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.563027381896973, + "rewards/margins": 1.337099552154541, + "rewards/rejected": -5.900126934051514, + "sft_loss": 4.570873737335205, + "step": 5240 + }, + { + "epoch": 2.807158387690249, + "grad_norm": 24.999816021866938, + "learning_rate": 1.2452020102448835e-08, + "logits/chosen": -0.1656816601753235, + "logits/rejected": -0.11390833556652069, + "logps/chosen": -4.472662925720215, + "logps/rejected": -5.7310919761657715, + "loss": 0.4565, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.472662925720215, + "rewards/margins": 1.2584285736083984, + "rewards/rejected": -5.7310919761657715, + "sft_loss": 4.561809062957764, + "step": 5245 + }, + { + "epoch": 2.8098344204716508, + "grad_norm": 29.623488406148326, + "learning_rate": 1.2108979725041103e-08, + "logits/chosen": -0.24879387021064758, + "logits/rejected": -0.10537517070770264, + "logps/chosen": -4.528120994567871, + "logps/rejected": -5.904018402099609, + "loss": 0.4535, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.528120994567871, + "rewards/margins": 1.3758974075317383, + "rewards/rejected": -5.904018402099609, + "sft_loss": 4.5284528732299805, + "step": 5250 + }, + { + "epoch": 2.8125104532530525, + "grad_norm": 24.91543026763308, + "learning_rate": 1.1770672868451958e-08, + "logits/chosen": -0.20283707976341248, + "logits/rejected": -0.04213991388678551, + "logps/chosen": -4.693485736846924, + "logps/rejected": -6.021272659301758, + "loss": 0.437, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.693485736846924, + "rewards/margins": 1.3277864456176758, + "rewards/rejected": -6.021272659301758, + "sft_loss": 4.669791221618652, + "step": 5255 + }, + { + "epoch": 2.8151864860344538, + "grad_norm": 27.77678531151836, + "learning_rate": 1.1437102814935872e-08, + "logits/chosen": -0.19460207223892212, + "logits/rejected": -0.14369544386863708, + "logps/chosen": -4.511590957641602, + "logps/rejected": -5.745457172393799, + "loss": 0.5007, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.511590957641602, + "rewards/margins": 1.233865737915039, + "rewards/rejected": -5.745457172393799, + "sft_loss": 4.625870704650879, + "step": 5260 + }, + { + "epoch": 2.8178625188158555, + "grad_norm": 16.198917525623145, + "learning_rate": 1.1108272800791018e-08, + "logits/chosen": -0.23445066809654236, + "logits/rejected": -0.07343915104866028, + "logps/chosen": -4.5098185539245605, + "logps/rejected": -5.809727668762207, + "loss": 0.4351, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.5098185539245605, + "rewards/margins": 1.2999083995819092, + "rewards/rejected": -5.809727668762207, + "sft_loss": 4.538376808166504, + "step": 5265 + }, + { + "epoch": 2.820538551597257, + "grad_norm": 19.862797168255526, + "learning_rate": 1.078418601632769e-08, + "logits/chosen": -0.12250743806362152, + "logits/rejected": -0.02016635611653328, + "logps/chosen": -4.527732849121094, + "logps/rejected": -5.898034572601318, + "loss": 0.4157, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.527732849121094, + "rewards/margins": 1.3703019618988037, + "rewards/rejected": -5.898034572601318, + "sft_loss": 4.592249870300293, + "step": 5270 + }, + { + "epoch": 2.8232145843786585, + "grad_norm": 17.504706416015686, + "learning_rate": 1.0464845605837159e-08, + "logits/chosen": -0.18533574044704437, + "logits/rejected": -0.06128733232617378, + "logps/chosen": -4.526907920837402, + "logps/rejected": -5.828028678894043, + "loss": 0.4167, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.526907920837402, + "rewards/margins": 1.3011205196380615, + "rewards/rejected": -5.828028678894043, + "sft_loss": 4.476199150085449, + "step": 5275 + }, + { + "epoch": 2.82589061716006, + "grad_norm": 17.491483364419327, + "learning_rate": 1.0150254667561642e-08, + "logits/chosen": -0.16981545090675354, + "logits/rejected": -0.03595195338129997, + "logps/chosen": -4.685810565948486, + "logps/rejected": -6.149535179138184, + "loss": 0.4297, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.685810565948486, + "rewards/margins": 1.463724136352539, + "rewards/rejected": -6.149535179138184, + "sft_loss": 4.639585494995117, + "step": 5280 + }, + { + "epoch": 2.828566649941462, + "grad_norm": 22.288334925361855, + "learning_rate": 9.840416253663719e-09, + "logits/chosen": -0.21739113330841064, + "logits/rejected": -0.10404038429260254, + "logps/chosen": -4.522845268249512, + "logps/rejected": -5.9513468742370605, + "loss": 0.4301, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.522845268249512, + "rewards/margins": 1.4285022020339966, + "rewards/rejected": -5.9513468742370605, + "sft_loss": 4.524170875549316, + "step": 5285 + }, + { + "epoch": 2.8312426827228636, + "grad_norm": 22.12734147118357, + "learning_rate": 9.535333370197074e-09, + "logits/chosen": -0.2011188268661499, + "logits/rejected": -0.08531059324741364, + "logps/chosen": -4.478837013244629, + "logps/rejected": -5.821646213531494, + "loss": 0.437, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.478837013244629, + "rewards/margins": 1.3428093194961548, + "rewards/rejected": -5.821646213531494, + "sft_loss": 4.5520429611206055, + "step": 5290 + }, + { + "epoch": 2.833918715504265, + "grad_norm": 17.345638013409147, + "learning_rate": 9.23500897707713e-09, + "logits/chosen": -0.2327616959810257, + "logits/rejected": -0.09521780163049698, + "logps/chosen": -4.699143409729004, + "logps/rejected": -6.117920398712158, + "loss": 0.441, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.699143409729004, + "rewards/margins": 1.4187771081924438, + "rewards/rejected": -6.117920398712158, + "sft_loss": 4.68923282623291, + "step": 5295 + }, + { + "epoch": 2.8365947482856666, + "grad_norm": 22.487036504680443, + "learning_rate": 8.939445988052574e-09, + "logits/chosen": -0.2045392543077469, + "logits/rejected": -0.12336097657680511, + "logps/chosen": -4.553119659423828, + "logps/rejected": -5.9328413009643555, + "loss": 0.4254, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.553119659423828, + "rewards/margins": 1.3797214031219482, + "rewards/rejected": -5.9328413009643555, + "sft_loss": 4.5086469650268555, + "step": 5300 + }, + { + "epoch": 2.839270781067068, + "grad_norm": 32.11819853883587, + "learning_rate": 8.648647270676656e-09, + "logits/chosen": -0.205168679356575, + "logits/rejected": -0.1323503851890564, + "logps/chosen": -4.535611152648926, + "logps/rejected": -5.771416187286377, + "loss": 0.4884, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.535611152648926, + "rewards/margins": 1.235804796218872, + "rewards/rejected": -5.771416187286377, + "sft_loss": 4.574914455413818, + "step": 5305 + }, + { + "epoch": 2.8419468138484696, + "grad_norm": 16.610425842713934, + "learning_rate": 8.362615646279991e-09, + "logits/chosen": -0.27479854226112366, + "logits/rejected": -0.12086167186498642, + "logps/chosen": -4.571263313293457, + "logps/rejected": -6.1058855056762695, + "loss": 0.4631, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.571263313293457, + "rewards/margins": 1.5346229076385498, + "rewards/rejected": -6.1058855056762695, + "sft_loss": 4.600201606750488, + "step": 5310 + }, + { + "epoch": 2.8446228466298713, + "grad_norm": 23.574681868463003, + "learning_rate": 8.081353889942466e-09, + "logits/chosen": -0.1529221087694168, + "logits/rejected": -0.018142305314540863, + "logps/chosen": -4.53646993637085, + "logps/rejected": -5.736028671264648, + "loss": 0.4657, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.53646993637085, + "rewards/margins": 1.1995582580566406, + "rewards/rejected": -5.736028671264648, + "sft_loss": 4.598328113555908, + "step": 5315 + }, + { + "epoch": 2.847298879411273, + "grad_norm": 20.776939138193498, + "learning_rate": 7.804864730467042e-09, + "logits/chosen": -0.14258281886577606, + "logits/rejected": -0.08337229490280151, + "logps/chosen": -4.521661281585693, + "logps/rejected": -5.844458103179932, + "loss": 0.4226, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.521661281585693, + "rewards/margins": 1.3227964639663696, + "rewards/rejected": -5.844458103179932, + "sft_loss": 4.469352722167969, + "step": 5320 + }, + { + "epoch": 2.8499749121926743, + "grad_norm": 17.319274589751448, + "learning_rate": 7.533150850352665e-09, + "logits/chosen": -0.19008512794971466, + "logits/rejected": -0.04209180548787117, + "logps/chosen": -4.5797271728515625, + "logps/rejected": -6.05983829498291, + "loss": 0.4024, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.5797271728515625, + "rewards/margins": 1.4801113605499268, + "rewards/rejected": -6.05983829498291, + "sft_loss": 4.614199638366699, + "step": 5325 + }, + { + "epoch": 2.852650944974076, + "grad_norm": 24.1489639271008, + "learning_rate": 7.2662148857686175e-09, + "logits/chosen": -0.14118270576000214, + "logits/rejected": -0.0662260577082634, + "logps/chosen": -4.535606384277344, + "logps/rejected": -5.975742340087891, + "loss": 0.4511, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.535606384277344, + "rewards/margins": 1.4401354789733887, + "rewards/rejected": -5.975742340087891, + "sft_loss": 4.580137252807617, + "step": 5330 + }, + { + "epoch": 2.8553269777554773, + "grad_norm": 23.231064740110014, + "learning_rate": 7.0040594265287635e-09, + "logits/chosen": -0.13686378300189972, + "logits/rejected": -0.15220175683498383, + "logps/chosen": -4.509222984313965, + "logps/rejected": -5.577731132507324, + "loss": 0.5055, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.509222984313965, + "rewards/margins": 1.0685081481933594, + "rewards/rejected": -5.577731132507324, + "sft_loss": 4.560286521911621, + "step": 5335 + }, + { + "epoch": 2.858003010536879, + "grad_norm": 19.238118519796025, + "learning_rate": 6.746687016066566e-09, + "logits/chosen": -0.1739339679479599, + "logits/rejected": -0.10641157627105713, + "logps/chosen": -4.436322212219238, + "logps/rejected": -5.848782539367676, + "loss": 0.4311, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.436322212219238, + "rewards/margins": 1.4124599695205688, + "rewards/rejected": -5.848782539367676, + "sft_loss": 4.444838523864746, + "step": 5340 + }, + { + "epoch": 2.8606790433182807, + "grad_norm": 17.343251096745842, + "learning_rate": 6.494100151410276e-09, + "logits/chosen": -0.27960461378097534, + "logits/rejected": -0.1422613561153412, + "logps/chosen": -4.423908710479736, + "logps/rejected": -5.740886688232422, + "loss": 0.4248, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.423908710479736, + "rewards/margins": 1.3169782161712646, + "rewards/rejected": -5.740886688232422, + "sft_loss": 4.494323253631592, + "step": 5345 + }, + { + "epoch": 2.8633550760996824, + "grad_norm": 20.911071402007103, + "learning_rate": 6.246301283158728e-09, + "logits/chosen": -0.1199701800942421, + "logits/rejected": -0.10320685058832169, + "logps/chosen": -4.56076717376709, + "logps/rejected": -5.661184787750244, + "loss": 0.5211, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.56076717376709, + "rewards/margins": 1.100416660308838, + "rewards/rejected": -5.661184787750244, + "sft_loss": 4.549746513366699, + "step": 5350 + }, + { + "epoch": 2.8660311088810837, + "grad_norm": 17.127061328827036, + "learning_rate": 6.0032928154576944e-09, + "logits/chosen": -0.1598750650882721, + "logits/rejected": -0.08833174407482147, + "logps/chosen": -4.526861667633057, + "logps/rejected": -5.655618190765381, + "loss": 0.4739, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.526861667633057, + "rewards/margins": 1.1287572383880615, + "rewards/rejected": -5.655618190765381, + "sft_loss": 4.569698333740234, + "step": 5355 + }, + { + "epoch": 2.8687071416624854, + "grad_norm": 23.43981155553961, + "learning_rate": 5.76507710597629e-09, + "logits/chosen": -0.2364230901002884, + "logits/rejected": -0.11034750938415527, + "logps/chosen": -4.5471601486206055, + "logps/rejected": -5.74447774887085, + "loss": 0.4807, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.5471601486206055, + "rewards/margins": 1.1973185539245605, + "rewards/rejected": -5.74447774887085, + "sft_loss": 4.5481438636779785, + "step": 5360 + }, + { + "epoch": 2.8713831744438867, + "grad_norm": 15.1758889330313, + "learning_rate": 5.531656465884438e-09, + "logits/chosen": -0.2452334612607956, + "logits/rejected": -0.14978325366973877, + "logps/chosen": -4.424814701080322, + "logps/rejected": -5.862797737121582, + "loss": 0.4287, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.424814701080322, + "rewards/margins": 1.4379831552505493, + "rewards/rejected": -5.862797737121582, + "sft_loss": 4.417214393615723, + "step": 5365 + }, + { + "epoch": 2.8740592072252884, + "grad_norm": 30.66560852395439, + "learning_rate": 5.303033159830217e-09, + "logits/chosen": -0.10269834846258163, + "logits/rejected": -0.06337957084178925, + "logps/chosen": -4.571722984313965, + "logps/rejected": -5.661180019378662, + "loss": 0.5119, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.571722984313965, + "rewards/margins": 1.0894571542739868, + "rewards/rejected": -5.661180019378662, + "sft_loss": 4.687636375427246, + "step": 5370 + }, + { + "epoch": 2.87673524000669, + "grad_norm": 23.770599957503787, + "learning_rate": 5.079209405917939e-09, + "logits/chosen": -0.1984288990497589, + "logits/rejected": -0.1162128821015358, + "logps/chosen": -4.328442096710205, + "logps/rejected": -6.042115688323975, + "loss": 0.3941, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.328442096710205, + "rewards/margins": 1.7136739492416382, + "rewards/rejected": -6.042115688323975, + "sft_loss": 4.449916362762451, + "step": 5375 + }, + { + "epoch": 2.879411272788092, + "grad_norm": 22.056122132564443, + "learning_rate": 4.860187375686664e-09, + "logits/chosen": -0.22836804389953613, + "logits/rejected": -0.05655550956726074, + "logps/chosen": -4.483396053314209, + "logps/rejected": -5.878445148468018, + "loss": 0.4264, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.483396053314209, + "rewards/margins": 1.3950486183166504, + "rewards/rejected": -5.878445148468018, + "sft_loss": 4.536786079406738, + "step": 5380 + }, + { + "epoch": 2.882087305569493, + "grad_norm": 13.671372307202047, + "learning_rate": 4.64596919408905e-09, + "logits/chosen": -0.15236499905586243, + "logits/rejected": -0.08293196558952332, + "logps/chosen": -4.356810569763184, + "logps/rejected": -5.7592902183532715, + "loss": 0.4255, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.356810569763184, + "rewards/margins": 1.4024803638458252, + "rewards/rejected": -5.7592902183532715, + "sft_loss": 4.476162433624268, + "step": 5385 + }, + { + "epoch": 2.884763338350895, + "grad_norm": 21.38253964489955, + "learning_rate": 4.436556939470814e-09, + "logits/chosen": -0.20900790393352509, + "logits/rejected": -0.08444569259881973, + "logps/chosen": -4.661978244781494, + "logps/rejected": -5.805869102478027, + "loss": 0.4992, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.661978244781494, + "rewards/margins": 1.1438905000686646, + "rewards/rejected": -5.805869102478027, + "sft_loss": 4.756087303161621, + "step": 5390 + }, + { + "epoch": 2.887439371132296, + "grad_norm": 20.906677985013843, + "learning_rate": 4.23195264355064e-09, + "logits/chosen": -0.3279460668563843, + "logits/rejected": -0.16175897419452667, + "logps/chosen": -4.424436569213867, + "logps/rejected": -5.78138542175293, + "loss": 0.4469, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.424436569213867, + "rewards/margins": 1.3569484949111938, + "rewards/rejected": -5.78138542175293, + "sft_loss": 4.508726119995117, + "step": 5395 + }, + { + "epoch": 2.890115403913698, + "grad_norm": 21.798446467514992, + "learning_rate": 4.032158291400245e-09, + "logits/chosen": -0.21985670924186707, + "logits/rejected": -0.03419061750173569, + "logps/chosen": -4.344112396240234, + "logps/rejected": -6.010276794433594, + "loss": 0.3772, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.344112396240234, + "rewards/margins": 1.6661643981933594, + "rewards/rejected": -6.010276794433594, + "sft_loss": 4.2952728271484375, + "step": 5400 + }, + { + "epoch": 2.8927914366950995, + "grad_norm": 16.7135439809327, + "learning_rate": 3.837175821425398e-09, + "logits/chosen": -0.15331804752349854, + "logits/rejected": -0.09507913887500763, + "logps/chosen": -4.596510410308838, + "logps/rejected": -5.7885541915893555, + "loss": 0.4997, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.596510410308838, + "rewards/margins": 1.192043423652649, + "rewards/rejected": -5.7885541915893555, + "sft_loss": 4.558037281036377, + "step": 5405 + }, + { + "epoch": 2.8954674694765012, + "grad_norm": 14.363309540256903, + "learning_rate": 3.6470071253467683e-09, + "logits/chosen": -0.19555512070655823, + "logits/rejected": -0.10160569846630096, + "logps/chosen": -4.516275405883789, + "logps/rejected": -6.0450921058654785, + "loss": 0.4521, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.516275405883789, + "rewards/margins": 1.5288162231445312, + "rewards/rejected": -6.0450921058654785, + "sft_loss": 4.563652038574219, + "step": 5410 + }, + { + "epoch": 2.8981435022579025, + "grad_norm": 15.761856304056796, + "learning_rate": 3.461654048181939e-09, + "logits/chosen": -0.20529595017433167, + "logits/rejected": -0.04511453956365585, + "logps/chosen": -4.611117839813232, + "logps/rejected": -5.723728179931641, + "loss": 0.4991, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.611117839813232, + "rewards/margins": 1.112610936164856, + "rewards/rejected": -5.723728179931641, + "sft_loss": 4.745556354522705, + "step": 5415 + }, + { + "epoch": 2.9008195350393042, + "grad_norm": 18.519294928115645, + "learning_rate": 3.281118388227255e-09, + "logits/chosen": -0.17930540442466736, + "logits/rejected": -0.12331312894821167, + "logps/chosen": -4.547412872314453, + "logps/rejected": -5.6964521408081055, + "loss": 0.5089, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.547412872314453, + "rewards/margins": 1.1490387916564941, + "rewards/rejected": -5.6964521408081055, + "sft_loss": 4.611541748046875, + "step": 5420 + }, + { + "epoch": 2.903495567820706, + "grad_norm": 19.685910361653935, + "learning_rate": 3.1054018970405048e-09, + "logits/chosen": -0.18745280802249908, + "logits/rejected": -0.11353820562362671, + "logps/chosen": -4.590461730957031, + "logps/rejected": -6.08115291595459, + "loss": 0.4268, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.590461730957031, + "rewards/margins": 1.4906920194625854, + "rewards/rejected": -6.08115291595459, + "sft_loss": 4.633131504058838, + "step": 5425 + }, + { + "epoch": 2.906171600602107, + "grad_norm": 18.955138875512645, + "learning_rate": 2.9345062794238207e-09, + "logits/chosen": -0.2041642665863037, + "logits/rejected": -0.051218412816524506, + "logps/chosen": -4.56999397277832, + "logps/rejected": -5.976227760314941, + "loss": 0.4072, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.56999397277832, + "rewards/margins": 1.406233310699463, + "rewards/rejected": -5.976227760314941, + "sft_loss": 4.611903667449951, + "step": 5430 + }, + { + "epoch": 2.908847633383509, + "grad_norm": 20.24734901216716, + "learning_rate": 2.7684331934072492e-09, + "logits/chosen": -0.2677224576473236, + "logits/rejected": -0.19448812305927277, + "logps/chosen": -4.410407066345215, + "logps/rejected": -5.844793796539307, + "loss": 0.4192, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.410407066345215, + "rewards/margins": 1.434386968612671, + "rewards/rejected": -5.844793796539307, + "sft_loss": 4.4703145027160645, + "step": 5435 + }, + { + "epoch": 2.9115236661649107, + "grad_norm": 17.967450509751167, + "learning_rate": 2.6071842502326526e-09, + "logits/chosen": -0.21685071289539337, + "logits/rejected": -0.11500433832406998, + "logps/chosen": -4.51969575881958, + "logps/rejected": -5.686825275421143, + "loss": 0.4601, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.51969575881958, + "rewards/margins": 1.1671292781829834, + "rewards/rejected": -5.686825275421143, + "sft_loss": 4.533324718475342, + "step": 5440 + }, + { + "epoch": 2.9141996989463124, + "grad_norm": 21.95311126238352, + "learning_rate": 2.450761014337888e-09, + "logits/chosen": -0.06685139238834381, + "logits/rejected": -0.013647640123963356, + "logps/chosen": -4.410046100616455, + "logps/rejected": -5.914144992828369, + "loss": 0.4668, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.410046100616455, + "rewards/margins": 1.504098892211914, + "rewards/rejected": -5.914144992828369, + "sft_loss": 4.446289539337158, + "step": 5445 + }, + { + "epoch": 2.9168757317277136, + "grad_norm": 27.19157248100409, + "learning_rate": 2.299165003341985e-09, + "logits/chosen": -0.08666609227657318, + "logits/rejected": -0.014598068781197071, + "logps/chosen": -4.55157995223999, + "logps/rejected": -5.894360542297363, + "loss": 0.4501, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.55157995223999, + "rewards/margins": 1.3427811861038208, + "rewards/rejected": -5.894360542297363, + "sft_loss": 4.572675704956055, + "step": 5450 + }, + { + "epoch": 2.9195517645091154, + "grad_norm": 18.407918261075594, + "learning_rate": 2.1523976880299945e-09, + "logits/chosen": -0.2196597307920456, + "logits/rejected": -0.06329698115587234, + "logps/chosen": -4.536431789398193, + "logps/rejected": -5.69393253326416, + "loss": 0.4873, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.536431789398193, + "rewards/margins": 1.157500982284546, + "rewards/rejected": -5.69393253326416, + "sft_loss": 4.546270847320557, + "step": 5455 + }, + { + "epoch": 2.9222277972905166, + "grad_norm": 13.718386083706063, + "learning_rate": 2.010460492339161e-09, + "logits/chosen": -0.23244929313659668, + "logits/rejected": -0.10611079633235931, + "logps/chosen": -4.383098602294922, + "logps/rejected": -5.782674312591553, + "loss": 0.4394, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.383098602294922, + "rewards/margins": 1.3995754718780518, + "rewards/rejected": -5.782674312591553, + "sft_loss": 4.387734889984131, + "step": 5460 + }, + { + "epoch": 2.9249038300719183, + "grad_norm": 13.546582964970408, + "learning_rate": 1.8733547933446614e-09, + "logits/chosen": -0.23045757412910461, + "logits/rejected": -0.06325678527355194, + "logps/chosen": -4.604036808013916, + "logps/rejected": -5.757791042327881, + "loss": 0.4742, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.604036808013916, + "rewards/margins": 1.1537543535232544, + "rewards/rejected": -5.757791042327881, + "sft_loss": 4.583401679992676, + "step": 5465 + }, + { + "epoch": 2.92757986285332, + "grad_norm": 31.614601842690373, + "learning_rate": 1.7410819212467231e-09, + "logits/chosen": -0.17662116885185242, + "logits/rejected": -0.09832396358251572, + "logps/chosen": -4.517539024353027, + "logps/rejected": -5.673015117645264, + "loss": 0.5014, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.517539024353027, + "rewards/margins": 1.1554756164550781, + "rewards/rejected": -5.673015117645264, + "sft_loss": 4.618490695953369, + "step": 5470 + }, + { + "epoch": 2.9302558956347218, + "grad_norm": 20.929350242079916, + "learning_rate": 1.613643159357192e-09, + "logits/chosen": -0.15189789235591888, + "logits/rejected": -0.18337905406951904, + "logps/chosen": -4.477793216705322, + "logps/rejected": -5.649487495422363, + "loss": 0.483, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.477793216705322, + "rewards/margins": 1.1716941595077515, + "rewards/rejected": -5.649487495422363, + "sft_loss": 4.517508029937744, + "step": 5475 + }, + { + "epoch": 2.932931928416123, + "grad_norm": 20.138531932920706, + "learning_rate": 1.4910397440875967e-09, + "logits/chosen": -0.20178821682929993, + "logits/rejected": -0.10650360584259033, + "logps/chosen": -4.510470390319824, + "logps/rejected": -5.833439826965332, + "loss": 0.461, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.510470390319824, + "rewards/margins": 1.3229695558547974, + "rewards/rejected": -5.833439826965332, + "sft_loss": 4.496905326843262, + "step": 5480 + }, + { + "epoch": 2.9356079611975248, + "grad_norm": 19.23378663117495, + "learning_rate": 1.3732728649368253e-09, + "logits/chosen": -0.15263617038726807, + "logits/rejected": -0.020247094333171844, + "logps/chosen": -4.409241676330566, + "logps/rejected": -5.532423496246338, + "loss": 0.4561, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.409241676330566, + "rewards/margins": 1.1231818199157715, + "rewards/rejected": -5.532423496246338, + "sft_loss": 4.408474445343018, + "step": 5485 + }, + { + "epoch": 2.938283993978926, + "grad_norm": 23.308633524253832, + "learning_rate": 1.260343664479524e-09, + "logits/chosen": -0.22360029816627502, + "logits/rejected": -0.17736531794071198, + "logps/chosen": -4.466728210449219, + "logps/rejected": -5.737008094787598, + "loss": 0.4687, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.466728210449219, + "rewards/margins": 1.2702802419662476, + "rewards/rejected": -5.737008094787598, + "sft_loss": 4.493401527404785, + "step": 5490 + }, + { + "epoch": 2.9409600267603278, + "grad_norm": 16.189471930127553, + "learning_rate": 1.1522532383554384e-09, + "logits/chosen": -0.19785809516906738, + "logits/rejected": -0.04793568700551987, + "logps/chosen": -4.383427619934082, + "logps/rejected": -5.96859884262085, + "loss": 0.386, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.383427619934082, + "rewards/margins": 1.5851705074310303, + "rewards/rejected": -5.96859884262085, + "sft_loss": 4.509185791015625, + "step": 5495 + }, + { + "epoch": 2.9436360595417295, + "grad_norm": 17.670331891374648, + "learning_rate": 1.049002635258256e-09, + "logits/chosen": -0.10283637046813965, + "logits/rejected": -0.01070125587284565, + "logps/chosen": -4.565319538116455, + "logps/rejected": -5.777436256408691, + "loss": 0.4657, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.565319538116455, + "rewards/margins": 1.2121164798736572, + "rewards/rejected": -5.777436256408691, + "sft_loss": 4.566296577453613, + "step": 5500 + }, + { + "epoch": 2.946312092323131, + "grad_norm": 25.038050542939718, + "learning_rate": 9.505928569258358e-10, + "logits/chosen": -0.16167476773262024, + "logits/rejected": -0.13245835900306702, + "logps/chosen": -4.51481294631958, + "logps/rejected": -5.77095365524292, + "loss": 0.4608, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.51481294631958, + "rewards/margins": 1.256141185760498, + "rewards/rejected": -5.77095365524292, + "sft_loss": 4.587597370147705, + "step": 5505 + }, + { + "epoch": 2.9489881251045325, + "grad_norm": 20.326158561359126, + "learning_rate": 8.57024858130273e-10, + "logits/chosen": -0.24194279313087463, + "logits/rejected": -0.1200401782989502, + "logps/chosen": -4.5456061363220215, + "logps/rejected": -6.069350242614746, + "loss": 0.4157, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.5456061363220215, + "rewards/margins": 1.5237447023391724, + "rewards/rejected": -6.069350242614746, + "sft_loss": 4.527360439300537, + "step": 5510 + }, + { + "epoch": 2.951664157885934, + "grad_norm": 32.48053677381899, + "learning_rate": 7.682995466686826e-10, + "logits/chosen": -0.2554609477519989, + "logits/rejected": -0.14878250658512115, + "logps/chosen": -4.514296531677246, + "logps/rejected": -5.8708391189575195, + "loss": 0.471, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.514296531677246, + "rewards/margins": 1.356541395187378, + "rewards/rejected": -5.8708391189575195, + "sft_loss": 4.559177875518799, + "step": 5515 + }, + { + "epoch": 2.9543401906673354, + "grad_norm": 20.44644783764318, + "learning_rate": 6.844177833543741e-10, + "logits/chosen": -0.18892745673656464, + "logits/rejected": -0.13082177937030792, + "logps/chosen": -4.487765312194824, + "logps/rejected": -5.767231464385986, + "loss": 0.436, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.487765312194824, + "rewards/margins": 1.2794665098190308, + "rewards/rejected": -5.767231464385986, + "sft_loss": 4.460915565490723, + "step": 5520 + }, + { + "epoch": 2.957016223448737, + "grad_norm": 19.29951548663288, + "learning_rate": 6.053803820087467e-10, + "logits/chosen": -0.21081380546092987, + "logits/rejected": -0.08385033160448074, + "logps/chosen": -4.683680534362793, + "logps/rejected": -6.0807600021362305, + "loss": 0.4437, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.683680534362793, + "rewards/margins": 1.3970798254013062, + "rewards/rejected": -6.0807600021362305, + "sft_loss": 4.776706695556641, + "step": 5525 + }, + { + "epoch": 2.959692256230139, + "grad_norm": 17.770995311277893, + "learning_rate": 5.311881094528514e-10, + "logits/chosen": -0.2691715359687805, + "logits/rejected": -0.11873998492956161, + "logps/chosen": -4.626561641693115, + "logps/rejected": -5.735688209533691, + "loss": 0.4969, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.626561641693115, + "rewards/margins": 1.109127402305603, + "rewards/rejected": -5.735688209533691, + "sft_loss": 4.608339786529541, + "step": 5530 + }, + { + "epoch": 2.9623682890115406, + "grad_norm": 23.104121952800956, + "learning_rate": 4.6184168550050806e-10, + "logits/chosen": -0.2036346197128296, + "logits/rejected": -0.13959690928459167, + "logps/chosen": -4.6203203201293945, + "logps/rejected": -5.855708122253418, + "loss": 0.485, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.6203203201293945, + "rewards/margins": 1.235388159751892, + "rewards/rejected": -5.855708122253418, + "sft_loss": 4.670767307281494, + "step": 5535 + }, + { + "epoch": 2.965044321792942, + "grad_norm": 22.17772707759298, + "learning_rate": 3.973417829510328e-10, + "logits/chosen": -0.2483900487422943, + "logits/rejected": -0.14307789504528046, + "logps/chosen": -4.501260280609131, + "logps/rejected": -5.735810279846191, + "loss": 0.4689, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.501260280609131, + "rewards/margins": 1.23455011844635, + "rewards/rejected": -5.735810279846191, + "sft_loss": 4.489443302154541, + "step": 5540 + }, + { + "epoch": 2.9677203545743436, + "grad_norm": 20.034360313400903, + "learning_rate": 3.3768902758274377e-10, + "logits/chosen": -0.15290217101573944, + "logits/rejected": -0.06998590379953384, + "logps/chosen": -4.484408378601074, + "logps/rejected": -5.721133708953857, + "loss": 0.4558, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.484408378601074, + "rewards/margins": 1.2367255687713623, + "rewards/rejected": -5.721133708953857, + "sft_loss": 4.4762349128723145, + "step": 5545 + }, + { + "epoch": 2.970396387355745, + "grad_norm": 18.00377929861334, + "learning_rate": 2.8288399814691e-10, + "logits/chosen": -0.12864898145198822, + "logits/rejected": -0.06968877464532852, + "logps/chosen": -4.501049041748047, + "logps/rejected": -5.645022392272949, + "loss": 0.4559, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.501049041748047, + "rewards/margins": 1.1439731121063232, + "rewards/rejected": -5.645022392272949, + "sft_loss": 4.486703395843506, + "step": 5550 + }, + { + "epoch": 2.9730724201371466, + "grad_norm": 23.72032930756754, + "learning_rate": 2.3292722636220066e-10, + "logits/chosen": -0.2211860716342926, + "logits/rejected": -0.08631716668605804, + "logps/chosen": -4.508865833282471, + "logps/rejected": -5.988263130187988, + "loss": 0.4365, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.508865833282471, + "rewards/margins": 1.4793974161148071, + "rewards/rejected": -5.988263130187988, + "sft_loss": 4.437670707702637, + "step": 5555 + }, + { + "epoch": 2.9757484529185483, + "grad_norm": 20.951770949095856, + "learning_rate": 1.8781919690946668e-10, + "logits/chosen": -0.18464908003807068, + "logits/rejected": -0.1366146057844162, + "logps/chosen": -4.533401966094971, + "logps/rejected": -5.666837215423584, + "loss": 0.4905, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.533401966094971, + "rewards/margins": 1.1334350109100342, + "rewards/rejected": -5.666837215423584, + "sft_loss": 4.637621879577637, + "step": 5560 + }, + { + "epoch": 2.97842448569995, + "grad_norm": 22.224432062005373, + "learning_rate": 1.4756034742696711e-10, + "logits/chosen": -0.21795520186424255, + "logits/rejected": -0.12108222395181656, + "logps/chosen": -4.572412014007568, + "logps/rejected": -5.863051414489746, + "loss": 0.4725, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.572412014007568, + "rewards/margins": 1.29063880443573, + "rewards/rejected": -5.863051414489746, + "sft_loss": 4.595419883728027, + "step": 5565 + }, + { + "epoch": 2.9811005184813513, + "grad_norm": 18.77963189428479, + "learning_rate": 1.12151068506261e-10, + "logits/chosen": -0.19824708998203278, + "logits/rejected": -0.09077189117670059, + "logps/chosen": -4.486553192138672, + "logps/rejected": -6.102303504943848, + "loss": 0.4185, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.486553192138672, + "rewards/margins": 1.6157506704330444, + "rewards/rejected": -6.102303504943848, + "sft_loss": 4.467534065246582, + "step": 5570 + }, + { + "epoch": 2.983776551262753, + "grad_norm": 18.796983598911968, + "learning_rate": 8.159170368826629e-11, + "logits/chosen": -0.2092389166355133, + "logits/rejected": -0.08719463646411896, + "logps/chosen": -4.276993751525879, + "logps/rejected": -5.6778717041015625, + "loss": 0.4692, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.276993751525879, + "rewards/margins": 1.4008785486221313, + "rewards/rejected": -5.6778717041015625, + "sft_loss": 4.343761920928955, + "step": 5575 + }, + { + "epoch": 2.9864525840441547, + "grad_norm": 24.112545714969144, + "learning_rate": 5.588254946015114e-11, + "logits/chosen": -0.25244978070259094, + "logits/rejected": -0.06573184579610825, + "logps/chosen": -4.505316257476807, + "logps/rejected": -5.831053733825684, + "loss": 0.4599, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.505316257476807, + "rewards/margins": 1.3257373571395874, + "rewards/rejected": -5.831053733825684, + "sft_loss": 4.507748126983643, + "step": 5580 + }, + { + "epoch": 2.989128616825556, + "grad_norm": 17.207660074491685, + "learning_rate": 3.502385525216978e-11, + "logits/chosen": -0.25891369581222534, + "logits/rejected": -0.12769198417663574, + "logps/chosen": -4.457161903381348, + "logps/rejected": -5.8368024826049805, + "loss": 0.436, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.457161903381348, + "rewards/margins": 1.3796398639678955, + "rewards/rejected": -5.8368024826049805, + "sft_loss": 4.56962251663208, + "step": 5585 + }, + { + "epoch": 2.9918046496069577, + "grad_norm": 19.680737690718, + "learning_rate": 1.901582343555308e-11, + "logits/chosen": -0.18700532615184784, + "logits/rejected": -0.10863462835550308, + "logps/chosen": -4.646914005279541, + "logps/rejected": -5.864001750946045, + "loss": 0.4952, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.646914005279541, + "rewards/margins": 1.2170881032943726, + "rewards/rejected": -5.864001750946045, + "sft_loss": 4.645427703857422, + "step": 5590 + }, + { + "epoch": 2.9944806823883594, + "grad_norm": 24.903963415968907, + "learning_rate": 7.858609320232634e-12, + "logits/chosen": -0.21200552582740784, + "logits/rejected": -0.06936580687761307, + "logps/chosen": -4.589808464050293, + "logps/rejected": -5.938838005065918, + "loss": 0.4508, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.589808464050293, + "rewards/margins": 1.349029302597046, + "rewards/rejected": -5.938838005065918, + "sft_loss": 4.675601959228516, + "step": 5595 + }, + { + "epoch": 2.9971567151697607, + "grad_norm": 21.750718778937895, + "learning_rate": 1.5523211535639624e-12, + "logits/chosen": -0.20201346278190613, + "logits/rejected": -0.10818527638912201, + "logps/chosen": -4.553851127624512, + "logps/rejected": -6.130950927734375, + "loss": 0.4531, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.553851127624512, + "rewards/margins": 1.5770998001098633, + "rewards/rejected": -6.130950927734375, + "sft_loss": 4.525333881378174, + "step": 5600 + }, + { + "epoch": 2.9971567151697607, + "eval_logits/chosen": 0.01564759947359562, + "eval_logits/rejected": 0.09269643574953079, + "eval_logps/chosen": -4.668255805969238, + "eval_logps/rejected": -5.721538066864014, + "eval_loss": 0.5805554389953613, + "eval_rewards/accuracies": 0.7292284965515137, + "eval_rewards/chosen": -4.668255805969238, + "eval_rewards/margins": 1.0532816648483276, + "eval_rewards/rejected": -5.721538066864014, + "eval_runtime": 42.9693, + "eval_samples_per_second": 31.301, + "eval_sft_loss": 4.639232158660889, + "eval_steps_per_second": 7.843, + "step": 5600 + }, + { + "epoch": 2.999297541394882, + "step": 5604, + "total_flos": 0.0, + "train_loss": 0.5697717436673725, + "train_runtime": 31532.5259, + "train_samples_per_second": 5.688, + "train_steps_per_second": 0.178 + } + ], + "logging_steps": 5, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}