{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01068804275217101, "grad_norm": 48.08801872083703, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.0088651180267334, "logits/rejected": -0.9796514511108398, "logps/chosen": -0.27404463291168213, "logps/rejected": -0.2714807987213135, "loss": 3.0862, "rewards/accuracies": 0.4375, "rewards/chosen": -2.7404463291168213, "rewards/margins": -0.025638360530138016, "rewards/rejected": -2.7148079872131348, "step": 5 }, { "epoch": 0.02137608550434202, "grad_norm": 36.32677728831924, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.0381072759628296, "logits/rejected": -0.9718279838562012, "logps/chosen": -0.2941068708896637, "logps/rejected": -0.29950597882270813, "loss": 3.0438, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.9410688877105713, "rewards/margins": 0.05399109050631523, "rewards/rejected": -2.9950597286224365, "step": 10 }, { "epoch": 0.03206412825651302, "grad_norm": 49.78322945052157, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.9687350988388062, "logits/rejected": -0.9881516695022583, "logps/chosen": -0.2641535699367523, "logps/rejected": -0.3008641302585602, "loss": 3.0435, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.641535520553589, "rewards/margins": 0.367105633020401, "rewards/rejected": -3.008641481399536, "step": 15 }, { "epoch": 0.04275217100868404, "grad_norm": 82.31380797314208, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9551814794540405, "logits/rejected": -0.9292391538619995, "logps/chosen": -0.27771705389022827, "logps/rejected": -0.29131630063056946, "loss": 2.9721, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.7771706581115723, "rewards/margins": 0.13599242269992828, "rewards/rejected": -2.913163185119629, "step": 20 }, { "epoch": 0.053440213760855046, "grad_norm": 53.235006398214004, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.0150171518325806, "logits/rejected": -0.9864410161972046, "logps/chosen": -0.27177074551582336, "logps/rejected": -0.2781718671321869, "loss": 3.0997, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.717707395553589, "rewards/margins": 0.06401108205318451, "rewards/rejected": -2.7817187309265137, "step": 25 }, { "epoch": 0.06412825651302605, "grad_norm": 48.20265041211247, "learning_rate": 6.382978723404255e-07, "logits/chosen": -0.9971543550491333, "logits/rejected": -0.9527053833007812, "logps/chosen": -0.2729942202568054, "logps/rejected": -0.2792928218841553, "loss": 2.934, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -2.7299423217773438, "rewards/margins": 0.06298607587814331, "rewards/rejected": -2.7929282188415527, "step": 30 }, { "epoch": 0.07481629926519706, "grad_norm": 61.462381482883586, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.0560848712921143, "logits/rejected": -0.9801801443099976, "logps/chosen": -0.2938428223133087, "logps/rejected": -0.31970539689064026, "loss": 2.9344, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.9384284019470215, "rewards/margins": 0.2586255371570587, "rewards/rejected": -3.1970536708831787, "step": 35 }, { "epoch": 0.08550434201736808, "grad_norm": 48.23709299735009, "learning_rate": 8.51063829787234e-07, "logits/chosen": -1.0155723094940186, "logits/rejected": -0.9708529710769653, "logps/chosen": -0.27932173013687134, "logps/rejected": -0.32081979513168335, "loss": 2.9377, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.793217182159424, "rewards/margins": 0.4149812161922455, "rewards/rejected": -3.208198070526123, "step": 40 }, { "epoch": 0.09619238476953908, "grad_norm": 41.61971926684603, "learning_rate": 9.574468085106384e-07, "logits/chosen": -1.041048288345337, "logits/rejected": -0.9983006715774536, "logps/chosen": -0.33150529861450195, "logps/rejected": -0.38151663541793823, "loss": 2.9992, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -3.3150532245635986, "rewards/margins": 0.5001133680343628, "rewards/rejected": -3.815166473388672, "step": 45 }, { "epoch": 0.10688042752171009, "grad_norm": 100.12864824952396, "learning_rate": 9.998741174712533e-07, "logits/chosen": -1.0379228591918945, "logits/rejected": -0.987024188041687, "logps/chosen": -0.332489937543869, "logps/rejected": -0.37106576561927795, "loss": 3.0501, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -3.324899673461914, "rewards/margins": 0.3857579827308655, "rewards/rejected": -3.7106575965881348, "step": 50 }, { "epoch": 0.11756847027388109, "grad_norm": 72.48994261626859, "learning_rate": 9.991050648838675e-07, "logits/chosen": -1.0559552907943726, "logits/rejected": -1.0201071500778198, "logps/chosen": -0.28731420636177063, "logps/rejected": -0.3468899130821228, "loss": 2.8141, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.8731420040130615, "rewards/margins": 0.5957568883895874, "rewards/rejected": -3.4688987731933594, "step": 55 }, { "epoch": 0.1282565130260521, "grad_norm": 61.901731832713274, "learning_rate": 9.97637968732563e-07, "logits/chosen": -1.0929205417633057, "logits/rejected": -1.0595076084136963, "logps/chosen": -0.32101407647132874, "logps/rejected": -0.3458675742149353, "loss": 2.9126, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.2101407051086426, "rewards/margins": 0.24853472411632538, "rewards/rejected": -3.4586753845214844, "step": 60 }, { "epoch": 0.13894455577822312, "grad_norm": 50.59676816630706, "learning_rate": 9.954748808839674e-07, "logits/chosen": -1.0052556991577148, "logits/rejected": -0.9761192202568054, "logps/chosen": -0.37314313650131226, "logps/rejected": -0.42295828461647034, "loss": 2.8395, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -3.731431484222412, "rewards/margins": 0.49815160036087036, "rewards/rejected": -4.229582786560059, "step": 65 }, { "epoch": 0.14963259853039412, "grad_norm": 36.25325647717933, "learning_rate": 9.926188266120295e-07, "logits/chosen": -1.0206578969955444, "logits/rejected": -0.9958024024963379, "logps/chosen": -0.34903720021247864, "logps/rejected": -0.42112284898757935, "loss": 2.8941, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -3.4903717041015625, "rewards/margins": 0.7208563685417175, "rewards/rejected": -4.211228370666504, "step": 70 }, { "epoch": 0.16032064128256512, "grad_norm": 58.31771658432584, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.9748294949531555, "logits/rejected": -0.9043160676956177, "logps/chosen": -0.35313880443573, "logps/rejected": -0.4008881151676178, "loss": 2.8612, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -3.5313880443573, "rewards/margins": 0.4774929881095886, "rewards/rejected": -4.008881092071533, "step": 75 }, { "epoch": 0.17100868403473615, "grad_norm": 44.196456861980444, "learning_rate": 9.848447601883433e-07, "logits/chosen": -0.952348530292511, "logits/rejected": -0.9403513073921204, "logps/chosen": -0.3478546738624573, "logps/rejected": -0.4435739517211914, "loss": 2.8169, "rewards/accuracies": 0.59375, "rewards/chosen": -3.4785468578338623, "rewards/margins": 0.957192599773407, "rewards/rejected": -4.435739517211914, "step": 80 }, { "epoch": 0.18169672678690715, "grad_norm": 53.25106725855234, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.9578202962875366, "logits/rejected": -0.9359887838363647, "logps/chosen": -0.33914855122566223, "logps/rejected": -0.39435747265815735, "loss": 2.7017, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.3914856910705566, "rewards/margins": 0.5520890951156616, "rewards/rejected": -3.9435744285583496, "step": 85 }, { "epoch": 0.19238476953907815, "grad_norm": 62.756361118688226, "learning_rate": 9.743592451943998e-07, "logits/chosen": -1.0025393962860107, "logits/rejected": -0.9687566757202148, "logps/chosen": -0.4075719714164734, "logps/rejected": -0.4891131818294525, "loss": 2.9058, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.075719356536865, "rewards/margins": 0.815412163734436, "rewards/rejected": -4.89113187789917, "step": 90 }, { "epoch": 0.20307281229124916, "grad_norm": 58.1782532242724, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.0763788223266602, "logits/rejected": -0.9944006204605103, "logps/chosen": -0.4433771073818207, "logps/rejected": -0.4820137917995453, "loss": 2.801, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.433770179748535, "rewards/margins": 0.38636699318885803, "rewards/rejected": -4.820137977600098, "step": 95 }, { "epoch": 0.21376085504342018, "grad_norm": 80.39391041051806, "learning_rate": 9.612209208833646e-07, "logits/chosen": -0.9918880462646484, "logits/rejected": -0.9666227102279663, "logps/chosen": -0.4386115074157715, "logps/rejected": -0.49519771337509155, "loss": 2.8678, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.386115074157715, "rewards/margins": 0.565862238407135, "rewards/rejected": -4.951976776123047, "step": 100 }, { "epoch": 0.22444889779559118, "grad_norm": 65.63401286377506, "learning_rate": 9.536793472839324e-07, "logits/chosen": -0.9991065263748169, "logits/rejected": -0.9451557993888855, "logps/chosen": -0.41105857491493225, "logps/rejected": -0.5156455636024475, "loss": 2.7833, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.110585689544678, "rewards/margins": 1.0458701848983765, "rewards/rejected": -5.156455039978027, "step": 105 }, { "epoch": 0.23513694054776219, "grad_norm": 55.69064157962758, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.941478431224823, "logits/rejected": -0.882702648639679, "logps/chosen": -0.47625547647476196, "logps/rejected": -0.5961230397224426, "loss": 2.7006, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.76255464553833, "rewards/margins": 1.1986756324768066, "rewards/rejected": -5.9612298011779785, "step": 110 }, { "epoch": 0.2458249832999332, "grad_norm": 53.605857092749346, "learning_rate": 9.367041003085648e-07, "logits/chosen": -1.0106314420700073, "logits/rejected": -0.9506574869155884, "logps/chosen": -0.5101800560951233, "logps/rejected": -0.5759706497192383, "loss": 2.5851, "rewards/accuracies": 0.625, "rewards/chosen": -5.101800441741943, "rewards/margins": 0.6579058766365051, "rewards/rejected": -5.759706497192383, "step": 115 }, { "epoch": 0.2565130260521042, "grad_norm": 61.05548036729825, "learning_rate": 9.272941683504808e-07, "logits/chosen": -0.9638258814811707, "logits/rejected": -0.8746799230575562, "logps/chosen": -0.5256550908088684, "logps/rejected": -0.7152215838432312, "loss": 2.4383, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.2565507888793945, "rewards/margins": 1.895665168762207, "rewards/rejected": -7.15221643447876, "step": 120 }, { "epoch": 0.26720106880427524, "grad_norm": 55.61903002549707, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.048182725906372, "logits/rejected": -1.0052070617675781, "logps/chosen": -0.5960938334465027, "logps/rejected": -0.6898385286331177, "loss": 2.376, "rewards/accuracies": 0.6875, "rewards/chosen": -5.960938453674316, "rewards/margins": 0.9374464154243469, "rewards/rejected": -6.898385524749756, "step": 125 }, { "epoch": 0.27788911155644624, "grad_norm": 74.68235139365107, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.062281847000122, "logits/rejected": -1.0553070306777954, "logps/chosen": -0.5950319170951843, "logps/rejected": -0.8419907689094543, "loss": 2.1897, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -5.950318813323975, "rewards/margins": 2.469588279724121, "rewards/rejected": -8.419907569885254, "step": 130 }, { "epoch": 0.28857715430861725, "grad_norm": 65.79785138861331, "learning_rate": 8.955355173281707e-07, "logits/chosen": -1.0449963808059692, "logits/rejected": -0.99858558177948, "logps/chosen": -0.7000755071640015, "logps/rejected": -0.8521116375923157, "loss": 2.1703, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -7.000756740570068, "rewards/margins": 1.5203603506088257, "rewards/rejected": -8.521116256713867, "step": 135 }, { "epoch": 0.29926519706078825, "grad_norm": 77.69749767075221, "learning_rate": 8.838223701790055e-07, "logits/chosen": -1.1070436239242554, "logits/rejected": -1.0852067470550537, "logps/chosen": -0.8452858924865723, "logps/rejected": -0.9862743616104126, "loss": 2.1772, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.452858924865723, "rewards/margins": 1.4098844528198242, "rewards/rejected": -9.862743377685547, "step": 140 }, { "epoch": 0.30995323981295925, "grad_norm": 80.47564023125216, "learning_rate": 8.71572412738697e-07, "logits/chosen": -1.0120787620544434, "logits/rejected": -0.9871314167976379, "logps/chosen": -0.8392161130905151, "logps/rejected": -1.0649665594100952, "loss": 2.0262, "rewards/accuracies": 0.75, "rewards/chosen": -8.392160415649414, "rewards/margins": 2.257505416870117, "rewards/rejected": -10.649667739868164, "step": 145 }, { "epoch": 0.32064128256513025, "grad_norm": 73.58812460393091, "learning_rate": 8.588027776804058e-07, "logits/chosen": -1.0465233325958252, "logits/rejected": -1.0288817882537842, "logps/chosen": -0.9250057339668274, "logps/rejected": -1.1665313243865967, "loss": 2.0023, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.250056266784668, "rewards/margins": 2.4152560234069824, "rewards/rejected": -11.665312767028809, "step": 150 }, { "epoch": 0.33132932531730125, "grad_norm": 71.25828093855627, "learning_rate": 8.455313244934324e-07, "logits/chosen": -1.0641199350357056, "logits/rejected": -1.043682336807251, "logps/chosen": -0.9912335276603699, "logps/rejected": -1.2935209274291992, "loss": 2.0835, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.912336349487305, "rewards/margins": 3.0228726863861084, "rewards/rejected": -12.935208320617676, "step": 155 }, { "epoch": 0.3420173680694723, "grad_norm": 79.6477068661602, "learning_rate": 8.317766145051057e-07, "logits/chosen": -1.075276255607605, "logits/rejected": -1.0590044260025024, "logps/chosen": -1.1134541034698486, "logps/rejected": -1.501103162765503, "loss": 2.0148, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -11.134541511535645, "rewards/margins": 3.8764891624450684, "rewards/rejected": -15.011030197143555, "step": 160 }, { "epoch": 0.3527054108216433, "grad_norm": 56.59766760070266, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.087802529335022, "logits/rejected": -1.0643970966339111, "logps/chosen": -1.0862282514572144, "logps/rejected": -1.4578862190246582, "loss": 1.8755, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -10.862282752990723, "rewards/margins": 3.716578960418701, "rewards/rejected": -14.578862190246582, "step": 165 }, { "epoch": 0.3633934535738143, "grad_norm": 80.63138234673713, "learning_rate": 8.028950219204099e-07, "logits/chosen": -1.0851577520370483, "logits/rejected": -1.0633080005645752, "logps/chosen": -1.0777294635772705, "logps/rejected": -1.4586127996444702, "loss": 1.8428, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -10.777295112609863, "rewards/margins": 3.808833599090576, "rewards/rejected": -14.586128234863281, "step": 170 }, { "epoch": 0.3740814963259853, "grad_norm": 85.48434803985849, "learning_rate": 7.878085328428368e-07, "logits/chosen": -1.1011741161346436, "logits/rejected": -1.0544461011886597, "logps/chosen": -1.1464722156524658, "logps/rejected": -1.4175713062286377, "loss": 1.7044, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -11.4647216796875, "rewards/margins": 2.710991144180298, "rewards/rejected": -14.175712585449219, "step": 175 }, { "epoch": 0.3847695390781563, "grad_norm": 69.40055786476466, "learning_rate": 7.723195175075135e-07, "logits/chosen": -1.0559461116790771, "logits/rejected": -1.0371758937835693, "logps/chosen": -1.1131945848464966, "logps/rejected": -1.4650800228118896, "loss": 1.6498, "rewards/accuracies": 0.75, "rewards/chosen": -11.131945610046387, "rewards/margins": 3.518855333328247, "rewards/rejected": -14.650799751281738, "step": 180 }, { "epoch": 0.3954575818303273, "grad_norm": 98.45060400089052, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.0995948314666748, "logits/rejected": -1.0464978218078613, "logps/chosen": -1.1343258619308472, "logps/rejected": -1.5403584241867065, "loss": 1.707, "rewards/accuracies": 0.84375, "rewards/chosen": -11.343259811401367, "rewards/margins": 4.06032657623291, "rewards/rejected": -15.403585433959961, "step": 185 }, { "epoch": 0.4061456245824983, "grad_norm": 81.01007603467696, "learning_rate": 7.402210918896689e-07, "logits/chosen": -1.0919989347457886, "logits/rejected": -1.100353479385376, "logps/chosen": -1.233001470565796, "logps/rejected": -1.700783371925354, "loss": 1.5602, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -12.330015182495117, "rewards/margins": 4.677817344665527, "rewards/rejected": -17.00783348083496, "step": 190 }, { "epoch": 0.4168336673346693, "grad_norm": 65.85012462833096, "learning_rate": 7.236565741578162e-07, "logits/chosen": -1.0563186407089233, "logits/rejected": -1.0381094217300415, "logps/chosen": -1.264200210571289, "logps/rejected": -1.6215693950653076, "loss": 1.6265, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -12.642003059387207, "rewards/margins": 3.5736896991729736, "rewards/rejected": -16.2156925201416, "step": 195 }, { "epoch": 0.42752171008684037, "grad_norm": 95.1474104076758, "learning_rate": 7.067792524832603e-07, "logits/chosen": -1.0285234451293945, "logits/rejected": -1.0193403959274292, "logps/chosen": -1.3396581411361694, "logps/rejected": -1.7659127712249756, "loss": 1.5787, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -13.396580696105957, "rewards/margins": 4.262548923492432, "rewards/rejected": -17.659130096435547, "step": 200 }, { "epoch": 0.43820975283901137, "grad_norm": 85.84249896936906, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.0812981128692627, "logits/rejected": -1.0329921245574951, "logps/chosen": -1.421687364578247, "logps/rejected": -1.8678224086761475, "loss": 1.6933, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -14.216875076293945, "rewards/margins": 4.461349964141846, "rewards/rejected": -18.678224563598633, "step": 205 }, { "epoch": 0.44889779559118237, "grad_norm": 82.79707797243407, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.111215353012085, "logits/rejected": -1.1012616157531738, "logps/chosen": -1.4283300638198853, "logps/rejected": -1.8404741287231445, "loss": 1.6743, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -14.283300399780273, "rewards/margins": 4.121440410614014, "rewards/rejected": -18.404741287231445, "step": 210 }, { "epoch": 0.45958583834335337, "grad_norm": 96.36132500218224, "learning_rate": 6.545084971874736e-07, "logits/chosen": -1.0573780536651611, "logits/rejected": -1.042012333869934, "logps/chosen": -1.407849669456482, "logps/rejected": -1.8703521490097046, "loss": 1.537, "rewards/accuracies": 0.8125, "rewards/chosen": -14.078496932983398, "rewards/margins": 4.625025749206543, "rewards/rejected": -18.703523635864258, "step": 215 }, { "epoch": 0.47027388109552437, "grad_norm": 90.36166278660677, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.1174544095993042, "logits/rejected": -1.101723551750183, "logps/chosen": -1.4890402555465698, "logps/rejected": -1.957594633102417, "loss": 1.4701, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -14.890401840209961, "rewards/margins": 4.685546398162842, "rewards/rejected": -19.575946807861328, "step": 220 }, { "epoch": 0.48096192384769537, "grad_norm": 68.44788481379787, "learning_rate": 6.185401888577487e-07, "logits/chosen": -1.0951309204101562, "logits/rejected": -1.0667812824249268, "logps/chosen": -1.5165177583694458, "logps/rejected": -1.9649711847305298, "loss": 1.4516, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -15.165176391601562, "rewards/margins": 4.484532356262207, "rewards/rejected": -19.649709701538086, "step": 225 }, { "epoch": 0.4916499665998664, "grad_norm": 80.50989326225354, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.1788781881332397, "logits/rejected": -1.1332446336746216, "logps/chosen": -1.4616793394088745, "logps/rejected": -1.900464415550232, "loss": 1.4113, "rewards/accuracies": 0.78125, "rewards/chosen": -14.616796493530273, "rewards/margins": 4.387850761413574, "rewards/rejected": -19.00464630126953, "step": 230 }, { "epoch": 0.5023380093520374, "grad_norm": 77.76442108702383, "learning_rate": 5.819089557075688e-07, "logits/chosen": -1.2070626020431519, "logits/rejected": -1.1811145544052124, "logps/chosen": -1.488283395767212, "logps/rejected": -1.9684479236602783, "loss": 1.3956, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -14.882832527160645, "rewards/margins": 4.801646709442139, "rewards/rejected": -19.684478759765625, "step": 235 }, { "epoch": 0.5130260521042084, "grad_norm": 80.31662720738808, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.1846423149108887, "logits/rejected": -1.1862051486968994, "logps/chosen": -1.4058133363723755, "logps/rejected": -1.8912986516952515, "loss": 1.4273, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.058133125305176, "rewards/margins": 4.854854106903076, "rewards/rejected": -18.912986755371094, "step": 240 }, { "epoch": 0.5237140948563794, "grad_norm": 91.13175646088759, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.284184217453003, "logits/rejected": -1.2297312021255493, "logps/chosen": -1.4828532934188843, "logps/rejected": -2.057310104370117, "loss": 1.3937, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -14.828534126281738, "rewards/margins": 5.744570255279541, "rewards/rejected": -20.573101043701172, "step": 245 }, { "epoch": 0.5344021376085505, "grad_norm": 123.53088607985745, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.2075603008270264, "logits/rejected": -1.1929789781570435, "logps/chosen": -1.5853002071380615, "logps/rejected": -2.1607205867767334, "loss": 1.3189, "rewards/accuracies": 0.84375, "rewards/chosen": -15.853002548217773, "rewards/margins": 5.754203796386719, "rewards/rejected": -21.60720443725586, "step": 250 }, { "epoch": 0.5450901803607214, "grad_norm": 71.28630445206376, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.2610571384429932, "logits/rejected": -1.2377506494522095, "logps/chosen": -1.702081322669983, "logps/rejected": -2.176151990890503, "loss": 1.4628, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.020811080932617, "rewards/margins": 4.740708351135254, "rewards/rejected": -21.761520385742188, "step": 255 }, { "epoch": 0.5557782231128925, "grad_norm": 83.40791662734144, "learning_rate": 4.887809678520975e-07, "logits/chosen": -1.216742753982544, "logits/rejected": -1.1893587112426758, "logps/chosen": -1.6400012969970703, "logps/rejected": -2.1335091590881348, "loss": 1.4187, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -16.400012969970703, "rewards/margins": 4.935075283050537, "rewards/rejected": -21.3350887298584, "step": 260 }, { "epoch": 0.5664662658650634, "grad_norm": 86.00167726466505, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.1840455532073975, "logits/rejected": -1.1674549579620361, "logps/chosen": -1.7272002696990967, "logps/rejected": -2.246192216873169, "loss": 1.3947, "rewards/accuracies": 0.8125, "rewards/chosen": -17.27199935913086, "rewards/margins": 5.189921855926514, "rewards/rejected": -22.46192169189453, "step": 265 }, { "epoch": 0.5771543086172345, "grad_norm": 132.86583814774934, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.1881189346313477, "logits/rejected": -1.1602287292480469, "logps/chosen": -1.5204384326934814, "logps/rejected": -2.053271770477295, "loss": 1.3507, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.204385757446289, "rewards/margins": 5.328333854675293, "rewards/rejected": -20.532718658447266, "step": 270 }, { "epoch": 0.5878423513694054, "grad_norm": 77.55825472401709, "learning_rate": 4.328833670911724e-07, "logits/chosen": -1.1466959714889526, "logits/rejected": -1.1136794090270996, "logps/chosen": -1.5034624338150024, "logps/rejected": -1.962316870689392, "loss": 1.4798, "rewards/accuracies": 0.8125, "rewards/chosen": -15.034624099731445, "rewards/margins": 4.5885443687438965, "rewards/rejected": -19.6231689453125, "step": 275 }, { "epoch": 0.5985303941215765, "grad_norm": 101.24977397004761, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.2359565496444702, "logits/rejected": -1.2119746208190918, "logps/chosen": -1.5608460903167725, "logps/rejected": -2.0915093421936035, "loss": 1.3835, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -15.6084623336792, "rewards/margins": 5.306631565093994, "rewards/rejected": -20.91509246826172, "step": 280 }, { "epoch": 0.6092184368737475, "grad_norm": 98.11064576124802, "learning_rate": 3.960441545911204e-07, "logits/chosen": -1.1984508037567139, "logits/rejected": -1.1687726974487305, "logps/chosen": -1.568647861480713, "logps/rejected": -2.1477620601654053, "loss": 1.1256, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -15.686477661132812, "rewards/margins": 5.791141510009766, "rewards/rejected": -21.47762107849121, "step": 285 }, { "epoch": 0.6199064796259185, "grad_norm": 78.12499548715576, "learning_rate": 3.778297969310529e-07, "logits/chosen": -1.2200652360916138, "logits/rejected": -1.1771905422210693, "logps/chosen": -1.6141189336776733, "logps/rejected": -2.081244707107544, "loss": 1.4161, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -16.141189575195312, "rewards/margins": 4.671259880065918, "rewards/rejected": -20.812448501586914, "step": 290 }, { "epoch": 0.6305945223780896, "grad_norm": 95.75556658939907, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.1571695804595947, "logits/rejected": -1.1426270008087158, "logps/chosen": -1.6118923425674438, "logps/rejected": -2.1269898414611816, "loss": 1.1696, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -16.11892318725586, "rewards/margins": 5.150975704193115, "rewards/rejected": -21.2698974609375, "step": 295 }, { "epoch": 0.6412825651302605, "grad_norm": 77.69963195203665, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -1.0948816537857056, "logits/rejected": -1.0683825016021729, "logps/chosen": -1.7446101903915405, "logps/rejected": -2.2061550617218018, "loss": 1.6109, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -17.446102142333984, "rewards/margins": 4.615448951721191, "rewards/rejected": -22.06155014038086, "step": 300 }, { "epoch": 0.6519706078824316, "grad_norm": 78.33892667874268, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.1981477737426758, "logits/rejected": -1.155051589012146, "logps/chosen": -1.7163928747177124, "logps/rejected": -2.1944797039031982, "loss": 1.2958, "rewards/accuracies": 0.8125, "rewards/chosen": -17.163930892944336, "rewards/margins": 4.780867576599121, "rewards/rejected": -21.94479751586914, "step": 305 }, { "epoch": 0.6626586506346025, "grad_norm": 89.88604148049505, "learning_rate": 3.069319753571269e-07, "logits/chosen": -1.2228304147720337, "logits/rejected": -1.203338623046875, "logps/chosen": -1.725542664527893, "logps/rejected": -2.2549095153808594, "loss": 1.4162, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -17.255422592163086, "rewards/margins": 5.293669700622559, "rewards/rejected": -22.549095153808594, "step": 310 }, { "epoch": 0.6733466933867736, "grad_norm": 93.65154390265553, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.2166553735733032, "logits/rejected": -1.1637303829193115, "logps/chosen": -1.7361829280853271, "logps/rejected": -2.2571377754211426, "loss": 1.4039, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -17.361827850341797, "rewards/margins": 5.209545612335205, "rewards/rejected": -22.571374893188477, "step": 315 }, { "epoch": 0.6840347361389446, "grad_norm": 91.21799281847946, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.2119382619857788, "logits/rejected": -1.208423376083374, "logps/chosen": -1.7504520416259766, "logps/rejected": -2.366504669189453, "loss": 1.3127, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -17.504520416259766, "rewards/margins": 6.1605224609375, "rewards/rejected": -23.6650447845459, "step": 320 }, { "epoch": 0.6947227788911156, "grad_norm": 63.56175150661933, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -1.2254472970962524, "logits/rejected": -1.1779931783676147, "logps/chosen": -1.7673594951629639, "logps/rejected": -2.4444289207458496, "loss": 1.3033, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -17.673593521118164, "rewards/margins": 6.770694732666016, "rewards/rejected": -24.444290161132812, "step": 325 }, { "epoch": 0.7054108216432866, "grad_norm": 72.1905404472246, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.140060305595398, "logits/rejected": -1.0987244844436646, "logps/chosen": -1.770612359046936, "logps/rejected": -2.268968105316162, "loss": 1.3575, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.70612335205078, "rewards/margins": 4.983555793762207, "rewards/rejected": -22.689678192138672, "step": 330 }, { "epoch": 0.7160988643954576, "grad_norm": 101.33406078916167, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -1.1706708669662476, "logits/rejected": -1.153840184211731, "logps/chosen": -1.7666661739349365, "logps/rejected": -2.324845314025879, "loss": 1.2353, "rewards/accuracies": 0.84375, "rewards/chosen": -17.666664123535156, "rewards/margins": 5.581789970397949, "rewards/rejected": -23.248455047607422, "step": 335 }, { "epoch": 0.7267869071476286, "grad_norm": 100.14920380457131, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -1.1839454174041748, "logits/rejected": -1.1537086963653564, "logps/chosen": -1.6535180807113647, "logps/rejected": -2.219202756881714, "loss": 1.3345, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -16.535181045532227, "rewards/margins": 5.656845569610596, "rewards/rejected": -22.192026138305664, "step": 340 }, { "epoch": 0.7374749498997996, "grad_norm": 82.946108666922, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -1.2198922634124756, "logits/rejected": -1.197533369064331, "logps/chosen": -1.7453501224517822, "logps/rejected": -2.3954808712005615, "loss": 1.3989, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -17.4534969329834, "rewards/margins": 6.501312255859375, "rewards/rejected": -23.95481300354004, "step": 345 }, { "epoch": 0.7481629926519706, "grad_norm": 123.61164574835428, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.2053463459014893, "logits/rejected": -1.2073911428451538, "logps/chosen": -1.5878703594207764, "logps/rejected": -2.1133296489715576, "loss": 1.288, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -15.878703117370605, "rewards/margins": 5.254591941833496, "rewards/rejected": -21.133296966552734, "step": 350 }, { "epoch": 0.7588510354041417, "grad_norm": 72.79372985900129, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.1358482837677002, "logits/rejected": -1.1486907005310059, "logps/chosen": -1.6166797876358032, "logps/rejected": -2.18247389793396, "loss": 1.178, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -16.166797637939453, "rewards/margins": 5.657941818237305, "rewards/rejected": -21.824739456176758, "step": 355 }, { "epoch": 0.7695390781563126, "grad_norm": 78.754834231354, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -1.1821458339691162, "logits/rejected": -1.1815634965896606, "logps/chosen": -1.6133620738983154, "logps/rejected": -2.2593438625335693, "loss": 1.3108, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.133623123168945, "rewards/margins": 6.459815979003906, "rewards/rejected": -22.59343719482422, "step": 360 }, { "epoch": 0.7802271209084837, "grad_norm": 73.98291665153468, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -1.222730278968811, "logits/rejected": -1.1620246171951294, "logps/chosen": -1.6831239461898804, "logps/rejected": -2.3258934020996094, "loss": 1.283, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -16.831241607666016, "rewards/margins": 6.427696228027344, "rewards/rejected": -23.25893783569336, "step": 365 }, { "epoch": 0.7909151636606546, "grad_norm": 95.07266315983894, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.151348352432251, "logits/rejected": -1.1408799886703491, "logps/chosen": -1.6504104137420654, "logps/rejected": -2.1494410037994385, "loss": 1.2917, "rewards/accuracies": 0.84375, "rewards/chosen": -16.504104614257812, "rewards/margins": 4.990302562713623, "rewards/rejected": -21.494407653808594, "step": 370 }, { "epoch": 0.8016032064128257, "grad_norm": 79.33408613898226, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.196144938468933, "logits/rejected": -1.201220154762268, "logps/chosen": -1.7832542657852173, "logps/rejected": -2.3613197803497314, "loss": 1.2354, "rewards/accuracies": 0.84375, "rewards/chosen": -17.832544326782227, "rewards/margins": 5.78065299987793, "rewards/rejected": -23.613195419311523, "step": 375 }, { "epoch": 0.8122912491649966, "grad_norm": 128.43857212250256, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.2059003114700317, "logits/rejected": -1.187628149986267, "logps/chosen": -1.6523897647857666, "logps/rejected": -2.1899101734161377, "loss": 1.3942, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -16.52389907836914, "rewards/margins": 5.3752055168151855, "rewards/rejected": -21.89910316467285, "step": 380 }, { "epoch": 0.8229792919171677, "grad_norm": 64.21604840691542, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.1978672742843628, "logits/rejected": -1.1761789321899414, "logps/chosen": -1.7003828287124634, "logps/rejected": -2.404451608657837, "loss": 1.11, "rewards/accuracies": 0.84375, "rewards/chosen": -17.003828048706055, "rewards/margins": 7.040687561035156, "rewards/rejected": -24.044517517089844, "step": 385 }, { "epoch": 0.8336673346693386, "grad_norm": 70.99597720485836, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.2044178247451782, "logits/rejected": -1.1908612251281738, "logps/chosen": -1.763636827468872, "logps/rejected": -2.362152576446533, "loss": 1.265, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -17.636367797851562, "rewards/margins": 5.985157489776611, "rewards/rejected": -23.621524810791016, "step": 390 }, { "epoch": 0.8443553774215097, "grad_norm": 97.52015838877179, "learning_rate": 7.077560319906694e-08, "logits/chosen": -1.2074767351150513, "logits/rejected": -1.1871702671051025, "logps/chosen": -1.6937364339828491, "logps/rejected": -2.2520108222961426, "loss": 1.2622, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -16.93736457824707, "rewards/margins": 5.5827436447143555, "rewards/rejected": -22.52010726928711, "step": 395 }, { "epoch": 0.8550434201736807, "grad_norm": 64.30358993655315, "learning_rate": 6.148679950161672e-08, "logits/chosen": -1.2058738470077515, "logits/rejected": -1.1902999877929688, "logps/chosen": -1.7209612131118774, "logps/rejected": -2.245680332183838, "loss": 1.2091, "rewards/accuracies": 0.8125, "rewards/chosen": -17.209611892700195, "rewards/margins": 5.247193336486816, "rewards/rejected": -22.456806182861328, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -1.3691134452819824, "eval_logits/rejected": -1.3789646625518799, "eval_logps/chosen": -1.7368168830871582, "eval_logps/rejected": -2.3018128871917725, "eval_loss": 1.2327845096588135, "eval_rewards/accuracies": 0.8414633870124817, "eval_rewards/chosen": -17.3681697845459, "eval_rewards/margins": 5.649959087371826, "eval_rewards/rejected": -23.018129348754883, "eval_runtime": 95.7406, "eval_samples_per_second": 20.482, "eval_steps_per_second": 1.285, "step": 400 }, { "epoch": 0.8657314629258517, "grad_norm": 102.26927527904057, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -1.1695072650909424, "logits/rejected": -1.187680959701538, "logps/chosen": -1.8056495189666748, "logps/rejected": -2.3746306896209717, "loss": 1.227, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -18.05649185180664, "rewards/margins": 5.689815044403076, "rewards/rejected": -23.74631118774414, "step": 405 }, { "epoch": 0.8764195056780227, "grad_norm": 143.72613060267224, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -1.197076439857483, "logits/rejected": -1.1834781169891357, "logps/chosen": -1.7283179759979248, "logps/rejected": -2.2808258533477783, "loss": 1.3386, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -17.283184051513672, "rewards/margins": 5.5250773429870605, "rewards/rejected": -22.808258056640625, "step": 410 }, { "epoch": 0.8871075484301937, "grad_norm": 71.89013668179118, "learning_rate": 3.734784976300165e-08, "logits/chosen": -1.1907715797424316, "logits/rejected": -1.140285611152649, "logps/chosen": -1.6651338338851929, "logps/rejected": -2.3171894550323486, "loss": 1.402, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -16.65134048461914, "rewards/margins": 6.520555019378662, "rewards/rejected": -23.171894073486328, "step": 415 }, { "epoch": 0.8977955911823647, "grad_norm": 76.39313147299151, "learning_rate": 3.058153372200695e-08, "logits/chosen": -1.2226186990737915, "logits/rejected": -1.174892783164978, "logps/chosen": -1.624670386314392, "logps/rejected": -2.2263433933258057, "loss": 1.2343, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -16.246702194213867, "rewards/margins": 6.016730308532715, "rewards/rejected": -22.2634334564209, "step": 420 }, { "epoch": 0.9084836339345357, "grad_norm": 97.62440223770292, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.2083046436309814, "logits/rejected": -1.199594259262085, "logps/chosen": -1.8252136707305908, "logps/rejected": -2.3690743446350098, "loss": 1.449, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -18.25213623046875, "rewards/margins": 5.438607215881348, "rewards/rejected": -23.690744400024414, "step": 425 }, { "epoch": 0.9191716766867067, "grad_norm": 88.72134746678486, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -1.203539490699768, "logits/rejected": -1.1891189813613892, "logps/chosen": -1.7990118265151978, "logps/rejected": -2.40352725982666, "loss": 1.2351, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -17.990116119384766, "rewards/margins": 6.0451555252075195, "rewards/rejected": -24.0352725982666, "step": 430 }, { "epoch": 0.9298597194388778, "grad_norm": 84.6823661942337, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -1.1562435626983643, "logits/rejected": -1.119716763496399, "logps/chosen": -1.6462457180023193, "logps/rejected": -2.239661693572998, "loss": 1.219, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -16.46245765686035, "rewards/margins": 5.934161186218262, "rewards/rejected": -22.396617889404297, "step": 435 }, { "epoch": 0.9405477621910487, "grad_norm": 83.00612480405555, "learning_rate": 1.016230078838226e-08, "logits/chosen": -1.1879512071609497, "logits/rejected": -1.1317486763000488, "logps/chosen": -1.7986972332000732, "logps/rejected": -2.3512682914733887, "loss": 1.2595, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -17.98697280883789, "rewards/margins": 5.5257110595703125, "rewards/rejected": -23.512683868408203, "step": 440 }, { "epoch": 0.9512358049432198, "grad_norm": 71.06146659230785, "learning_rate": 6.754703038239329e-09, "logits/chosen": -1.1331255435943604, "logits/rejected": -1.1174695491790771, "logps/chosen": -1.7602777481079102, "logps/rejected": -2.4175848960876465, "loss": 1.1517, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -17.6027774810791, "rewards/margins": 6.573070526123047, "rewards/rejected": -24.17584800720215, "step": 445 }, { "epoch": 0.9619238476953907, "grad_norm": 104.27511874656412, "learning_rate": 4.036953436716895e-09, "logits/chosen": -1.2391235828399658, "logits/rejected": -1.2209500074386597, "logps/chosen": -1.7086238861083984, "logps/rejected": -2.2565104961395264, "loss": 1.3267, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.086238861083984, "rewards/margins": 5.4788665771484375, "rewards/rejected": -22.565105438232422, "step": 450 }, { "epoch": 0.9726118904475618, "grad_norm": 93.90841637927619, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -1.2041445970535278, "logits/rejected": -1.170940637588501, "logps/chosen": -1.7216434478759766, "logps/rejected": -2.3828110694885254, "loss": 1.0634, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -17.216434478759766, "rewards/margins": 6.611675262451172, "rewards/rejected": -23.828107833862305, "step": 455 }, { "epoch": 0.9832999331997327, "grad_norm": 96.31609099374828, "learning_rate": 6.852326227130833e-10, "logits/chosen": -1.210766315460205, "logits/rejected": -1.1994924545288086, "logps/chosen": -1.80474054813385, "logps/rejected": -2.4195799827575684, "loss": 1.2129, "rewards/accuracies": 0.84375, "rewards/chosen": -18.047405242919922, "rewards/margins": 6.148395538330078, "rewards/rejected": -24.195802688598633, "step": 460 }, { "epoch": 0.9939879759519038, "grad_norm": 83.00770987189074, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.1916502714157104, "logits/rejected": -1.1978678703308105, "logps/chosen": -1.8223260641098022, "logps/rejected": -2.4590952396392822, "loss": 1.366, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -18.22325897216797, "rewards/margins": 6.367691993713379, "rewards/rejected": -24.590951919555664, "step": 465 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 1.8326820045773426, "train_runtime": 11496.4274, "train_samples_per_second": 5.208, "train_steps_per_second": 0.041 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }