{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998424948810837, "eval_steps": 100, "global_step": 3174, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.322265625, "learning_rate": 1.5723270440251573e-08, "logits/chosen": -1.3876760005950928, "logits/rejected": -1.4584133625030518, "logps/chosen": -148.11717224121094, "logps/rejected": -197.28189086914062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.474609375, "learning_rate": 1.5723270440251575e-07, "logits/chosen": -1.2968941926956177, "logits/rejected": -1.006857991218567, "logps/chosen": -190.5150604248047, "logps/rejected": -182.05677795410156, "loss": 0.6929, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": 0.0008159870631061494, "rewards/margins": 0.0014503882266581059, "rewards/margins_max": 0.0036827889271080494, "rewards/margins_min": -0.0007820128812454641, "rewards/margins_std": 0.003157091559842229, "rewards/rejected": -0.0006344011053442955, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.443359375, "learning_rate": 3.144654088050315e-07, "logits/chosen": -1.3660128116607666, "logits/rejected": -1.0527918338775635, "logps/chosen": -225.42782592773438, "logps/rejected": -200.10775756835938, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0007323303143493831, "rewards/margins": 0.0006205940153449774, "rewards/margins_max": 0.0026885834522545338, "rewards/margins_min": -0.0014473951887339354, "rewards/margins_std": 0.002924578730016947, "rewards/rejected": 0.0001117362771765329, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.431640625, "learning_rate": 4.716981132075472e-07, "logits/chosen": -1.2632051706314087, "logits/rejected": -0.9830085039138794, "logps/chosen": -180.41348266601562, "logps/rejected": -184.5921173095703, "loss": 0.6928, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0001449241244699806, "rewards/margins": 0.001389974495396018, "rewards/margins_max": 0.0028195646591484547, "rewards/margins_min": -3.9615635614609346e-05, "rewards/margins_std": 0.002021745778620243, "rewards/rejected": -0.001245050341822207, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.33984375, "learning_rate": 6.28930817610063e-07, "logits/chosen": -1.458979845046997, "logits/rejected": -1.1576180458068848, "logps/chosen": -225.39303588867188, "logps/rejected": -276.84759521484375, "loss": 0.6927, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0010745488107204437, "rewards/margins": 0.0014787310501560569, "rewards/margins_max": 0.0033444215077906847, "rewards/margins_min": -0.00038695911644026637, "rewards/margins_std": 0.0026384841185063124, "rewards/rejected": -0.000404182355850935, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.515625, "learning_rate": 7.861635220125787e-07, "logits/chosen": -1.367509126663208, "logits/rejected": -0.8635444641113281, "logps/chosen": -331.5768127441406, "logps/rejected": -205.92117309570312, "loss": 0.6921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0009096821886487305, "rewards/margins": 0.001458239508792758, "rewards/margins_max": 0.0032467518467456102, "rewards/margins_min": -0.0003302727418486029, "rewards/margins_std": 0.0025293382350355387, "rewards/rejected": -0.0005485572619363666, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.453125, "learning_rate": 9.433962264150944e-07, "logits/chosen": -1.1953330039978027, "logits/rejected": -1.012205958366394, "logps/chosen": -203.44515991210938, "logps/rejected": -264.71368408203125, "loss": 0.6919, "rewards/accuracies": 0.875, "rewards/chosen": 0.003049404127523303, "rewards/margins": 0.003112142439931631, "rewards/margins_max": 0.004920116625726223, "rewards/margins_min": 0.0013041686033830047, "rewards/margins_std": 0.0025568611454218626, "rewards/rejected": -6.273845065152273e-05, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.4453125, "learning_rate": 1.1006289308176102e-06, "logits/chosen": -1.4164549112319946, "logits/rejected": -1.0944883823394775, "logps/chosen": -218.58349609375, "logps/rejected": -225.0066375732422, "loss": 0.6909, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0034692329354584217, "rewards/margins": 0.003931135404855013, "rewards/margins_max": 0.006383487489074469, "rewards/margins_min": 0.0014787826221436262, "rewards/margins_std": 0.003468149807304144, "rewards/rejected": -0.00046190261491574347, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.51953125, "learning_rate": 1.257861635220126e-06, "logits/chosen": -1.273780345916748, "logits/rejected": -0.994576096534729, "logps/chosen": -284.58660888671875, "logps/rejected": -266.48455810546875, "loss": 0.6898, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0061839548870921135, "rewards/margins": 0.006205023266375065, "rewards/margins_max": 0.008886772207915783, "rewards/margins_min": 0.0035232752561569214, "rewards/margins_std": 0.0037925648503005505, "rewards/rejected": -2.1068564819870517e-05, "step": 80 }, { "epoch": 0.03, "grad_norm": 0.59375, "learning_rate": 1.4150943396226415e-06, "logits/chosen": -1.4617016315460205, "logits/rejected": -1.171229600906372, "logps/chosen": -211.6915283203125, "logps/rejected": -219.3947296142578, "loss": 0.6895, "rewards/accuracies": 0.875, "rewards/chosen": 0.005883140489459038, "rewards/margins": 0.008139796555042267, "rewards/margins_max": 0.012691095471382141, "rewards/margins_min": 0.0035884971730411053, "rewards/margins_std": 0.006436510477215052, "rewards/rejected": -0.0022566565312445164, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.3359375, "learning_rate": 1.5723270440251573e-06, "logits/chosen": -1.4086381196975708, "logits/rejected": -0.9039764404296875, "logps/chosen": -256.61712646484375, "logps/rejected": -205.50588989257812, "loss": 0.6875, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.00955943949520588, "rewards/margins": 0.011472916230559349, "rewards/margins_max": 0.018265143036842346, "rewards/margins_min": 0.004680688492953777, "rewards/margins_std": 0.009605659171938896, "rewards/rejected": -0.0019134767353534698, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.390625, "learning_rate": 1.7295597484276729e-06, "logits/chosen": -1.2530136108398438, "logits/rejected": -0.9792757034301758, "logps/chosen": -229.84579467773438, "logps/rejected": -190.1811065673828, "loss": 0.6862, "rewards/accuracies": 0.875, "rewards/chosen": 0.009367749094963074, "rewards/margins": 0.01274518109858036, "rewards/margins_max": 0.01925867795944214, "rewards/margins_min": 0.006231681443750858, "rewards/margins_std": 0.009211478754878044, "rewards/rejected": -0.00337742967531085, "step": 110 }, { "epoch": 0.04, "grad_norm": 0.44921875, "learning_rate": 1.8867924528301889e-06, "logits/chosen": -1.4178739786148071, "logits/rejected": -1.0519492626190186, "logps/chosen": -194.27273559570312, "logps/rejected": -186.27613830566406, "loss": 0.6861, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.01265544630587101, "rewards/margins": 0.014012080617249012, "rewards/margins_max": 0.020701151341199875, "rewards/margins_min": 0.007323009427636862, "rewards/margins_std": 0.009459775872528553, "rewards/rejected": -0.0013566340785473585, "step": 120 }, { "epoch": 0.04, "grad_norm": 0.578125, "learning_rate": 2.044025157232705e-06, "logits/chosen": -1.180983066558838, "logits/rejected": -0.9479328989982605, "logps/chosen": -218.3427734375, "logps/rejected": -249.2072296142578, "loss": 0.6837, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.013748427852988243, "rewards/margins": 0.01925661601126194, "rewards/margins_max": 0.027199331670999527, "rewards/margins_min": 0.011313898488879204, "rewards/margins_std": 0.011232697404921055, "rewards/rejected": -0.005508188158273697, "step": 130 }, { "epoch": 0.04, "grad_norm": 0.416015625, "learning_rate": 2.2012578616352204e-06, "logits/chosen": -1.189428448677063, "logits/rejected": -0.9659063220024109, "logps/chosen": -266.5777282714844, "logps/rejected": -217.1243438720703, "loss": 0.6828, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.012634982354938984, "rewards/margins": 0.019540421664714813, "rewards/margins_max": 0.028719156980514526, "rewards/margins_min": 0.0103616863489151, "rewards/margins_std": 0.012980693951249123, "rewards/rejected": -0.006905439309775829, "step": 140 }, { "epoch": 0.05, "grad_norm": 0.314453125, "learning_rate": 2.358490566037736e-06, "logits/chosen": -1.3495254516601562, "logits/rejected": -0.8819383382797241, "logps/chosen": -302.58099365234375, "logps/rejected": -235.6078643798828, "loss": 0.6779, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.025924265384674072, "rewards/margins": 0.03467489033937454, "rewards/margins_max": 0.04881124570965767, "rewards/margins_min": 0.02053852751851082, "rewards/margins_std": 0.019991829991340637, "rewards/rejected": -0.00875062309205532, "step": 150 }, { "epoch": 0.05, "grad_norm": 0.3515625, "learning_rate": 2.515723270440252e-06, "logits/chosen": -1.3711656332015991, "logits/rejected": -1.1784591674804688, "logps/chosen": -166.46961975097656, "logps/rejected": -229.8678436279297, "loss": 0.6772, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.023975688964128494, "rewards/margins": 0.03182069957256317, "rewards/margins_max": 0.047649938613176346, "rewards/margins_min": 0.015991469845175743, "rewards/margins_std": 0.022385913878679276, "rewards/rejected": -0.00784501526504755, "step": 160 }, { "epoch": 0.05, "grad_norm": 0.3359375, "learning_rate": 2.6729559748427675e-06, "logits/chosen": -1.2441256046295166, "logits/rejected": -0.9379784464836121, "logps/chosen": -218.8394775390625, "logps/rejected": -199.82815551757812, "loss": 0.6772, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.022251352667808533, "rewards/margins": 0.03415388613939285, "rewards/margins_max": 0.045127563178539276, "rewards/margins_min": 0.023180212825536728, "rewards/margins_std": 0.015519115142524242, "rewards/rejected": -0.01190253347158432, "step": 170 }, { "epoch": 0.06, "grad_norm": 0.578125, "learning_rate": 2.830188679245283e-06, "logits/chosen": -1.3883864879608154, "logits/rejected": -1.0542776584625244, "logps/chosen": -210.83740234375, "logps/rejected": -217.66787719726562, "loss": 0.6704, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.031140008941292763, "rewards/margins": 0.048059821128845215, "rewards/margins_max": 0.07035170495510101, "rewards/margins_min": 0.02576792798936367, "rewards/margins_std": 0.031525496393442154, "rewards/rejected": -0.016919810324907303, "step": 180 }, { "epoch": 0.06, "grad_norm": 0.435546875, "learning_rate": 2.987421383647799e-06, "logits/chosen": -1.2937629222869873, "logits/rejected": -1.0859944820404053, "logps/chosen": -193.8387451171875, "logps/rejected": -216.71841430664062, "loss": 0.6718, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030585434287786484, "rewards/margins": 0.03754722326993942, "rewards/margins_max": 0.055405668914318085, "rewards/margins_min": 0.019688773900270462, "rewards/margins_std": 0.025255661457777023, "rewards/rejected": -0.00696178525686264, "step": 190 }, { "epoch": 0.06, "grad_norm": 0.435546875, "learning_rate": 3.1446540880503146e-06, "logits/chosen": -1.3293890953063965, "logits/rejected": -1.1202318668365479, "logps/chosen": -217.463623046875, "logps/rejected": -208.9364013671875, "loss": 0.67, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.017296183854341507, "rewards/margins": 0.037150561809539795, "rewards/margins_max": 0.05749017000198364, "rewards/margins_min": 0.016810955479741096, "rewards/margins_std": 0.028764545917510986, "rewards/rejected": -0.01985437609255314, "step": 200 }, { "epoch": 0.07, "grad_norm": 0.40234375, "learning_rate": 3.30188679245283e-06, "logits/chosen": -1.446415662765503, "logits/rejected": -1.1044580936431885, "logps/chosen": -234.07064819335938, "logps/rejected": -254.47964477539062, "loss": 0.6676, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0392422117292881, "rewards/margins": 0.05013582855463028, "rewards/margins_max": 0.07770033180713654, "rewards/margins_min": 0.022571321576833725, "rewards/margins_std": 0.038982093334198, "rewards/rejected": -0.01089361310005188, "step": 210 }, { "epoch": 0.07, "grad_norm": 0.33984375, "learning_rate": 3.4591194968553458e-06, "logits/chosen": -1.411773681640625, "logits/rejected": -1.1437511444091797, "logps/chosen": -272.9713439941406, "logps/rejected": -201.37228393554688, "loss": 0.6661, "rewards/accuracies": 1.0, "rewards/chosen": 0.02809012308716774, "rewards/margins": 0.05969276279211044, "rewards/margins_max": 0.08930987864732742, "rewards/margins_min": 0.03007565811276436, "rewards/margins_std": 0.04188491404056549, "rewards/rejected": -0.031602635979652405, "step": 220 }, { "epoch": 0.07, "grad_norm": 0.66796875, "learning_rate": 3.6163522012578618e-06, "logits/chosen": -1.4335668087005615, "logits/rejected": -1.1354362964630127, "logps/chosen": -249.6646270751953, "logps/rejected": -205.4978485107422, "loss": 0.6569, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04378952831029892, "rewards/margins": 0.08125929534435272, "rewards/margins_max": 0.11855404078960419, "rewards/margins_min": 0.04396456480026245, "rewards/margins_std": 0.052742719650268555, "rewards/rejected": -0.0374697707593441, "step": 230 }, { "epoch": 0.08, "grad_norm": 0.5234375, "learning_rate": 3.7735849056603777e-06, "logits/chosen": -1.422411561012268, "logits/rejected": -1.2927656173706055, "logps/chosen": -172.6126708984375, "logps/rejected": -264.3357849121094, "loss": 0.652, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04493634030222893, "rewards/margins": 0.08786293864250183, "rewards/margins_max": 0.12235504388809204, "rewards/margins_min": 0.053370825946331024, "rewards/margins_std": 0.0487792082130909, "rewards/rejected": -0.042926594614982605, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.435546875, "learning_rate": 3.930817610062894e-06, "logits/chosen": -1.4506088495254517, "logits/rejected": -1.2010631561279297, "logps/chosen": -260.5227966308594, "logps/rejected": -221.35153198242188, "loss": 0.6519, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.052806317806243896, "rewards/margins": 0.09446804225444794, "rewards/margins_max": 0.13791503012180328, "rewards/margins_min": 0.05102104693651199, "rewards/margins_std": 0.061443328857421875, "rewards/rejected": -0.041661716997623444, "step": 250 }, { "epoch": 0.08, "grad_norm": 0.423828125, "learning_rate": 4.08805031446541e-06, "logits/chosen": -1.2909493446350098, "logits/rejected": -0.8009993433952332, "logps/chosen": -277.853271484375, "logps/rejected": -263.16009521484375, "loss": 0.646, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.06023542210459709, "rewards/margins": 0.1075894683599472, "rewards/margins_max": 0.14478802680969238, "rewards/margins_min": 0.07039090245962143, "rewards/margins_std": 0.05260671302676201, "rewards/rejected": -0.047354042530059814, "step": 260 }, { "epoch": 0.09, "grad_norm": 0.435546875, "learning_rate": 4.245283018867925e-06, "logits/chosen": -1.4090592861175537, "logits/rejected": -0.8552477955818176, "logps/chosen": -254.5354766845703, "logps/rejected": -248.50057983398438, "loss": 0.6336, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.06402402371168137, "rewards/margins": 0.1346406638622284, "rewards/margins_max": 0.18887916207313538, "rewards/margins_min": 0.08040215820074081, "rewards/margins_std": 0.07670482993125916, "rewards/rejected": -0.07061664760112762, "step": 270 }, { "epoch": 0.09, "grad_norm": 0.42578125, "learning_rate": 4.402515723270441e-06, "logits/chosen": -1.5802761316299438, "logits/rejected": -1.236020565032959, "logps/chosen": -191.21524047851562, "logps/rejected": -195.28176879882812, "loss": 0.646, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04768265783786774, "rewards/margins": 0.0977226123213768, "rewards/margins_max": 0.14488402009010315, "rewards/margins_min": 0.05056120082736015, "rewards/margins_std": 0.06669630855321884, "rewards/rejected": -0.050039954483509064, "step": 280 }, { "epoch": 0.09, "grad_norm": 0.35546875, "learning_rate": 4.559748427672957e-06, "logits/chosen": -1.5027745962142944, "logits/rejected": -1.0609099864959717, "logps/chosen": -237.45034790039062, "logps/rejected": -216.4964141845703, "loss": 0.6271, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05908682197332382, "rewards/margins": 0.14679358899593353, "rewards/margins_max": 0.21701796352863312, "rewards/margins_min": 0.07656919956207275, "rewards/margins_std": 0.09931226819753647, "rewards/rejected": -0.08770675212144852, "step": 290 }, { "epoch": 0.09, "grad_norm": 0.431640625, "learning_rate": 4.716981132075472e-06, "logits/chosen": -1.6774566173553467, "logits/rejected": -1.2361747026443481, "logps/chosen": -220.8132781982422, "logps/rejected": -220.8450469970703, "loss": 0.6144, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0774054005742073, "rewards/margins": 0.17351576685905457, "rewards/margins_max": 0.25278240442276, "rewards/margins_min": 0.09424915909767151, "rewards/margins_std": 0.11209992319345474, "rewards/rejected": -0.09611036628484726, "step": 300 }, { "epoch": 0.1, "grad_norm": 0.9453125, "learning_rate": 4.874213836477988e-06, "logits/chosen": -1.428739309310913, "logits/rejected": -1.1747839450836182, "logps/chosen": -194.78067016601562, "logps/rejected": -227.9475555419922, "loss": 0.6065, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.060627926141023636, "rewards/margins": 0.17353899776935577, "rewards/margins_max": 0.2618575692176819, "rewards/margins_min": 0.08522041887044907, "rewards/margins_std": 0.12490131705999374, "rewards/rejected": -0.11291106045246124, "step": 310 }, { "epoch": 0.1, "grad_norm": 0.6171875, "learning_rate": 4.999993950030735e-06, "logits/chosen": -1.4527971744537354, "logits/rejected": -1.1030164957046509, "logps/chosen": -248.6017608642578, "logps/rejected": -257.7789001464844, "loss": 0.597, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.023475894704461098, "rewards/margins": 0.2308409959077835, "rewards/margins_max": 0.37093719840049744, "rewards/margins_min": 0.09074478596448898, "rewards/margins_std": 0.1981259286403656, "rewards/rejected": -0.20736508071422577, "step": 320 }, { "epoch": 0.1, "grad_norm": 0.546875, "learning_rate": 4.999782204181027e-06, "logits/chosen": -1.5599935054779053, "logits/rejected": -1.0608078241348267, "logps/chosen": -236.848876953125, "logps/rejected": -287.7281188964844, "loss": 0.5746, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.042720407247543335, "rewards/margins": 0.3582765460014343, "rewards/margins_max": 0.6046980619430542, "rewards/margins_min": 0.11185508966445923, "rewards/margins_std": 0.3484925925731659, "rewards/rejected": -0.315556138753891, "step": 330 }, { "epoch": 0.11, "grad_norm": 0.55078125, "learning_rate": 4.99926798914914e-06, "logits/chosen": -1.3914912939071655, "logits/rejected": -1.0357228517532349, "logps/chosen": -183.1966552734375, "logps/rejected": -208.01705932617188, "loss": 0.5743, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.006871527526527643, "rewards/margins": 0.2630093991756439, "rewards/margins_max": 0.37292978167533875, "rewards/margins_min": 0.1530890315771103, "rewards/margins_std": 0.15545089542865753, "rewards/rejected": -0.2698809504508972, "step": 340 }, { "epoch": 0.11, "grad_norm": 0.765625, "learning_rate": 4.9984513671541735e-06, "logits/chosen": -1.4279831647872925, "logits/rejected": -0.9789050817489624, "logps/chosen": -265.3249206542969, "logps/rejected": -276.2162780761719, "loss": 0.54, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.011723552830517292, "rewards/margins": 0.3322909474372864, "rewards/margins_max": 0.561407208442688, "rewards/margins_min": 0.10317464172840118, "rewards/margins_std": 0.3240194022655487, "rewards/rejected": -0.3440144658088684, "step": 350 }, { "epoch": 0.11, "grad_norm": 0.640625, "learning_rate": 4.997332437005932e-06, "logits/chosen": -1.7668163776397705, "logits/rejected": -1.3288036584854126, "logps/chosen": -282.70648193359375, "logps/rejected": -300.6052551269531, "loss": 0.5082, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.04692164808511734, "rewards/margins": 0.7383378744125366, "rewards/margins_max": 1.2071880102157593, "rewards/margins_min": 0.26948752999305725, "rewards/margins_std": 0.663054347038269, "rewards/rejected": -0.7852594256401062, "step": 360 }, { "epoch": 0.12, "grad_norm": 0.62109375, "learning_rate": 4.995911334092963e-06, "logits/chosen": -1.5333877801895142, "logits/rejected": -1.1322664022445679, "logps/chosen": -240.31625366210938, "logps/rejected": -247.5854034423828, "loss": 0.4881, "rewards/accuracies": 1.0, "rewards/chosen": -0.0783635601401329, "rewards/margins": 0.6869125366210938, "rewards/margins_max": 1.07400643825531, "rewards/margins_min": 0.2998184263706207, "rewards/margins_std": 0.547433614730835, "rewards/rejected": -0.7652760744094849, "step": 370 }, { "epoch": 0.12, "grad_norm": 0.58984375, "learning_rate": 4.994188230366184e-06, "logits/chosen": -1.4324274063110352, "logits/rejected": -1.1852810382843018, "logps/chosen": -240.2544708251953, "logps/rejected": -266.61505126953125, "loss": 0.4838, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10970436036586761, "rewards/margins": 0.7477412223815918, "rewards/margins_max": 1.18809974193573, "rewards/margins_min": 0.3073826730251312, "rewards/margins_std": 0.6227611303329468, "rewards/rejected": -0.8574455976486206, "step": 380 }, { "epoch": 0.12, "grad_norm": 0.75390625, "learning_rate": 4.9921633343180655e-06, "logits/chosen": -1.3936595916748047, "logits/rejected": -0.9535585641860962, "logps/chosen": -293.89422607421875, "logps/rejected": -426.3384704589844, "loss": 0.4581, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2569817900657654, "rewards/margins": 1.8755642175674438, "rewards/margins_max": 3.503901720046997, "rewards/margins_min": 0.24722722172737122, "rewards/margins_std": 2.302816152572632, "rewards/rejected": -2.1325459480285645, "step": 390 }, { "epoch": 0.13, "grad_norm": 0.6875, "learning_rate": 4.989836890957415e-06, "logits/chosen": -1.4002983570098877, "logits/rejected": -1.0570244789123535, "logps/chosen": -233.2489471435547, "logps/rejected": -288.95269775390625, "loss": 0.4605, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.22145280241966248, "rewards/margins": 0.6735883355140686, "rewards/margins_max": 1.1278815269470215, "rewards/margins_min": 0.2192951738834381, "rewards/margins_std": 0.6424675583839417, "rewards/rejected": -0.895041286945343, "step": 400 }, { "epoch": 0.13, "grad_norm": 2.265625, "learning_rate": 4.987209181779722e-06, "logits/chosen": -1.5520060062408447, "logits/rejected": -1.2624719142913818, "logps/chosen": -232.98684692382812, "logps/rejected": -404.1191711425781, "loss": 0.4523, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.3905092179775238, "rewards/margins": 1.8865032196044922, "rewards/margins_max": 3.478367328643799, "rewards/margins_min": 0.2946400046348572, "rewards/margins_std": 2.251234769821167, "rewards/rejected": -2.277012586593628, "step": 410 }, { "epoch": 0.13, "grad_norm": 1.734375, "learning_rate": 4.984280524733107e-06, "logits/chosen": -1.4704313278198242, "logits/rejected": -1.0447412729263306, "logps/chosen": -309.2225036621094, "logps/rejected": -504.7037048339844, "loss": 0.3563, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5077094435691833, "rewards/margins": 2.0941860675811768, "rewards/margins_max": 3.6239044666290283, "rewards/margins_min": 0.5644680261611938, "rewards/margins_std": 2.1633481979370117, "rewards/rejected": -2.601895332336426, "step": 420 }, { "epoch": 0.14, "grad_norm": 1.5546875, "learning_rate": 4.98105127417984e-06, "logits/chosen": -1.4012444019317627, "logits/rejected": -1.0951766967773438, "logps/chosen": -325.9887390136719, "logps/rejected": -676.073974609375, "loss": 0.4123, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.6459981799125671, "rewards/margins": 3.602614164352417, "rewards/margins_max": 7.004115104675293, "rewards/margins_min": 0.2011127471923828, "rewards/margins_std": 4.810449600219727, "rewards/rejected": -4.248612403869629, "step": 430 }, { "epoch": 0.14, "grad_norm": 1.0703125, "learning_rate": 4.9775218208534706e-06, "logits/chosen": -1.5487967729568481, "logits/rejected": -1.0010716915130615, "logps/chosen": -279.5751037597656, "logps/rejected": -468.4371032714844, "loss": 0.3252, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7016158699989319, "rewards/margins": 1.9516023397445679, "rewards/margins_max": 3.2723541259765625, "rewards/margins_min": 0.6308507919311523, "rewards/margins_std": 1.8678247928619385, "rewards/rejected": -2.6532185077667236, "step": 440 }, { "epoch": 0.14, "grad_norm": 1.1640625, "learning_rate": 4.973692591811549e-06, "logits/chosen": -1.3033084869384766, "logits/rejected": -1.1313459873199463, "logps/chosen": -299.37701416015625, "logps/rejected": -767.0716552734375, "loss": 0.3568, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8542273640632629, "rewards/margins": 4.509963035583496, "rewards/margins_max": 7.8178558349609375, "rewards/margins_min": 1.2020692825317383, "rewards/margins_std": 4.678068161010742, "rewards/rejected": -5.364189624786377, "step": 450 }, { "epoch": 0.14, "grad_norm": 0.66796875, "learning_rate": 4.9695640503839495e-06, "logits/chosen": -1.2340087890625, "logits/rejected": -0.912117600440979, "logps/chosen": -280.4734802246094, "logps/rejected": -791.8204345703125, "loss": 0.2803, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.067180871963501, "rewards/margins": 4.743367671966553, "rewards/margins_max": 8.377991676330566, "rewards/margins_min": 1.1087446212768555, "rewards/margins_std": 5.140133857727051, "rewards/rejected": -5.810548782348633, "step": 460 }, { "epoch": 0.15, "grad_norm": 0.77734375, "learning_rate": 4.965136696116812e-06, "logits/chosen": -1.3413738012313843, "logits/rejected": -0.9947126507759094, "logps/chosen": -369.73773193359375, "logps/rejected": -1150.2530517578125, "loss": 0.2378, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.5380936861038208, "rewards/margins": 7.368071556091309, "rewards/margins_max": 13.355749130249023, "rewards/margins_min": 1.380393385887146, "rewards/margins_std": 8.467855453491211, "rewards/rejected": -8.90616512298584, "step": 470 }, { "epoch": 0.15, "grad_norm": 0.51953125, "learning_rate": 4.960411064712095e-06, "logits/chosen": -1.274917721748352, "logits/rejected": -0.9616276025772095, "logps/chosen": -348.9794616699219, "logps/rejected": -1353.219482421875, "loss": 0.3372, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6370916366577148, "rewards/margins": 9.717118263244629, "rewards/margins_max": 17.675342559814453, "rewards/margins_min": 1.758894681930542, "rewards/margins_std": 11.254626274108887, "rewards/rejected": -11.354207992553711, "step": 480 }, { "epoch": 0.15, "grad_norm": 1.21875, "learning_rate": 4.95538772796276e-06, "logits/chosen": -1.451219916343689, "logits/rejected": -1.2027828693389893, "logps/chosen": -529.2562866210938, "logps/rejected": -1177.658203125, "loss": 1.0878, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.522387742996216, "rewards/margins": 6.5533647537231445, "rewards/margins_max": 15.11187744140625, "rewards/margins_min": -2.0051493644714355, "rewards/margins_std": 12.103565216064453, "rewards/rejected": -10.075750350952148, "step": 490 }, { "epoch": 0.16, "grad_norm": 1.6640625, "learning_rate": 4.95006729368358e-06, "logits/chosen": -1.532649040222168, "logits/rejected": -1.1228234767913818, "logps/chosen": -519.6512451171875, "logps/rejected": -1331.0703125, "loss": 0.4919, "rewards/accuracies": 0.875, "rewards/chosen": -3.026210308074951, "rewards/margins": 8.242622375488281, "rewards/margins_max": 15.09514045715332, "rewards/margins_min": 1.3901066780090332, "rewards/margins_std": 9.690921783447266, "rewards/rejected": -11.26883316040039, "step": 500 }, { "epoch": 0.16, "grad_norm": 14.1875, "learning_rate": 4.944450405637603e-06, "logits/chosen": -1.340835452079773, "logits/rejected": -1.0451310873031616, "logps/chosen": -471.9386291503906, "logps/rejected": -1201.742431640625, "loss": 0.5528, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.621904134750366, "rewards/margins": 7.457037448883057, "rewards/margins_max": 13.327784538269043, "rewards/margins_min": 1.5862904787063599, "rewards/margins_std": 8.302489280700684, "rewards/rejected": -10.078941345214844, "step": 510 }, { "epoch": 0.16, "grad_norm": 2.3125, "learning_rate": 4.938537743458248e-06, "logits/chosen": -1.366683006286621, "logits/rejected": -1.0551526546478271, "logps/chosen": -337.539794921875, "logps/rejected": -1212.2384033203125, "loss": 0.389, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.5652153491973877, "rewards/margins": 8.746683120727539, "rewards/margins_max": 16.385772705078125, "rewards/margins_min": 1.1075944900512695, "rewards/margins_std": 10.803304672241211, "rewards/rejected": -10.311899185180664, "step": 520 }, { "epoch": 0.17, "grad_norm": 2.234375, "learning_rate": 4.932330022567083e-06, "logits/chosen": -1.3100754022598267, "logits/rejected": -1.0806552171707153, "logps/chosen": -364.4188537597656, "logps/rejected": -860.2052001953125, "loss": 0.2295, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.5020630359649658, "rewards/margins": 5.068886756896973, "rewards/margins_max": 7.5261735916137695, "rewards/margins_min": 2.611598491668701, "rewards/margins_std": 3.4751293659210205, "rewards/rejected": -6.570949554443359, "step": 530 }, { "epoch": 0.17, "grad_norm": 2.015625, "learning_rate": 4.925827994087245e-06, "logits/chosen": -1.4696404933929443, "logits/rejected": -0.9993557929992676, "logps/chosen": -405.4161682128906, "logps/rejected": -894.1640625, "loss": 0.3182, "rewards/accuracies": 1.0, "rewards/chosen": -2.0658953189849854, "rewards/margins": 4.750572204589844, "rewards/margins_max": 8.305654525756836, "rewards/margins_min": 1.195489764213562, "rewards/margins_std": 5.027646064758301, "rewards/rejected": -6.81646728515625, "step": 540 }, { "epoch": 0.17, "grad_norm": 1.8046875, "learning_rate": 4.91903244475257e-06, "logits/chosen": -1.4830572605133057, "logits/rejected": -1.27045738697052, "logps/chosen": -477.55389404296875, "logps/rejected": -1366.124267578125, "loss": 0.3149, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.4173495769500732, "rewards/margins": 9.214771270751953, "rewards/margins_max": 13.362762451171875, "rewards/margins_min": 5.066779613494873, "rewards/margins_std": 5.866146087646484, "rewards/rejected": -11.632120132446289, "step": 550 }, { "epoch": 0.18, "grad_norm": 1.5078125, "learning_rate": 4.911944196812391e-06, "logits/chosen": -1.428005576133728, "logits/rejected": -1.153662919998169, "logps/chosen": -409.9786071777344, "logps/rejected": -1460.192626953125, "loss": 0.5181, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.0635440349578857, "rewards/margins": 10.500307083129883, "rewards/margins_max": 19.511022567749023, "rewards/margins_min": 1.4895923137664795, "rewards/margins_std": 12.743075370788574, "rewards/rejected": -12.563852310180664, "step": 560 }, { "epoch": 0.18, "grad_norm": 0.70703125, "learning_rate": 4.904564107932048e-06, "logits/chosen": -1.2749096155166626, "logits/rejected": -0.9493370056152344, "logps/chosen": -543.0249633789062, "logps/rejected": -1391.12158203125, "loss": 0.2271, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.694972515106201, "rewards/margins": 8.827417373657227, "rewards/margins_max": 16.37221336364746, "rewards/margins_min": 1.2826213836669922, "rewards/margins_std": 10.669951438903809, "rewards/rejected": -11.522390365600586, "step": 570 }, { "epoch": 0.18, "grad_norm": 0.875, "learning_rate": 4.896893071089116e-06, "logits/chosen": -1.375417947769165, "logits/rejected": -1.1153507232666016, "logps/chosen": -440.833984375, "logps/rejected": -1427.0830078125, "loss": 0.1884, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.082918643951416, "rewards/margins": 9.738504409790039, "rewards/margins_max": 14.975610733032227, "rewards/margins_min": 4.501399517059326, "rewards/margins_std": 7.4063849449157715, "rewards/rejected": -11.821423530578613, "step": 580 }, { "epoch": 0.19, "grad_norm": 0.83984375, "learning_rate": 4.8889320144653525e-06, "logits/chosen": -1.3201481103897095, "logits/rejected": -1.0787646770477295, "logps/chosen": -460.324951171875, "logps/rejected": -1797.281982421875, "loss": 0.1839, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.4998037815093994, "rewards/margins": 13.246177673339844, "rewards/margins_max": 23.318471908569336, "rewards/margins_min": 3.1738815307617188, "rewards/margins_std": 14.244379043579102, "rewards/rejected": -15.74598217010498, "step": 590 }, { "epoch": 0.19, "grad_norm": 1.390625, "learning_rate": 4.88068190133439e-06, "logits/chosen": -1.4633710384368896, "logits/rejected": -1.2496354579925537, "logps/chosen": -714.4942016601562, "logps/rejected": -2540.6845703125, "loss": 0.3476, "rewards/accuracies": 0.875, "rewards/chosen": -4.186011791229248, "rewards/margins": 18.637645721435547, "rewards/margins_max": 33.042198181152344, "rewards/margins_min": 4.233096122741699, "rewards/margins_std": 20.371110916137695, "rewards/rejected": -22.823659896850586, "step": 600 }, { "epoch": 0.19, "grad_norm": 1.21875, "learning_rate": 4.872143729945185e-06, "logits/chosen": -1.1824166774749756, "logits/rejected": -0.8578090667724609, "logps/chosen": -540.1989135742188, "logps/rejected": -1778.9107666015625, "loss": 0.2531, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.1951847076416016, "rewards/margins": 12.686628341674805, "rewards/margins_max": 22.70194435119629, "rewards/margins_min": 2.6713125705718994, "rewards/margins_std": 14.163797378540039, "rewards/rejected": -15.881813049316406, "step": 610 }, { "epoch": 0.2, "grad_norm": 0.85546875, "learning_rate": 4.863318533401224e-06, "logits/chosen": -1.3596642017364502, "logits/rejected": -1.0070207118988037, "logps/chosen": -572.4644775390625, "logps/rejected": -1767.1966552734375, "loss": 0.2573, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.2376201152801514, "rewards/margins": 11.738923072814941, "rewards/margins_max": 19.70199966430664, "rewards/margins_min": 3.7758469581604004, "rewards/margins_std": 11.261490821838379, "rewards/rejected": -14.976544380187988, "step": 620 }, { "epoch": 0.2, "grad_norm": 0.64453125, "learning_rate": 4.85420737953553e-06, "logits/chosen": -1.5254818201065063, "logits/rejected": -1.1172149181365967, "logps/chosen": -552.1317138671875, "logps/rejected": -1063.9925537109375, "loss": 0.491, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.961021900177002, "rewards/margins": 5.317422389984131, "rewards/margins_max": 10.766054153442383, "rewards/margins_min": -0.13120803236961365, "rewards/margins_std": 7.705528259277344, "rewards/rejected": -8.27844524383545, "step": 630 }, { "epoch": 0.2, "grad_norm": 1.046875, "learning_rate": 4.844811370781446e-06, "logits/chosen": -1.489282250404358, "logits/rejected": -1.1010894775390625, "logps/chosen": -437.63519287109375, "logps/rejected": -1001.1755981445312, "loss": 0.3419, "rewards/accuracies": 0.875, "rewards/chosen": -1.8960500955581665, "rewards/margins": 5.868886470794678, "rewards/margins_max": 9.640433311462402, "rewards/margins_min": 2.0973384380340576, "rewards/margins_std": 5.333773612976074, "rewards/rejected": -7.7649359703063965, "step": 640 }, { "epoch": 0.2, "grad_norm": 1.03125, "learning_rate": 4.835131644039251e-06, "logits/chosen": -1.585998296737671, "logits/rejected": -1.0765646696090698, "logps/chosen": -624.9088134765625, "logps/rejected": -1340.5115966796875, "loss": 0.2811, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.8306846618652344, "rewards/margins": 8.347230911254883, "rewards/margins_max": 14.0307035446167, "rewards/margins_min": 2.6637587547302246, "rewards/margins_std": 8.037643432617188, "rewards/rejected": -11.177915573120117, "step": 650 }, { "epoch": 0.21, "grad_norm": 2.6875, "learning_rate": 4.825169370538595e-06, "logits/chosen": -1.3587524890899658, "logits/rejected": -1.200235366821289, "logps/chosen": -486.85107421875, "logps/rejected": -1671.8179931640625, "loss": 0.4252, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.5282440185546875, "rewards/margins": 11.739642143249512, "rewards/margins_max": 22.0147705078125, "rewards/margins_min": 1.4645134210586548, "rewards/margins_std": 14.531225204467773, "rewards/rejected": -14.2678861618042, "step": 660 }, { "epoch": 0.21, "grad_norm": 2.140625, "learning_rate": 4.8149257556967776e-06, "logits/chosen": -1.5342085361480713, "logits/rejected": -1.1026307344436646, "logps/chosen": -690.6807861328125, "logps/rejected": -1562.658935546875, "loss": 0.4338, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.9973278045654297, "rewards/margins": 9.116189956665039, "rewards/margins_max": 16.03341293334961, "rewards/margins_min": 2.198969841003418, "rewards/margins_std": 9.782427787780762, "rewards/rejected": -13.113519668579102, "step": 670 }, { "epoch": 0.21, "grad_norm": 2.796875, "learning_rate": 4.8044020389729e-06, "logits/chosen": -1.4870555400848389, "logits/rejected": -1.135543704032898, "logps/chosen": -645.024658203125, "logps/rejected": -1609.5645751953125, "loss": 0.2779, "rewards/accuracies": 0.875, "rewards/chosen": -3.703730821609497, "rewards/margins": 9.667627334594727, "rewards/margins_max": 17.878780364990234, "rewards/margins_min": 1.456475853919983, "rewards/margins_std": 11.612321853637695, "rewards/rejected": -13.371357917785645, "step": 680 }, { "epoch": 0.22, "grad_norm": 2.859375, "learning_rate": 4.793599493717891e-06, "logits/chosen": -1.3815220594406128, "logits/rejected": -1.0400980710983276, "logps/chosen": -551.9547119140625, "logps/rejected": -1804.0220947265625, "loss": 0.4267, "rewards/accuracies": 0.875, "rewards/chosen": -2.923583745956421, "rewards/margins": 12.87585163116455, "rewards/margins_max": 23.543264389038086, "rewards/margins_min": 2.208437919616699, "rewards/margins_std": 15.086000442504883, "rewards/rejected": -15.79943561553955, "step": 690 }, { "epoch": 0.22, "grad_norm": 5.34375, "learning_rate": 4.782519427020434e-06, "logits/chosen": -1.2928307056427002, "logits/rejected": -1.0188380479812622, "logps/chosen": -466.04290771484375, "logps/rejected": -1075.211181640625, "loss": 0.4064, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.593552350997925, "rewards/margins": 5.988112449645996, "rewards/margins_max": 10.20189094543457, "rewards/margins_min": 1.7743346691131592, "rewards/margins_std": 5.959182262420654, "rewards/rejected": -8.5816650390625, "step": 700 }, { "epoch": 0.22, "grad_norm": 3.03125, "learning_rate": 4.771163179548809e-06, "logits/chosen": -1.4908405542373657, "logits/rejected": -1.0522329807281494, "logps/chosen": -562.4791870117188, "logps/rejected": -1309.4808349609375, "loss": 0.1415, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.262165069580078, "rewards/margins": 8.532692909240723, "rewards/margins_max": 13.401638984680176, "rewards/margins_min": 3.6637446880340576, "rewards/margins_std": 6.8857316970825195, "rewards/rejected": -10.7948579788208, "step": 710 }, { "epoch": 0.23, "grad_norm": 0.29296875, "learning_rate": 4.759532125388681e-06, "logits/chosen": -1.2453266382217407, "logits/rejected": -0.9683879613876343, "logps/chosen": -486.449462890625, "logps/rejected": -1878.889892578125, "loss": 0.2135, "rewards/accuracies": 0.875, "rewards/chosen": -2.1956615447998047, "rewards/margins": 14.269218444824219, "rewards/margins_max": 24.373987197875977, "rewards/margins_min": 4.164450168609619, "rewards/margins_std": 14.290300369262695, "rewards/rejected": -16.464879989624023, "step": 720 }, { "epoch": 0.23, "grad_norm": 4.3125, "learning_rate": 4.747627671876829e-06, "logits/chosen": -1.5221184492111206, "logits/rejected": -1.213921308517456, "logps/chosen": -411.44805908203125, "logps/rejected": -1289.133056640625, "loss": 0.2008, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.027691125869751, "rewards/margins": 8.689915657043457, "rewards/margins_max": 13.808636665344238, "rewards/margins_min": 3.5711944103240967, "rewards/margins_std": 7.2389655113220215, "rewards/rejected": -10.717607498168945, "step": 730 }, { "epoch": 0.23, "grad_norm": 1.390625, "learning_rate": 4.735451259430866e-06, "logits/chosen": -1.4237302541732788, "logits/rejected": -1.3012243509292603, "logps/chosen": -456.906982421875, "logps/rejected": -1476.89453125, "loss": 0.2935, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6124205589294434, "rewards/margins": 10.188555717468262, "rewards/margins_max": 16.66933250427246, "rewards/margins_min": 3.707778215408325, "rewards/margins_std": 9.165203094482422, "rewards/rejected": -12.80097484588623, "step": 740 }, { "epoch": 0.24, "grad_norm": 8.5625, "learning_rate": 4.723004361374953e-06, "logits/chosen": -1.3836435079574585, "logits/rejected": -1.198193907737732, "logps/chosen": -473.876220703125, "logps/rejected": -1625.8421630859375, "loss": 0.2865, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.4213483333587646, "rewards/margins": 11.857926368713379, "rewards/margins_max": 21.100528717041016, "rewards/margins_min": 2.615323543548584, "rewards/margins_std": 13.071016311645508, "rewards/rejected": -14.279275894165039, "step": 750 }, { "epoch": 0.24, "grad_norm": 10.4375, "learning_rate": 4.710288483761524e-06, "logits/chosen": -1.2236430644989014, "logits/rejected": -0.9467355012893677, "logps/chosen": -524.3106689453125, "logps/rejected": -1654.451416015625, "loss": 0.3292, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.8375871181488037, "rewards/margins": 11.601432800292969, "rewards/margins_max": 20.039447784423828, "rewards/margins_min": 3.1634178161621094, "rewards/margins_std": 11.933156967163086, "rewards/rejected": -14.439020156860352, "step": 760 }, { "epoch": 0.24, "grad_norm": 5.3125, "learning_rate": 4.697305165189062e-06, "logits/chosen": -1.4987045526504517, "logits/rejected": -1.1748701333999634, "logps/chosen": -520.372314453125, "logps/rejected": -1442.008056640625, "loss": 0.2521, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.8685879707336426, "rewards/margins": 9.27059268951416, "rewards/margins_max": 16.574522018432617, "rewards/margins_min": 1.9666624069213867, "rewards/margins_std": 10.329317092895508, "rewards/rejected": -12.139180183410645, "step": 770 }, { "epoch": 0.25, "grad_norm": 0.875, "learning_rate": 4.684055976615924e-06, "logits/chosen": -1.4547721147537231, "logits/rejected": -1.1212984323501587, "logps/chosen": -433.8109436035156, "logps/rejected": -1971.857177734375, "loss": 0.1387, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.043619394302368, "rewards/margins": 15.325085639953613, "rewards/margins_max": 24.84339714050293, "rewards/margins_min": 5.806773662567139, "rewards/margins_std": 13.460925102233887, "rewards/rejected": -17.368703842163086, "step": 780 }, { "epoch": 0.25, "grad_norm": 33.75, "learning_rate": 4.670542521170266e-06, "logits/chosen": -1.5136555433273315, "logits/rejected": -1.2549117803573608, "logps/chosen": -411.322509765625, "logps/rejected": -1319.593994140625, "loss": 0.3371, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.360672950744629, "rewards/margins": 8.951146125793457, "rewards/margins_max": 15.15422534942627, "rewards/margins_min": 2.7480692863464355, "rewards/margins_std": 8.772477149963379, "rewards/rejected": -11.311819076538086, "step": 790 }, { "epoch": 0.25, "grad_norm": 1.1015625, "learning_rate": 4.656766433956063e-06, "logits/chosen": -1.4924864768981934, "logits/rejected": -1.0543395280838013, "logps/chosen": -545.2259521484375, "logps/rejected": -1888.3046875, "loss": 0.1707, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9292566776275635, "rewards/margins": 13.785478591918945, "rewards/margins_max": 23.153963088989258, "rewards/margins_min": 4.4169921875, "rewards/margins_std": 13.249038696289062, "rewards/rejected": -16.71473503112793, "step": 800 }, { "epoch": 0.26, "grad_norm": 6.125, "learning_rate": 4.642729381855262e-06, "logits/chosen": -1.559021234512329, "logits/rejected": -1.1420828104019165, "logps/chosen": -477.46795654296875, "logps/rejected": -1219.603271484375, "loss": 0.3419, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.3907387256622314, "rewards/margins": 7.980464935302734, "rewards/margins_max": 14.719846725463867, "rewards/margins_min": 1.2410833835601807, "rewards/margins_std": 9.530924797058105, "rewards/rejected": -10.371204376220703, "step": 810 }, { "epoch": 0.26, "grad_norm": 14.5, "learning_rate": 4.6284330633261e-06, "logits/chosen": -1.3948744535446167, "logits/rejected": -1.1108922958374023, "logps/chosen": -548.1687622070312, "logps/rejected": -1504.393310546875, "loss": 0.3685, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.439661741256714, "rewards/margins": 9.631498336791992, "rewards/margins_max": 16.037883758544922, "rewards/margins_min": 3.225111484527588, "rewards/margins_std": 9.059998512268066, "rewards/rejected": -13.071161270141602, "step": 820 }, { "epoch": 0.26, "grad_norm": 2.328125, "learning_rate": 4.613879208197585e-06, "logits/chosen": -1.5194332599639893, "logits/rejected": -1.1743465662002563, "logps/chosen": -473.6636657714844, "logps/rejected": -1417.037841796875, "loss": 0.1998, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.4681811332702637, "rewards/margins": 9.82261848449707, "rewards/margins_max": 16.545597076416016, "rewards/margins_min": 3.099639415740967, "rewards/margins_std": 9.507728576660156, "rewards/rejected": -12.290799140930176, "step": 830 }, { "epoch": 0.26, "grad_norm": 1.03125, "learning_rate": 4.5990695774601945e-06, "logits/chosen": -1.529234766960144, "logits/rejected": -1.3274476528167725, "logps/chosen": -542.2566528320312, "logps/rejected": -1780.8724365234375, "loss": 0.1955, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.5734362602233887, "rewards/margins": 12.809934616088867, "rewards/margins_max": 22.373729705810547, "rewards/margins_min": 3.24613618850708, "rewards/margins_std": 13.525251388549805, "rewards/rejected": -15.383371353149414, "step": 840 }, { "epoch": 0.27, "grad_norm": 1.046875, "learning_rate": 4.584005963052799e-06, "logits/chosen": -1.6547048091888428, "logits/rejected": -1.295466423034668, "logps/chosen": -469.98529052734375, "logps/rejected": -1244.710205078125, "loss": 0.217, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6301162242889404, "rewards/margins": 7.640155792236328, "rewards/margins_max": 13.174588203430176, "rewards/margins_min": 2.1057231426239014, "rewards/margins_std": 7.826869964599609, "rewards/rejected": -10.270272254943848, "step": 850 }, { "epoch": 0.27, "grad_norm": 1.3515625, "learning_rate": 4.568690187645839e-06, "logits/chosen": -1.4834959506988525, "logits/rejected": -1.2343547344207764, "logps/chosen": -644.5606689453125, "logps/rejected": -1773.735107421875, "loss": 0.2402, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.305816650390625, "rewards/margins": 11.198797225952148, "rewards/margins_max": 19.111894607543945, "rewards/margins_min": 3.285700559616089, "rewards/margins_std": 11.19080924987793, "rewards/rejected": -15.504613876342773, "step": 860 }, { "epoch": 0.27, "grad_norm": 12.0, "learning_rate": 4.553124104420784e-06, "logits/chosen": -1.4060771465301514, "logits/rejected": -1.2946099042892456, "logps/chosen": -648.6322631835938, "logps/rejected": -2310.56787109375, "loss": 0.2248, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.409775257110596, "rewards/margins": 16.627094268798828, "rewards/margins_max": 29.272930145263672, "rewards/margins_min": 3.981257677078247, "rewards/margins_std": 17.883914947509766, "rewards/rejected": -21.036869049072266, "step": 870 }, { "epoch": 0.28, "grad_norm": 1.453125, "learning_rate": 4.537309596845906e-06, "logits/chosen": -1.5412516593933105, "logits/rejected": -1.3352028131484985, "logps/chosen": -582.1204223632812, "logps/rejected": -2280.7783203125, "loss": 0.1607, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.755678653717041, "rewards/margins": 17.23660659790039, "rewards/margins_max": 28.5272216796875, "rewards/margins_min": 5.945991516113281, "rewards/margins_std": 15.967341423034668, "rewards/rejected": -20.99228286743164, "step": 880 }, { "epoch": 0.28, "grad_norm": 2.15625, "learning_rate": 4.521248578448374e-06, "logits/chosen": -1.396698236465454, "logits/rejected": -1.3860348463058472, "logps/chosen": -543.8904418945312, "logps/rejected": -1417.26904296875, "loss": 0.2369, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.744899034500122, "rewards/margins": 8.08409595489502, "rewards/margins_max": 12.590652465820312, "rewards/margins_min": 3.577538251876831, "rewards/margins_std": 6.373233795166016, "rewards/rejected": -11.828994750976562, "step": 890 }, { "epoch": 0.28, "grad_norm": 4.6875, "learning_rate": 4.504942992582732e-06, "logits/chosen": -1.3976601362228394, "logits/rejected": -1.242203712463379, "logps/chosen": -657.7681884765625, "logps/rejected": -2024.028564453125, "loss": 0.3438, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.529778480529785, "rewards/margins": 13.55543327331543, "rewards/margins_max": 23.787803649902344, "rewards/margins_min": 3.3230679035186768, "rewards/margins_std": 14.470751762390137, "rewards/rejected": -18.085216522216797, "step": 900 }, { "epoch": 0.29, "grad_norm": 2.671875, "learning_rate": 4.488394812195749e-06, "logits/chosen": -1.476554036140442, "logits/rejected": -1.3132295608520508, "logps/chosen": -531.2057495117188, "logps/rejected": -1767.5189208984375, "loss": 0.33, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.5785250663757324, "rewards/margins": 11.893239974975586, "rewards/margins_max": 20.438159942626953, "rewards/margins_min": 3.3483211994171143, "rewards/margins_std": 12.084342002868652, "rewards/rejected": -15.471768379211426, "step": 910 }, { "epoch": 0.29, "grad_norm": 1.171875, "learning_rate": 4.471606039587696e-06, "logits/chosen": -1.5740488767623901, "logits/rejected": -1.4436171054840088, "logps/chosen": -544.67138671875, "logps/rejected": -1495.161865234375, "loss": 0.1441, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.913360118865967, "rewards/margins": 9.523843765258789, "rewards/margins_max": 15.644078254699707, "rewards/margins_min": 3.40360951423645, "rewards/margins_std": 8.655319213867188, "rewards/rejected": -12.437203407287598, "step": 920 }, { "epoch": 0.29, "grad_norm": 2.640625, "learning_rate": 4.454578706170075e-06, "logits/chosen": -1.5687984228134155, "logits/rejected": -1.1671384572982788, "logps/chosen": -530.4217529296875, "logps/rejected": -1573.7464599609375, "loss": 0.2936, "rewards/accuracies": 0.875, "rewards/chosen": -3.3494606018066406, "rewards/margins": 10.087861061096191, "rewards/margins_max": 17.137001037597656, "rewards/margins_min": 3.0387213230133057, "rewards/margins_std": 9.968989372253418, "rewards/rejected": -13.437321662902832, "step": 930 }, { "epoch": 0.3, "grad_norm": 3.328125, "learning_rate": 4.437314872219819e-06, "logits/chosen": -1.4016139507293701, "logits/rejected": -1.1822127103805542, "logps/chosen": -483.20831298828125, "logps/rejected": -1944.1021728515625, "loss": 0.1208, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.7656898498535156, "rewards/margins": 14.37073802947998, "rewards/margins_max": 24.48914337158203, "rewards/margins_min": 4.2523345947265625, "rewards/margins_std": 14.30958366394043, "rewards/rejected": -17.136428833007812, "step": 940 }, { "epoch": 0.3, "grad_norm": 0.06982421875, "learning_rate": 4.419816626630003e-06, "logits/chosen": -1.6272413730621338, "logits/rejected": -1.3043510913848877, "logps/chosen": -545.1571655273438, "logps/rejected": -1785.0166015625, "loss": 0.2053, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.1751797199249268, "rewards/margins": 12.30494499206543, "rewards/margins_max": 19.407062530517578, "rewards/margins_min": 5.202826499938965, "rewards/margins_std": 10.043911933898926, "rewards/rejected": -15.480123519897461, "step": 950 }, { "epoch": 0.3, "grad_norm": 0.78125, "learning_rate": 4.402086086657093e-06, "logits/chosen": -1.627579927444458, "logits/rejected": -1.1320421695709229, "logps/chosen": -429.28704833984375, "logps/rejected": -1020.9256591796875, "loss": 0.2996, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.296780586242676, "rewards/margins": 5.808542251586914, "rewards/margins_max": 9.648881912231445, "rewards/margins_min": 1.968202829360962, "rewards/margins_std": 5.431060314178467, "rewards/rejected": -8.10532283782959, "step": 960 }, { "epoch": 0.31, "grad_norm": 5.03125, "learning_rate": 4.384125397664759e-06, "logits/chosen": -1.5202645063400269, "logits/rejected": -1.2545684576034546, "logps/chosen": -431.06121826171875, "logps/rejected": -1594.2802734375, "loss": 0.2789, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.312934398651123, "rewards/margins": 11.746423721313477, "rewards/margins_max": 19.13467788696289, "rewards/margins_min": 4.358170509338379, "rewards/margins_std": 10.448568344116211, "rewards/rejected": -14.059358596801758, "step": 970 }, { "epoch": 0.31, "grad_norm": 11.25, "learning_rate": 4.365936732864292e-06, "logits/chosen": -1.279996395111084, "logits/rejected": -1.2075783014297485, "logps/chosen": -534.7310180664062, "logps/rejected": -1707.2154541015625, "loss": 0.2123, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.199549913406372, "rewards/margins": 11.328302383422852, "rewards/margins_max": 20.435813903808594, "rewards/margins_min": 2.2207882404327393, "rewards/margins_std": 12.879968643188477, "rewards/rejected": -14.527850151062012, "step": 980 }, { "epoch": 0.31, "grad_norm": 2.015625, "learning_rate": 4.3475222930516484e-06, "logits/chosen": -1.5118716955184937, "logits/rejected": -1.321613073348999, "logps/chosen": -475.66455078125, "logps/rejected": -1267.9703369140625, "loss": 0.2249, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6406443119049072, "rewards/margins": 7.942468166351318, "rewards/margins_max": 13.136529922485352, "rewards/margins_min": 2.7484066486358643, "rewards/margins_std": 7.345511436462402, "rewards/rejected": -10.583112716674805, "step": 990 }, { "epoch": 0.32, "grad_norm": 2.546875, "learning_rate": 4.3288843063411576e-06, "logits/chosen": -1.6871601343154907, "logits/rejected": -1.2926992177963257, "logps/chosen": -519.3112182617188, "logps/rejected": -1428.3577880859375, "loss": 0.1899, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0296714305877686, "rewards/margins": 9.275277137756348, "rewards/margins_max": 13.191515922546387, "rewards/margins_min": 5.359038352966309, "rewards/margins_std": 5.538398742675781, "rewards/rejected": -12.304948806762695, "step": 1000 }, { "epoch": 0.32, "grad_norm": 1.0703125, "learning_rate": 4.310025027895926e-06, "logits/chosen": -1.5426689386367798, "logits/rejected": -1.2699373960494995, "logps/chosen": -505.8729553222656, "logps/rejected": -1447.990478515625, "loss": 0.1406, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.8665452003479004, "rewards/margins": 9.338605880737305, "rewards/margins_max": 14.440671920776367, "rewards/margins_min": 4.236537456512451, "rewards/margins_std": 7.215413570404053, "rewards/rejected": -12.205150604248047, "step": 1010 }, { "epoch": 0.32, "grad_norm": 1.8046875, "learning_rate": 4.290946739654962e-06, "logits/chosen": -1.4005990028381348, "logits/rejected": -1.128671407699585, "logps/chosen": -584.1080322265625, "logps/rejected": -1965.639404296875, "loss": 0.1263, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.352668285369873, "rewards/margins": 14.061731338500977, "rewards/margins_max": 23.202089309692383, "rewards/margins_min": 4.921371936798096, "rewards/margins_std": 12.926417350769043, "rewards/rejected": -17.414400100708008, "step": 1020 }, { "epoch": 0.32, "grad_norm": 0.75, "learning_rate": 4.271651750057071e-06, "logits/chosen": -1.5351276397705078, "logits/rejected": -1.365252137184143, "logps/chosen": -506.6732482910156, "logps/rejected": -1338.5911865234375, "loss": 0.1886, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.1733031272888184, "rewards/margins": 8.043719291687012, "rewards/margins_max": 12.65977954864502, "rewards/margins_min": 3.427661418914795, "rewards/margins_std": 6.5280914306640625, "rewards/rejected": -11.217023849487305, "step": 1030 }, { "epoch": 0.33, "grad_norm": 0.859375, "learning_rate": 4.252142393761534e-06, "logits/chosen": -1.5365070104599, "logits/rejected": -1.185517430305481, "logps/chosen": -580.3465576171875, "logps/rejected": -1935.516357421875, "loss": 0.136, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.2586402893066406, "rewards/margins": 13.273635864257812, "rewards/margins_max": 22.62195587158203, "rewards/margins_min": 3.9253132343292236, "rewards/margins_std": 13.2205228805542, "rewards/rejected": -16.532276153564453, "step": 1040 }, { "epoch": 0.33, "grad_norm": 0.71875, "learning_rate": 4.232421031365618e-06, "logits/chosen": -1.4389902353286743, "logits/rejected": -1.3525134325027466, "logps/chosen": -535.2704467773438, "logps/rejected": -1825.3232421875, "loss": 0.1853, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.515831708908081, "rewards/margins": 12.615926742553711, "rewards/margins_max": 18.764150619506836, "rewards/margins_min": 6.467702388763428, "rewards/margins_std": 8.694902420043945, "rewards/rejected": -16.131757736206055, "step": 1050 }, { "epoch": 0.33, "grad_norm": 0.09521484375, "learning_rate": 4.212490049118952e-06, "logits/chosen": -1.5485631227493286, "logits/rejected": -1.3416082859039307, "logps/chosen": -552.5711669921875, "logps/rejected": -1961.2366943359375, "loss": 0.1156, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.507843494415283, "rewards/margins": 13.728233337402344, "rewards/margins_max": 22.533226013183594, "rewards/margins_min": 4.923240661621094, "rewards/margins_std": 12.452141761779785, "rewards/rejected": -17.2360782623291, "step": 1060 }, { "epoch": 0.34, "grad_norm": 0.8984375, "learning_rate": 4.192351858634792e-06, "logits/chosen": -1.6128113269805908, "logits/rejected": -1.2383267879486084, "logps/chosen": -608.5562133789062, "logps/rejected": -1741.685546875, "loss": 0.4493, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.952550172805786, "rewards/margins": 11.562326431274414, "rewards/margins_max": 19.853296279907227, "rewards/margins_min": 3.271360397338867, "rewards/margins_std": 11.725197792053223, "rewards/rejected": -15.514877319335938, "step": 1070 }, { "epoch": 0.34, "grad_norm": 4.65625, "learning_rate": 4.172008896598221e-06, "logits/chosen": -1.497994303703308, "logits/rejected": -1.382777452468872, "logps/chosen": -619.6976928710938, "logps/rejected": -2020.6917724609375, "loss": 0.3711, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.149666786193848, "rewards/margins": 14.204757690429688, "rewards/margins_max": 22.320579528808594, "rewards/margins_min": 6.088934898376465, "rewards/margins_std": 11.477506637573242, "rewards/rejected": -18.35442352294922, "step": 1080 }, { "epoch": 0.34, "grad_norm": 0.19921875, "learning_rate": 4.151463624471314e-06, "logits/chosen": -1.464698076248169, "logits/rejected": -1.022950291633606, "logps/chosen": -577.0154418945312, "logps/rejected": -2056.116455078125, "loss": 0.1369, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.6164112091064453, "rewards/margins": 15.738482475280762, "rewards/margins_max": 25.909582138061523, "rewards/margins_min": 5.567382335662842, "rewards/margins_std": 14.384109497070312, "rewards/rejected": -18.354894638061523, "step": 1090 }, { "epoch": 0.35, "grad_norm": 1.4453125, "learning_rate": 4.130718528195303e-06, "logits/chosen": -1.6330950260162354, "logits/rejected": -1.2724729776382446, "logps/chosen": -528.8734130859375, "logps/rejected": -1879.766357421875, "loss": 0.1763, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.9474363327026367, "rewards/margins": 13.620790481567383, "rewards/margins_max": 21.610565185546875, "rewards/margins_min": 5.631014823913574, "rewards/margins_std": 11.299247741699219, "rewards/rejected": -16.568225860595703, "step": 1100 }, { "epoch": 0.35, "grad_norm": 5.15625, "learning_rate": 4.109776117889789e-06, "logits/chosen": -1.4906260967254639, "logits/rejected": -1.1496328115463257, "logps/chosen": -598.34228515625, "logps/rejected": -1853.970458984375, "loss": 0.2219, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.373750686645508, "rewards/margins": 12.560392379760742, "rewards/margins_max": 17.999202728271484, "rewards/margins_min": 7.121583461761475, "rewards/margins_std": 7.6916375160217285, "rewards/rejected": -15.93414306640625, "step": 1110 }, { "epoch": 0.35, "grad_norm": 1.046875, "learning_rate": 4.088638927549017e-06, "logits/chosen": -1.552268147468567, "logits/rejected": -1.2409183979034424, "logps/chosen": -649.3657836914062, "logps/rejected": -1703.074462890625, "loss": 0.2693, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.8895363807678223, "rewards/margins": 10.927635192871094, "rewards/margins_max": 17.817441940307617, "rewards/margins_min": 4.03782844543457, "rewards/margins_std": 9.743658065795898, "rewards/rejected": -14.817171096801758, "step": 1120 }, { "epoch": 0.36, "grad_norm": 1.4140625, "learning_rate": 4.067309514735267e-06, "logits/chosen": -1.3615977764129639, "logits/rejected": -1.1248162984848022, "logps/chosen": -608.7218017578125, "logps/rejected": -1841.6429443359375, "loss": 0.1213, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.523672580718994, "rewards/margins": 12.779653549194336, "rewards/margins_max": 18.22634506225586, "rewards/margins_min": 7.332962989807129, "rewards/margins_std": 7.702784061431885, "rewards/rejected": -16.303325653076172, "step": 1130 }, { "epoch": 0.36, "grad_norm": 6.9375, "learning_rate": 4.0457904602693954e-06, "logits/chosen": -1.4254987239837646, "logits/rejected": -1.137463927268982, "logps/chosen": -525.2374267578125, "logps/rejected": -1585.09326171875, "loss": 0.1182, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.995495319366455, "rewards/margins": 10.838014602661133, "rewards/margins_max": 18.608234405517578, "rewards/margins_min": 3.0677988529205322, "rewards/margins_std": 10.98874568939209, "rewards/rejected": -13.833511352539062, "step": 1140 }, { "epoch": 0.36, "grad_norm": 4.46875, "learning_rate": 4.0240843679185605e-06, "logits/chosen": -1.5120861530303955, "logits/rejected": -1.247182846069336, "logps/chosen": -617.8936157226562, "logps/rejected": -2085.58642578125, "loss": 0.1841, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7910187244415283, "rewards/margins": 14.714996337890625, "rewards/margins_max": 24.271373748779297, "rewards/margins_min": 5.158619403839111, "rewards/margins_std": 13.51475715637207, "rewards/rejected": -18.50601577758789, "step": 1150 }, { "epoch": 0.37, "grad_norm": 0.77734375, "learning_rate": 4.002193864081172e-06, "logits/chosen": -1.4501639604568481, "logits/rejected": -1.2224775552749634, "logps/chosen": -546.1622924804688, "logps/rejected": -2053.220703125, "loss": 0.1854, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2076709270477295, "rewards/margins": 13.76832103729248, "rewards/margins_max": 21.275157928466797, "rewards/margins_min": 6.261481285095215, "rewards/margins_std": 10.6162748336792, "rewards/rejected": -16.97599220275879, "step": 1160 }, { "epoch": 0.37, "grad_norm": 1.8515625, "learning_rate": 3.980121597469096e-06, "logits/chosen": -1.5411275625228882, "logits/rejected": -1.25248122215271, "logps/chosen": -565.0699462890625, "logps/rejected": -1576.759033203125, "loss": 0.2283, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.4042811393737793, "rewards/margins": 10.423602104187012, "rewards/margins_max": 15.0601806640625, "rewards/margins_min": 5.787021636962891, "rewards/margins_std": 6.5571136474609375, "rewards/rejected": -13.827882766723633, "step": 1170 }, { "epoch": 0.37, "grad_norm": 25.875, "learning_rate": 3.9578702387871745e-06, "logits/chosen": -1.6382890939712524, "logits/rejected": -1.319797396659851, "logps/chosen": -584.9801025390625, "logps/rejected": -1850.2626953125, "loss": 0.2871, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.8053550720214844, "rewards/margins": 12.886405944824219, "rewards/margins_max": 21.918596267700195, "rewards/margins_min": 3.854217052459717, "rewards/margins_std": 12.773443222045898, "rewards/rejected": -16.691761016845703, "step": 1180 }, { "epoch": 0.37, "grad_norm": 1.015625, "learning_rate": 3.935442480410065e-06, "logits/chosen": -1.486549973487854, "logits/rejected": -1.2807599306106567, "logps/chosen": -483.4930725097656, "logps/rejected": -1720.572998046875, "loss": 0.2895, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.9897453784942627, "rewards/margins": 11.935190200805664, "rewards/margins_max": 20.719188690185547, "rewards/margins_min": 3.151189088821411, "rewards/margins_std": 12.422451972961426, "rewards/rejected": -14.924932479858398, "step": 1190 }, { "epoch": 0.38, "grad_norm": 2.28125, "learning_rate": 3.91284103605648e-06, "logits/chosen": -1.5874106884002686, "logits/rejected": -1.068820834159851, "logps/chosen": -526.4561767578125, "logps/rejected": -1309.350341796875, "loss": 0.1856, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.838336944580078, "rewards/margins": 7.996423244476318, "rewards/margins_max": 12.323980331420898, "rewards/margins_min": 3.668865919113159, "rewards/margins_std": 6.120089530944824, "rewards/rejected": -10.834760665893555, "step": 1200 }, { "epoch": 0.38, "grad_norm": 2.140625, "learning_rate": 3.890068640460818e-06, "logits/chosen": -1.563246488571167, "logits/rejected": -1.3833080530166626, "logps/chosen": -608.7943725585938, "logps/rejected": -1742.0953369140625, "loss": 0.2083, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.5897440910339355, "rewards/margins": 11.420188903808594, "rewards/margins_max": 18.479089736938477, "rewards/margins_min": 4.361289024353027, "rewards/margins_std": 9.98279094696045, "rewards/rejected": -15.009931564331055, "step": 1210 }, { "epoch": 0.38, "grad_norm": 4.78125, "learning_rate": 3.867128049042276e-06, "logits/chosen": -1.6497488021850586, "logits/rejected": -1.4546703100204468, "logps/chosen": -509.16265869140625, "logps/rejected": -1514.1629638671875, "loss": 0.1561, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3210582733154297, "rewards/margins": 9.69025993347168, "rewards/margins_max": 15.015596389770508, "rewards/margins_min": 4.364924907684326, "rewards/margins_std": 7.531160831451416, "rewards/rejected": -13.011317253112793, "step": 1220 }, { "epoch": 0.39, "grad_norm": 2.734375, "learning_rate": 3.844022037571444e-06, "logits/chosen": -1.5822490453720093, "logits/rejected": -1.1622604131698608, "logps/chosen": -559.5164184570312, "logps/rejected": -1303.6251220703125, "loss": 0.2372, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.63049054145813, "rewards/margins": 7.552873134613037, "rewards/margins_max": 11.804709434509277, "rewards/margins_min": 3.3010354042053223, "rewards/margins_std": 6.013005256652832, "rewards/rejected": -11.18336296081543, "step": 1230 }, { "epoch": 0.39, "grad_norm": 2.25, "learning_rate": 3.820753401834444e-06, "logits/chosen": -1.6436933279037476, "logits/rejected": -1.4893285036087036, "logps/chosen": -522.2276611328125, "logps/rejected": -1459.512451171875, "loss": 0.1316, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.9450583457946777, "rewards/margins": 9.521982192993164, "rewards/margins_max": 13.79808235168457, "rewards/margins_min": 5.245881080627441, "rewards/margins_std": 6.047319412231445, "rewards/rejected": -12.467040061950684, "step": 1240 }, { "epoch": 0.39, "grad_norm": 1.3828125, "learning_rate": 3.7973249572946436e-06, "logits/chosen": -1.621744155883789, "logits/rejected": -1.3612353801727295, "logps/chosen": -539.4110107421875, "logps/rejected": -1551.703857421875, "loss": 0.1575, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.3887977600097656, "rewards/margins": 10.25967788696289, "rewards/margins_max": 18.161897659301758, "rewards/margins_min": 2.35745906829834, "rewards/margins_std": 11.17542552947998, "rewards/rejected": -13.648475646972656, "step": 1250 }, { "epoch": 0.4, "grad_norm": 1.4296875, "learning_rate": 3.7737395387519883e-06, "logits/chosen": -1.6151530742645264, "logits/rejected": -1.2519623041152954, "logps/chosen": -593.7537841796875, "logps/rejected": -1533.3704833984375, "loss": 0.1987, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4480178356170654, "rewards/margins": 9.817560195922852, "rewards/margins_max": 14.997907638549805, "rewards/margins_min": 4.637210369110107, "rewards/margins_std": 7.326119422912598, "rewards/rejected": -13.26557731628418, "step": 1260 }, { "epoch": 0.4, "grad_norm": 1.40625, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -1.593546748161316, "logits/rejected": -1.227420449256897, "logps/chosen": -894.0062255859375, "logps/rejected": -1717.8193359375, "loss": 0.281, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.509779930114746, "rewards/margins": 9.543737411499023, "rewards/margins_max": 15.239468574523926, "rewards/margins_min": 3.8480052947998047, "rewards/margins_std": 8.054980278015137, "rewards/rejected": -15.05351734161377, "step": 1270 }, { "epoch": 0.4, "grad_norm": 1.375, "learning_rate": 3.7261092134804698e-06, "logits/chosen": -1.5079495906829834, "logits/rejected": -1.2428034543991089, "logps/chosen": -615.8125, "logps/rejected": -1781.083984375, "loss": 0.2198, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.0713090896606445, "rewards/margins": 11.638517379760742, "rewards/margins_max": 20.66924285888672, "rewards/margins_min": 2.6077933311462402, "rewards/margins_std": 12.771371841430664, "rewards/rejected": -15.709826469421387, "step": 1280 }, { "epoch": 0.41, "grad_norm": 0.72265625, "learning_rate": 3.7020700699358984e-06, "logits/chosen": -1.6448938846588135, "logits/rejected": -1.2866158485412598, "logps/chosen": -567.0323486328125, "logps/rejected": -1882.602294921875, "loss": 0.2352, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.3592464923858643, "rewards/margins": 13.268638610839844, "rewards/margins_max": 21.4252872467041, "rewards/margins_min": 5.111990928649902, "rewards/margins_std": 11.53524112701416, "rewards/rejected": -16.627885818481445, "step": 1290 }, { "epoch": 0.41, "grad_norm": 2.90625, "learning_rate": 3.6778854780597218e-06, "logits/chosen": -1.431904673576355, "logits/rejected": -1.2583922147750854, "logps/chosen": -548.6318359375, "logps/rejected": -1664.658203125, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": -3.2328219413757324, "rewards/margins": 11.606404304504395, "rewards/margins_max": 17.81925392150879, "rewards/margins_min": 5.393556594848633, "rewards/margins_std": 8.786293983459473, "rewards/rejected": -14.839225769042969, "step": 1300 }, { "epoch": 0.41, "grad_norm": 0.419921875, "learning_rate": 3.6535583641443634e-06, "logits/chosen": -1.6340068578720093, "logits/rejected": -1.5025604963302612, "logps/chosen": -582.7081298828125, "logps/rejected": -1899.066650390625, "loss": 0.4073, "rewards/accuracies": 0.875, "rewards/chosen": -3.9646008014678955, "rewards/margins": 12.86851978302002, "rewards/margins_max": 21.045665740966797, "rewards/margins_min": 4.691373348236084, "rewards/margins_std": 11.564231872558594, "rewards/rejected": -16.833120346069336, "step": 1310 }, { "epoch": 0.42, "grad_norm": 2.6875, "learning_rate": 3.6290916717271597e-06, "logits/chosen": -1.5056756734848022, "logits/rejected": -1.1574785709381104, "logps/chosen": -607.2431640625, "logps/rejected": -1871.325927734375, "loss": 0.1269, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.66899037361145, "rewards/margins": 12.744714736938477, "rewards/margins_max": 20.19748878479004, "rewards/margins_min": 5.2919416427612305, "rewards/margins_std": 10.539812088012695, "rewards/rejected": -16.41370391845703, "step": 1320 }, { "epoch": 0.42, "grad_norm": 0.70703125, "learning_rate": 3.604488361234196e-06, "logits/chosen": -1.6854197978973389, "logits/rejected": -1.4968090057373047, "logps/chosen": -534.645751953125, "logps/rejected": -1603.8359375, "loss": 0.1878, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.554718017578125, "rewards/margins": 10.739810943603516, "rewards/margins_max": 17.38308334350586, "rewards/margins_min": 4.096539497375488, "rewards/margins_std": 9.39500617980957, "rewards/rejected": -14.294529914855957, "step": 1330 }, { "epoch": 0.42, "grad_norm": 5.21875, "learning_rate": 3.579751409622103e-06, "logits/chosen": -1.5969122648239136, "logits/rejected": -1.3476945161819458, "logps/chosen": -617.2340087890625, "logps/rejected": -1609.4246826171875, "loss": 0.2865, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.8023934364318848, "rewards/margins": 10.177874565124512, "rewards/margins_max": 17.874292373657227, "rewards/margins_min": 2.481457471847534, "rewards/margins_std": 10.884378433227539, "rewards/rejected": -13.980267524719238, "step": 1340 }, { "epoch": 0.43, "grad_norm": 2.90625, "learning_rate": 3.5548838100178444e-06, "logits/chosen": -1.456305742263794, "logits/rejected": -1.257086157798767, "logps/chosen": -494.32501220703125, "logps/rejected": -1082.857666015625, "loss": 0.2634, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.0967798233032227, "rewards/margins": 5.911543846130371, "rewards/margins_max": 9.54675006866455, "rewards/margins_min": 2.2763376235961914, "rewards/margins_std": 5.140958309173584, "rewards/rejected": -9.008323669433594, "step": 1350 }, { "epoch": 0.43, "grad_norm": 2.84375, "learning_rate": 3.5298885713565615e-06, "logits/chosen": -1.380829095840454, "logits/rejected": -1.2045114040374756, "logps/chosen": -605.8043823242188, "logps/rejected": -1361.7166748046875, "loss": 0.1949, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.5279593467712402, "rewards/margins": 7.775490760803223, "rewards/margins_max": 13.313535690307617, "rewards/margins_min": 2.237445116043091, "rewards/margins_std": 7.831979274749756, "rewards/rejected": -11.303449630737305, "step": 1360 }, { "epoch": 0.43, "grad_norm": 1.1875, "learning_rate": 3.5047687180174905e-06, "logits/chosen": -1.5265122652053833, "logits/rejected": -1.1738016605377197, "logps/chosen": -613.3758544921875, "logps/rejected": -1357.464599609375, "loss": 0.1996, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.1165761947631836, "rewards/margins": 8.375921249389648, "rewards/margins_max": 13.726768493652344, "rewards/margins_min": 3.0250773429870605, "rewards/margins_std": 7.567238807678223, "rewards/rejected": -11.492499351501465, "step": 1370 }, { "epoch": 0.43, "grad_norm": 15.3125, "learning_rate": 3.4795272894580217e-06, "logits/chosen": -1.4896327257156372, "logits/rejected": -1.3400466442108154, "logps/chosen": -531.3592529296875, "logps/rejected": -1779.3931884765625, "loss": 0.3124, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.440082550048828, "rewards/margins": 12.141416549682617, "rewards/margins_max": 17.760726928710938, "rewards/margins_min": 6.522104740142822, "rewards/margins_std": 7.946906089782715, "rewards/rejected": -15.581499099731445, "step": 1380 }, { "epoch": 0.44, "grad_norm": 0.953125, "learning_rate": 3.454167339845932e-06, "logits/chosen": -1.4307299852371216, "logits/rejected": -1.3355329036712646, "logps/chosen": -553.9991455078125, "logps/rejected": -1710.6588134765625, "loss": 0.085, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.420407772064209, "rewards/margins": 11.416155815124512, "rewards/margins_max": 16.05166244506836, "rewards/margins_min": 6.780648231506348, "rewards/margins_std": 6.555596828460693, "rewards/rejected": -14.836563110351562, "step": 1390 }, { "epoch": 0.44, "grad_norm": 0.1328125, "learning_rate": 3.428691937689831e-06, "logits/chosen": -1.364752173423767, "logits/rejected": -1.174250841140747, "logps/chosen": -516.96142578125, "logps/rejected": -1582.386474609375, "loss": 0.1594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.942873239517212, "rewards/margins": 10.622710227966309, "rewards/margins_max": 15.50335693359375, "rewards/margins_min": 5.742064476013184, "rewards/margins_std": 6.902276515960693, "rewards/rejected": -13.565584182739258, "step": 1400 }, { "epoch": 0.44, "grad_norm": 1.4140625, "learning_rate": 3.403104165467883e-06, "logits/chosen": -1.5384275913238525, "logits/rejected": -1.442461371421814, "logps/chosen": -617.8716430664062, "logps/rejected": -2171.876953125, "loss": 0.1215, "rewards/accuracies": 1.0, "rewards/chosen": -3.375886917114258, "rewards/margins": 15.94532299041748, "rewards/margins_max": 27.856884002685547, "rewards/margins_min": 4.0337605476379395, "rewards/margins_std": 16.845491409301758, "rewards/rejected": -19.32120704650879, "step": 1410 }, { "epoch": 0.45, "grad_norm": 3.484375, "learning_rate": 3.377407119254826e-06, "logits/chosen": -1.3978983163833618, "logits/rejected": -1.1405658721923828, "logps/chosen": -584.4593505859375, "logps/rejected": -1754.9351806640625, "loss": 0.157, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1924796104431152, "rewards/margins": 12.185917854309082, "rewards/margins_max": 18.766172409057617, "rewards/margins_min": 5.605664253234863, "rewards/margins_std": 9.305885314941406, "rewards/rejected": -15.378397941589355, "step": 1420 }, { "epoch": 0.45, "grad_norm": 9.1875, "learning_rate": 3.3516039083473593e-06, "logits/chosen": -1.5193557739257812, "logits/rejected": -1.2308270931243896, "logps/chosen": -676.8104858398438, "logps/rejected": -1510.12744140625, "loss": 0.2207, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.3010759353637695, "rewards/margins": 8.718791961669922, "rewards/margins_max": 12.937799453735352, "rewards/margins_min": 4.49978494644165, "rewards/margins_std": 5.96657657623291, "rewards/rejected": -13.019868850708008, "step": 1430 }, { "epoch": 0.45, "grad_norm": 0.50390625, "learning_rate": 3.3256976548879183e-06, "logits/chosen": -1.6087383031845093, "logits/rejected": -1.409579873085022, "logps/chosen": -452.558349609375, "logps/rejected": -1753.7073974609375, "loss": 0.1952, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.7991509437561035, "rewards/margins": 12.75914478302002, "rewards/margins_max": 20.164968490600586, "rewards/margins_min": 5.3533196449279785, "rewards/margins_std": 10.473418235778809, "rewards/rejected": -15.558294296264648, "step": 1440 }, { "epoch": 0.46, "grad_norm": 1.375, "learning_rate": 3.299691493486904e-06, "logits/chosen": -1.553672194480896, "logits/rejected": -1.1765081882476807, "logps/chosen": -532.5276489257812, "logps/rejected": -1855.286376953125, "loss": 0.11, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.1610066890716553, "rewards/margins": 12.888555526733398, "rewards/margins_max": 20.415128707885742, "rewards/margins_min": 5.361984729766846, "rewards/margins_std": 10.644180297851562, "rewards/rejected": -16.049564361572266, "step": 1450 }, { "epoch": 0.46, "grad_norm": 1.2109375, "learning_rate": 3.2735885708433997e-06, "logits/chosen": -1.4375452995300293, "logits/rejected": -1.0696020126342773, "logps/chosen": -637.91552734375, "logps/rejected": -1755.896240234375, "loss": 0.2119, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.1430463790893555, "rewards/margins": 11.392419815063477, "rewards/margins_max": 16.991348266601562, "rewards/margins_min": 5.793490886688232, "rewards/margins_std": 7.918080806732178, "rewards/rejected": -15.5354642868042, "step": 1460 }, { "epoch": 0.46, "grad_norm": 0.98828125, "learning_rate": 3.247392045364426e-06, "logits/chosen": -1.4832890033721924, "logits/rejected": -1.4127947092056274, "logps/chosen": -557.3673095703125, "logps/rejected": -2154.6083984375, "loss": 0.1131, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.5359015464782715, "rewards/margins": 15.562028884887695, "rewards/margins_max": 23.592113494873047, "rewards/margins_min": 7.531941890716553, "rewards/margins_std": 11.356256484985352, "rewards/rejected": -19.097932815551758, "step": 1470 }, { "epoch": 0.47, "grad_norm": 1.0625, "learning_rate": 3.221105086782781e-06, "logits/chosen": -1.5462195873260498, "logits/rejected": -1.2428996562957764, "logps/chosen": -594.3450927734375, "logps/rejected": -2124.96728515625, "loss": 0.3074, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.730942964553833, "rewards/margins": 14.838473320007324, "rewards/margins_max": 22.921918869018555, "rewards/margins_min": 6.755026817321777, "rewards/margins_std": 11.431719779968262, "rewards/rejected": -18.569416046142578, "step": 1480 }, { "epoch": 0.47, "grad_norm": 4.34375, "learning_rate": 3.194730875773504e-06, "logits/chosen": -1.4843318462371826, "logits/rejected": -1.30060613155365, "logps/chosen": -617.7728271484375, "logps/rejected": -1552.475341796875, "loss": 0.2068, "rewards/accuracies": 0.875, "rewards/chosen": -3.88134503364563, "rewards/margins": 9.543024063110352, "rewards/margins_max": 15.337367057800293, "rewards/margins_min": 3.7486824989318848, "rewards/margins_std": 8.194437026977539, "rewards/rejected": -13.424371719360352, "step": 1490 }, { "epoch": 0.47, "grad_norm": 0.361328125, "learning_rate": 3.1682726035690254e-06, "logits/chosen": -1.5420567989349365, "logits/rejected": -1.08122718334198, "logps/chosen": -562.6282348632812, "logps/rejected": -1668.4222412109375, "loss": 0.0894, "rewards/accuracies": 1.0, "rewards/chosen": -3.0375943183898926, "rewards/margins": 11.751439094543457, "rewards/margins_max": 18.029438018798828, "rewards/margins_min": 5.473437309265137, "rewards/margins_std": 8.878433227539062, "rewards/rejected": -14.789031982421875, "step": 1500 }, { "epoch": 0.48, "grad_norm": 2.15625, "learning_rate": 3.141733471573026e-06, "logits/chosen": -1.4196301698684692, "logits/rejected": -1.1794580221176147, "logps/chosen": -638.9561767578125, "logps/rejected": -1795.17578125, "loss": 0.2731, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.6092400550842285, "rewards/margins": 12.275900840759277, "rewards/margins_max": 19.90064239501953, "rewards/margins_min": 4.651161193847656, "rewards/margins_std": 10.783010482788086, "rewards/rejected": -15.885139465332031, "step": 1510 }, { "epoch": 0.48, "grad_norm": 2.15625, "learning_rate": 3.1151166909730814e-06, "logits/chosen": -1.4170836210250854, "logits/rejected": -1.3289744853973389, "logps/chosen": -499.4686584472656, "logps/rejected": -1666.9635009765625, "loss": 0.3169, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.2561161518096924, "rewards/margins": 11.558661460876465, "rewards/margins_max": 19.94423484802246, "rewards/margins_min": 3.173088550567627, "rewards/margins_std": 11.858991622924805, "rewards/rejected": -14.814778327941895, "step": 1520 }, { "epoch": 0.48, "grad_norm": 0.7421875, "learning_rate": 3.0884254823521064e-06, "logits/chosen": -1.4527684450149536, "logits/rejected": -1.1548287868499756, "logps/chosen": -428.9183044433594, "logps/rejected": -1450.4156494140625, "loss": 0.2553, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.477548599243164, "rewards/margins": 10.415294647216797, "rewards/margins_max": 16.25199317932129, "rewards/margins_min": 4.578597068786621, "rewards/margins_std": 8.254335403442383, "rewards/rejected": -12.892842292785645, "step": 1530 }, { "epoch": 0.49, "grad_norm": 1.1875, "learning_rate": 3.0616630752986755e-06, "logits/chosen": -1.6659702062606812, "logits/rejected": -1.3601497411727905, "logps/chosen": -599.8160400390625, "logps/rejected": -1705.684326171875, "loss": 0.1134, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.4542229175567627, "rewards/margins": 10.894365310668945, "rewards/margins_max": 16.569028854370117, "rewards/margins_min": 5.219702243804932, "rewards/margins_std": 8.025186538696289, "rewards/rejected": -14.348589897155762, "step": 1540 }, { "epoch": 0.49, "grad_norm": 5.71875, "learning_rate": 3.0348327080162438e-06, "logits/chosen": -1.650661826133728, "logits/rejected": -1.2949879169464111, "logps/chosen": -613.9927978515625, "logps/rejected": -1550.9993896484375, "loss": 0.1008, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.4950804710388184, "rewards/margins": 9.927111625671387, "rewards/margins_max": 15.52288818359375, "rewards/margins_min": 4.331338405609131, "rewards/margins_std": 7.9136199951171875, "rewards/rejected": -13.422192573547363, "step": 1550 }, { "epoch": 0.49, "grad_norm": 1.25, "learning_rate": 3.007937626931336e-06, "logits/chosen": -1.5543906688690186, "logits/rejected": -1.2956621646881104, "logps/chosen": -451.51629638671875, "logps/rejected": -1468.626708984375, "loss": 0.247, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.4063615798950195, "rewards/margins": 9.632174491882324, "rewards/margins_max": 15.11768913269043, "rewards/margins_min": 4.146659851074219, "rewards/margins_std": 7.757689476013184, "rewards/rejected": -12.038535118103027, "step": 1560 }, { "epoch": 0.49, "grad_norm": 3.25, "learning_rate": 2.9809810863007286e-06, "logits/chosen": -1.581876516342163, "logits/rejected": -1.253678560256958, "logps/chosen": -515.3495483398438, "logps/rejected": -1483.419677734375, "loss": 0.1777, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.1044821739196777, "rewards/margins": 9.645689010620117, "rewards/margins_max": 15.521951675415039, "rewards/margins_min": 3.7694268226623535, "rewards/margins_std": 8.31028938293457, "rewards/rejected": -12.750170707702637, "step": 1570 }, { "epoch": 0.5, "grad_norm": 6.84375, "learning_rate": 2.953966347817695e-06, "logits/chosen": -1.4079358577728271, "logits/rejected": -1.3491824865341187, "logps/chosen": -496.98394775390625, "logps/rejected": -1792.646484375, "loss": 0.1523, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.880263566970825, "rewards/margins": 12.569089889526367, "rewards/margins_max": 21.626419067382812, "rewards/margins_min": 3.5117599964141846, "rewards/margins_std": 12.808998107910156, "rewards/rejected": -15.449353218078613, "step": 1580 }, { "epoch": 0.5, "grad_norm": 11.375, "learning_rate": 2.9268966802173437e-06, "logits/chosen": -1.4724808931350708, "logits/rejected": -1.126155138015747, "logps/chosen": -572.5735473632812, "logps/rejected": -1458.450439453125, "loss": 0.1156, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.984464406967163, "rewards/margins": 9.40550422668457, "rewards/margins_max": 14.413080215454102, "rewards/margins_min": 4.397928237915039, "rewards/margins_std": 7.08178186416626, "rewards/rejected": -12.389968872070312, "step": 1590 }, { "epoch": 0.5, "grad_norm": 18.25, "learning_rate": 2.89977535888111e-06, "logits/chosen": -1.4884029626846313, "logits/rejected": -1.2525956630706787, "logps/chosen": -469.20220947265625, "logps/rejected": -1566.2236328125, "loss": 0.1926, "rewards/accuracies": 1.0, "rewards/chosen": -2.888575792312622, "rewards/margins": 10.976606369018555, "rewards/margins_max": 17.656143188476562, "rewards/margins_min": 4.297070503234863, "rewards/margins_std": 9.446290969848633, "rewards/rejected": -13.865182876586914, "step": 1600 }, { "epoch": 0.51, "grad_norm": 29.375, "learning_rate": 2.872605665440436e-06, "logits/chosen": -1.493099331855774, "logits/rejected": -1.4659028053283691, "logps/chosen": -537.7650146484375, "logps/rejected": -2078.65869140625, "loss": 0.3169, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.645385265350342, "rewards/margins": 14.931035041809082, "rewards/margins_max": 23.03750991821289, "rewards/margins_min": 6.824557304382324, "rewards/margins_std": 11.464289665222168, "rewards/rejected": -18.576419830322266, "step": 1610 }, { "epoch": 0.51, "grad_norm": 0.66015625, "learning_rate": 2.845390887379706e-06, "logits/chosen": -1.6047687530517578, "logits/rejected": -1.345473051071167, "logps/chosen": -587.2362670898438, "logps/rejected": -1573.9320068359375, "loss": 0.3874, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.5847039222717285, "rewards/margins": 10.173007011413574, "rewards/margins_max": 17.029922485351562, "rewards/margins_min": 3.3160948753356934, "rewards/margins_std": 9.697138786315918, "rewards/rejected": -13.757711410522461, "step": 1620 }, { "epoch": 0.51, "grad_norm": 0.73828125, "learning_rate": 2.818134317638459e-06, "logits/chosen": -1.3401492834091187, "logits/rejected": -1.3071324825286865, "logps/chosen": -535.4603881835938, "logps/rejected": -1909.8978271484375, "loss": 0.2024, "rewards/accuracies": 0.875, "rewards/chosen": -3.384225368499756, "rewards/margins": 12.386846542358398, "rewards/margins_max": 18.520631790161133, "rewards/margins_min": 6.253061771392822, "rewards/margins_std": 8.674481391906738, "rewards/rejected": -15.77107048034668, "step": 1630 }, { "epoch": 0.52, "grad_norm": 2.125, "learning_rate": 2.790839254212954e-06, "logits/chosen": -1.618775725364685, "logits/rejected": -1.407546043395996, "logps/chosen": -557.8433837890625, "logps/rejected": -1905.104248046875, "loss": 0.1552, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2777836322784424, "rewards/margins": 13.154513359069824, "rewards/margins_max": 20.147037506103516, "rewards/margins_min": 6.161989212036133, "rewards/margins_std": 9.888921737670898, "rewards/rejected": -16.432294845581055, "step": 1640 }, { "epoch": 0.52, "grad_norm": 1.1328125, "learning_rate": 2.7635089997571196e-06, "logits/chosen": -1.5546470880508423, "logits/rejected": -1.463314175605774, "logps/chosen": -512.0051879882812, "logps/rejected": -1762.7623291015625, "loss": 0.1786, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.9287774562835693, "rewards/margins": 11.732918739318848, "rewards/margins_max": 18.208843231201172, "rewards/margins_min": 5.256992816925049, "rewards/margins_std": 9.158343315124512, "rewards/rejected": -14.66169548034668, "step": 1650 }, { "epoch": 0.52, "grad_norm": 7.59375, "learning_rate": 2.736146861182933e-06, "logits/chosen": -1.6997873783111572, "logits/rejected": -1.4095062017440796, "logps/chosen": -528.2429809570312, "logps/rejected": -1602.039306640625, "loss": 0.1509, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.242964506149292, "rewards/margins": 10.520895004272461, "rewards/margins_max": 17.078807830810547, "rewards/margins_min": 3.962982654571533, "rewards/margins_std": 9.27428913116455, "rewards/rejected": -13.763860702514648, "step": 1660 }, { "epoch": 0.53, "grad_norm": 11.875, "learning_rate": 2.7087561492602927e-06, "logits/chosen": -1.5607562065124512, "logits/rejected": -1.209094762802124, "logps/chosen": -582.0310668945312, "logps/rejected": -1421.4693603515625, "loss": 0.2018, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.435152769088745, "rewards/margins": 8.758903503417969, "rewards/margins_max": 13.951889038085938, "rewards/margins_min": 3.565919876098633, "rewards/margins_std": 7.34398889541626, "rewards/rejected": -12.194056510925293, "step": 1670 }, { "epoch": 0.53, "grad_norm": 0.82421875, "learning_rate": 2.681340178216423e-06, "logits/chosen": -1.8068653345108032, "logits/rejected": -1.5372353792190552, "logps/chosen": -536.0225830078125, "logps/rejected": -1841.3037109375, "loss": 0.1502, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.9493765830993652, "rewards/margins": 12.962008476257324, "rewards/margins_max": 18.436702728271484, "rewards/margins_min": 7.487314701080322, "rewards/margins_std": 7.742384433746338, "rewards/rejected": -15.911382675170898, "step": 1680 }, { "epoch": 0.53, "grad_norm": 1.7109375, "learning_rate": 2.6539022653348577e-06, "logits/chosen": -1.4544528722763062, "logits/rejected": -1.2387598752975464, "logps/chosen": -541.6769409179688, "logps/rejected": -2221.091796875, "loss": 0.2307, "rewards/accuracies": 1.0, "rewards/chosen": -3.3415286540985107, "rewards/margins": 16.246158599853516, "rewards/margins_max": 25.676050186157227, "rewards/margins_min": 6.8162641525268555, "rewards/margins_std": 13.335882186889648, "rewards/rejected": -19.58768653869629, "step": 1690 }, { "epoch": 0.54, "grad_norm": 13.0625, "learning_rate": 2.6264457305540502e-06, "logits/chosen": -1.7259023189544678, "logits/rejected": -1.4031670093536377, "logps/chosen": -525.0916748046875, "logps/rejected": -1967.254150390625, "loss": 0.2108, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.277066707611084, "rewards/margins": 14.416677474975586, "rewards/margins_max": 25.7095947265625, "rewards/margins_min": 3.1237592697143555, "rewards/margins_std": 15.970598220825195, "rewards/rejected": -17.693744659423828, "step": 1700 }, { "epoch": 0.54, "grad_norm": 0.625, "learning_rate": 2.598973896065674e-06, "logits/chosen": -1.2255192995071411, "logits/rejected": -1.1902718544006348, "logps/chosen": -627.9791870117188, "logps/rejected": -2590.4423828125, "loss": 0.1097, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.78948712348938, "rewards/margins": 19.36321258544922, "rewards/margins_max": 30.17099952697754, "rewards/margins_min": 8.555425643920898, "rewards/margins_std": 15.284518241882324, "rewards/rejected": -23.152698516845703, "step": 1710 }, { "epoch": 0.54, "grad_norm": 17.125, "learning_rate": 2.571490085912638e-06, "logits/chosen": -1.4462924003601074, "logits/rejected": -1.1185009479522705, "logps/chosen": -672.9526977539062, "logps/rejected": -1587.697021484375, "loss": 0.2161, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.46766996383667, "rewards/margins": 9.219390869140625, "rewards/margins_max": 12.912025451660156, "rewards/margins_min": 5.526756763458252, "rewards/margins_std": 5.222173690795898, "rewards/rejected": -13.68706226348877, "step": 1720 }, { "epoch": 0.54, "grad_norm": 1.2734375, "learning_rate": 2.543997625586885e-06, "logits/chosen": -1.4518120288848877, "logits/rejected": -1.2286278009414673, "logps/chosen": -543.7399291992188, "logps/rejected": -2093.8134765625, "loss": 0.1356, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.394313097000122, "rewards/margins": 14.934762954711914, "rewards/margins_max": 24.45358657836914, "rewards/margins_min": 5.415938377380371, "rewards/margins_std": 13.461651802062988, "rewards/rejected": -18.329076766967773, "step": 1730 }, { "epoch": 0.55, "grad_norm": 3.921875, "learning_rate": 2.516499841627014e-06, "logits/chosen": -1.6292340755462646, "logits/rejected": -1.4900437593460083, "logps/chosen": -599.104248046875, "logps/rejected": -1993.777099609375, "loss": 0.2569, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.7089314460754395, "rewards/margins": 13.895370483398438, "rewards/margins_max": 23.25143814086914, "rewards/margins_min": 4.539303779602051, "rewards/margins_std": 13.231475830078125, "rewards/rejected": -17.60430335998535, "step": 1740 }, { "epoch": 0.55, "grad_norm": 10.5625, "learning_rate": 2.4890000612157748e-06, "logits/chosen": -1.5528570413589478, "logits/rejected": -1.3459594249725342, "logps/chosen": -668.9677124023438, "logps/rejected": -1671.044189453125, "loss": 0.2611, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5307769775390625, "rewards/margins": 10.027362823486328, "rewards/margins_max": 18.219478607177734, "rewards/margins_min": 1.835247278213501, "rewards/margins_std": 11.585399627685547, "rewards/rejected": -14.558140754699707, "step": 1750 }, { "epoch": 0.55, "grad_norm": 2.078125, "learning_rate": 2.4615016117774836e-06, "logits/chosen": -1.4520705938339233, "logits/rejected": -1.242058515548706, "logps/chosen": -517.4442138671875, "logps/rejected": -1405.398681640625, "loss": 0.1668, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.164559841156006, "rewards/margins": 8.761804580688477, "rewards/margins_max": 14.644865036010742, "rewards/margins_min": 2.8787448406219482, "rewards/margins_std": 8.319904327392578, "rewards/rejected": -11.926366806030273, "step": 1760 }, { "epoch": 0.56, "grad_norm": 2.015625, "learning_rate": 2.43400782057541e-06, "logits/chosen": -1.5988481044769287, "logits/rejected": -1.2858158349990845, "logps/chosen": -585.8101196289062, "logps/rejected": -1589.328125, "loss": 0.1983, "rewards/accuracies": 0.875, "rewards/chosen": -3.538374423980713, "rewards/margins": 9.93020248413086, "rewards/margins_max": 15.531498908996582, "rewards/margins_min": 4.328904628753662, "rewards/margins_std": 7.921429634094238, "rewards/rejected": -13.46857738494873, "step": 1770 }, { "epoch": 0.56, "grad_norm": 2.015625, "learning_rate": 2.4065220143091863e-06, "logits/chosen": -1.4877874851226807, "logits/rejected": -1.2298099994659424, "logps/chosen": -494.2522888183594, "logps/rejected": -1277.85107421875, "loss": 0.139, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.7383546829223633, "rewards/margins": 7.873937129974365, "rewards/margins_max": 11.741680145263672, "rewards/margins_min": 4.006192684173584, "rewards/margins_std": 5.469815731048584, "rewards/rejected": -10.61229133605957, "step": 1780 }, { "epoch": 0.56, "grad_norm": 1.7265625, "learning_rate": 2.3790475187122838e-06, "logits/chosen": -1.488873839378357, "logits/rejected": -1.2716490030288696, "logps/chosen": -497.75762939453125, "logps/rejected": -1593.458984375, "loss": 0.1684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.9146790504455566, "rewards/margins": 11.19676399230957, "rewards/margins_max": 18.476064682006836, "rewards/margins_min": 3.9174625873565674, "rewards/margins_std": 10.294486045837402, "rewards/rejected": -14.111444473266602, "step": 1790 }, { "epoch": 0.57, "grad_norm": 1.9140625, "learning_rate": 2.3515876581495983e-06, "logits/chosen": -1.5976613759994507, "logits/rejected": -1.1677883863449097, "logps/chosen": -589.4625244140625, "logps/rejected": -1531.7314453125, "loss": 0.1462, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.779262065887451, "rewards/margins": 9.616241455078125, "rewards/margins_max": 15.82532787322998, "rewards/margins_min": 3.407156467437744, "rewards/margins_std": 8.780973434448242, "rewards/rejected": -12.395503044128418, "step": 1800 }, { "epoch": 0.57, "grad_norm": 0.33203125, "learning_rate": 2.3241457552152188e-06, "logits/chosen": -1.4134392738342285, "logits/rejected": -1.0529563426971436, "logps/chosen": -578.8505249023438, "logps/rejected": -1768.5361328125, "loss": 0.1572, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.193676471710205, "rewards/margins": 12.612680435180664, "rewards/margins_max": 20.445871353149414, "rewards/margins_min": 4.779488563537598, "rewards/margins_std": 11.077805519104004, "rewards/rejected": -15.806355476379395, "step": 1810 }, { "epoch": 0.57, "grad_norm": 1.15625, "learning_rate": 2.2967251303303876e-06, "logits/chosen": -1.4060922861099243, "logits/rejected": -1.2433264255523682, "logps/chosen": -471.23370361328125, "logps/rejected": -1373.26708984375, "loss": 0.3607, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.941974639892578, "rewards/margins": 8.82459831237793, "rewards/margins_max": 14.054105758666992, "rewards/margins_min": 3.59509015083313, "rewards/margins_std": 7.395641326904297, "rewards/rejected": -11.766572952270508, "step": 1820 }, { "epoch": 0.58, "grad_norm": 5.03125, "learning_rate": 2.269329101341745e-06, "logits/chosen": -1.555955171585083, "logits/rejected": -1.3775891065597534, "logps/chosen": -484.1917419433594, "logps/rejected": -1392.3919677734375, "loss": 0.1535, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.837075710296631, "rewards/margins": 8.895866394042969, "rewards/margins_max": 12.957498550415039, "rewards/margins_min": 4.834234237670898, "rewards/margins_std": 5.744015216827393, "rewards/rejected": -11.732942581176758, "step": 1830 }, { "epoch": 0.58, "grad_norm": 0.55859375, "learning_rate": 2.2419609831198695e-06, "logits/chosen": -1.4280954599380493, "logits/rejected": -1.3863157033920288, "logps/chosen": -489.63812255859375, "logps/rejected": -2068.576904296875, "loss": 0.2023, "rewards/accuracies": 1.0, "rewards/chosen": -2.8380837440490723, "rewards/margins": 15.058830261230469, "rewards/margins_max": 23.17379379272461, "rewards/margins_min": 6.943869113922119, "rewards/margins_std": 11.476289749145508, "rewards/rejected": -17.896915435791016, "step": 1840 }, { "epoch": 0.58, "grad_norm": 1.5703125, "learning_rate": 2.214624087158188e-06, "logits/chosen": -1.5892161130905151, "logits/rejected": -1.3412600755691528, "logps/chosen": -600.9816284179688, "logps/rejected": -1814.3232421875, "loss": 0.3013, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.3865344524383545, "rewards/margins": 11.768966674804688, "rewards/margins_max": 18.325162887573242, "rewards/margins_min": 5.212772846221924, "rewards/margins_std": 9.271859169006348, "rewards/rejected": -15.155502319335938, "step": 1850 }, { "epoch": 0.59, "grad_norm": 1.09375, "learning_rate": 2.1873217211722883e-06, "logits/chosen": -1.41153883934021, "logits/rejected": -1.1877845525741577, "logps/chosen": -477.3059997558594, "logps/rejected": -1664.2574462890625, "loss": 0.2323, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.7079248428344727, "rewards/margins": 12.086627006530762, "rewards/margins_max": 20.110389709472656, "rewards/margins_min": 4.062866687774658, "rewards/margins_std": 11.347312927246094, "rewards/rejected": -14.794553756713867, "step": 1860 }, { "epoch": 0.59, "grad_norm": 0.78125, "learning_rate": 2.1600571886996937e-06, "logits/chosen": -1.5755198001861572, "logits/rejected": -1.1792980432510376, "logps/chosen": -515.0977172851562, "logps/rejected": -1678.0728759765625, "loss": 0.1313, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.564347982406616, "rewards/margins": 11.869641304016113, "rewards/margins_max": 18.997394561767578, "rewards/margins_min": 4.74188756942749, "rewards/margins_std": 10.080163955688477, "rewards/rejected": -14.433990478515625, "step": 1870 }, { "epoch": 0.59, "grad_norm": 9.5625, "learning_rate": 2.1328337887001387e-06, "logits/chosen": -1.5203434228897095, "logits/rejected": -1.1894242763519287, "logps/chosen": -681.9379272460938, "logps/rejected": -1993.7366943359375, "loss": 0.087, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.278992652893066, "rewards/margins": 13.553190231323242, "rewards/margins_max": 20.062623977661133, "rewards/margins_min": 7.043754577636719, "rewards/margins_std": 9.205732345581055, "rewards/rejected": -17.832181930541992, "step": 1880 }, { "epoch": 0.6, "grad_norm": 0.15625, "learning_rate": 2.1056548151564064e-06, "logits/chosen": -1.4046363830566406, "logits/rejected": -1.1018089056015015, "logps/chosen": -512.6920776367188, "logps/rejected": -1592.6275634765625, "loss": 0.1041, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.973428726196289, "rewards/margins": 10.557977676391602, "rewards/margins_max": 15.843744277954102, "rewards/margins_min": 5.272213935852051, "rewards/margins_std": 7.4751996994018555, "rewards/rejected": -13.531407356262207, "step": 1890 }, { "epoch": 0.6, "grad_norm": 12.4375, "learning_rate": 2.078523556675752e-06, "logits/chosen": -1.666265845298767, "logits/rejected": -1.2846992015838623, "logps/chosen": -642.2943725585938, "logps/rejected": -1613.908203125, "loss": 0.2526, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.648852825164795, "rewards/margins": 9.809070587158203, "rewards/margins_max": 15.258506774902344, "rewards/margins_min": 4.359635353088379, "rewards/margins_std": 7.706665992736816, "rewards/rejected": -13.457923889160156, "step": 1900 }, { "epoch": 0.6, "grad_norm": 0.23828125, "learning_rate": 2.051443296091998e-06, "logits/chosen": -1.4331845045089722, "logits/rejected": -1.0446020364761353, "logps/chosen": -584.4759521484375, "logps/rejected": -1261.5855712890625, "loss": 0.0972, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0501112937927246, "rewards/margins": 7.305967807769775, "rewards/margins_max": 10.653976440429688, "rewards/margins_min": 3.957958698272705, "rewards/margins_std": 4.734799385070801, "rewards/rejected": -10.3560791015625, "step": 1910 }, { "epoch": 0.6, "grad_norm": 0.46875, "learning_rate": 2.0244173100683093e-06, "logits/chosen": -1.473258376121521, "logits/rejected": -1.2557618618011475, "logps/chosen": -553.75048828125, "logps/rejected": -1824.484375, "loss": 0.1291, "rewards/accuracies": 1.0, "rewards/chosen": -3.070702075958252, "rewards/margins": 12.991300582885742, "rewards/margins_max": 20.931209564208984, "rewards/margins_min": 5.051392555236816, "rewards/margins_std": 11.228726387023926, "rewards/rejected": -16.062002182006836, "step": 1920 }, { "epoch": 0.61, "grad_norm": 0.9609375, "learning_rate": 1.9974488687007274e-06, "logits/chosen": -1.3898913860321045, "logits/rejected": -1.1323245763778687, "logps/chosen": -500.98419189453125, "logps/rejected": -1475.350341796875, "loss": 0.0983, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.0772461891174316, "rewards/margins": 9.603334426879883, "rewards/margins_max": 14.657646179199219, "rewards/margins_min": 4.549025058746338, "rewards/margins_std": 7.147873878479004, "rewards/rejected": -12.680582046508789, "step": 1930 }, { "epoch": 0.61, "grad_norm": 1.890625, "learning_rate": 1.970541235122494e-06, "logits/chosen": -1.4766249656677246, "logits/rejected": -1.251630187034607, "logps/chosen": -529.0836181640625, "logps/rejected": -1698.4908447265625, "loss": 0.2151, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6294097900390625, "rewards/margins": 12.28985595703125, "rewards/margins_max": 19.14556121826172, "rewards/margins_min": 5.434152603149414, "rewards/margins_std": 9.695430755615234, "rewards/rejected": -14.919267654418945, "step": 1940 }, { "epoch": 0.61, "grad_norm": 2.03125, "learning_rate": 1.9436976651092143e-06, "logits/chosen": -1.5825145244598389, "logits/rejected": -1.2321441173553467, "logps/chosen": -616.10888671875, "logps/rejected": -1734.438232421875, "loss": 0.1443, "rewards/accuracies": 1.0, "rewards/chosen": -2.8859734535217285, "rewards/margins": 11.892366409301758, "rewards/margins_max": 16.552751541137695, "rewards/margins_min": 7.231983184814453, "rewards/margins_std": 6.590777397155762, "rewards/rejected": -14.778340339660645, "step": 1950 }, { "epoch": 0.62, "grad_norm": 14.125, "learning_rate": 1.91692140668492e-06, "logits/chosen": -1.4896900653839111, "logits/rejected": -1.1990612745285034, "logps/chosen": -513.3895263671875, "logps/rejected": -1140.274169921875, "loss": 0.3986, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.0260584354400635, "rewards/margins": 6.219557285308838, "rewards/margins_max": 10.312917709350586, "rewards/margins_min": 2.126194477081299, "rewards/margins_std": 5.788887977600098, "rewards/rejected": -9.245615005493164, "step": 1960 }, { "epoch": 0.62, "grad_norm": 6.53125, "learning_rate": 1.8902156997290571e-06, "logits/chosen": -1.4913126230239868, "logits/rejected": -1.1569576263427734, "logps/chosen": -536.8547973632812, "logps/rejected": -1584.836669921875, "loss": 0.1117, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1256766319274902, "rewards/margins": 10.562490463256836, "rewards/margins_max": 15.560125350952148, "rewards/margins_min": 5.564853668212891, "rewards/margins_std": 7.067723751068115, "rewards/rejected": -13.6881685256958, "step": 1970 }, { "epoch": 0.62, "grad_norm": 3.984375, "learning_rate": 1.8635837755844739e-06, "logits/chosen": -1.7155044078826904, "logits/rejected": -1.3497909307479858, "logps/chosen": -462.90716552734375, "logps/rejected": -1425.9698486328125, "loss": 0.1518, "rewards/accuracies": 1.0, "rewards/chosen": -2.666769027709961, "rewards/margins": 9.71937084197998, "rewards/margins_max": 15.215265274047852, "rewards/margins_min": 4.223477363586426, "rewards/margins_std": 7.77236795425415, "rewards/rejected": -12.386140823364258, "step": 1980 }, { "epoch": 0.63, "grad_norm": 1.7890625, "learning_rate": 1.8370288566664263e-06, "logits/chosen": -1.5542620420455933, "logits/rejected": -1.303271770477295, "logps/chosen": -538.37255859375, "logps/rejected": -1543.0211181640625, "loss": 0.1435, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1137211322784424, "rewards/margins": 10.367592811584473, "rewards/margins_max": 16.86190414428711, "rewards/margins_min": 3.873281478881836, "rewards/margins_std": 9.184343338012695, "rewards/rejected": -13.481313705444336, "step": 1990 }, { "epoch": 0.63, "grad_norm": 4.5625, "learning_rate": 1.8105541560726786e-06, "logits/chosen": -1.6579090356826782, "logits/rejected": -1.2486917972564697, "logps/chosen": -563.3258666992188, "logps/rejected": -1781.8521728515625, "loss": 0.1214, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.4339842796325684, "rewards/margins": 12.411375045776367, "rewards/margins_max": 17.969219207763672, "rewards/margins_min": 6.8535261154174805, "rewards/margins_std": 7.859982967376709, "rewards/rejected": -15.845357894897461, "step": 2000 }, { "epoch": 0.63, "grad_norm": 3.859375, "learning_rate": 1.784162877194719e-06, "logits/chosen": -1.5229161977767944, "logits/rejected": -1.1513035297393799, "logps/chosen": -524.3636474609375, "logps/rejected": -1594.617919921875, "loss": 0.1073, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.86541748046875, "rewards/margins": 11.076706886291504, "rewards/margins_max": 16.567943572998047, "rewards/margins_min": 5.585470199584961, "rewards/margins_std": 7.765781402587891, "rewards/rejected": -13.94212532043457, "step": 2010 }, { "epoch": 0.64, "grad_norm": 11.25, "learning_rate": 1.7578582133301572e-06, "logits/chosen": -1.3213260173797607, "logits/rejected": -1.2071130275726318, "logps/chosen": -654.8223266601562, "logps/rejected": -2169.09228515625, "loss": 0.1845, "rewards/accuracies": 0.875, "rewards/chosen": -4.209962368011475, "rewards/margins": 14.692832946777344, "rewards/margins_max": 24.164161682128906, "rewards/margins_min": 5.221506118774414, "rewards/margins_std": 13.394479751586914, "rewards/rejected": -18.90279769897461, "step": 2020 }, { "epoch": 0.64, "grad_norm": 12.9375, "learning_rate": 1.7316433472963428e-06, "logits/chosen": -1.656837821006775, "logits/rejected": -1.4358789920806885, "logps/chosen": -638.5806884765625, "logps/rejected": -1915.6451416015625, "loss": 0.1221, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5313782691955566, "rewards/margins": 13.209005355834961, "rewards/margins_max": 20.33796501159668, "rewards/margins_min": 6.080048561096191, "rewards/margins_std": 10.081869125366211, "rewards/rejected": -16.74038314819336, "step": 2030 }, { "epoch": 0.64, "grad_norm": 4.34375, "learning_rate": 1.7055214510452462e-06, "logits/chosen": -1.490957498550415, "logits/rejected": -1.1374667882919312, "logps/chosen": -700.1129760742188, "logps/rejected": -2816.78173828125, "loss": 0.1063, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6439292430877686, "rewards/margins": 21.763607025146484, "rewards/margins_max": 35.701961517333984, "rewards/margins_min": 7.825249671936035, "rewards/margins_std": 19.711811065673828, "rewards/rejected": -25.407535552978516, "step": 2040 }, { "epoch": 0.65, "grad_norm": 4.0, "learning_rate": 1.6794956852796618e-06, "logits/chosen": -1.5615451335906982, "logits/rejected": -1.3411719799041748, "logps/chosen": -565.6097412109375, "logps/rejected": -1728.943359375, "loss": 0.0938, "rewards/accuracies": 1.0, "rewards/chosen": -3.4684576988220215, "rewards/margins": 11.626744270324707, "rewards/margins_max": 18.29702377319336, "rewards/margins_min": 4.956464767456055, "rewards/margins_std": 9.43320083618164, "rewards/rejected": -15.095202445983887, "step": 2050 }, { "epoch": 0.65, "grad_norm": 13.5625, "learning_rate": 1.6535691990707642e-06, "logits/chosen": -1.5745326280593872, "logits/rejected": -1.2651867866516113, "logps/chosen": -564.30224609375, "logps/rejected": -2381.66162109375, "loss": 0.2089, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5344910621643066, "rewards/margins": 17.981985092163086, "rewards/margins_max": 29.834253311157227, "rewards/margins_min": 6.129714488983154, "rewards/margins_std": 16.761640548706055, "rewards/rejected": -21.5164737701416, "step": 2060 }, { "epoch": 0.65, "grad_norm": 0.287109375, "learning_rate": 1.6277451294770835e-06, "logits/chosen": -1.5891001224517822, "logits/rejected": -1.2852892875671387, "logps/chosen": -548.130615234375, "logps/rejected": -1428.930419921875, "loss": 0.1449, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.710313320159912, "rewards/margins": 8.998636245727539, "rewards/margins_max": 14.026883125305176, "rewards/margins_min": 3.9703896045684814, "rewards/margins_std": 7.111013889312744, "rewards/rejected": -12.708948135375977, "step": 2070 }, { "epoch": 0.66, "grad_norm": 4.84375, "learning_rate": 1.6020266011649176e-06, "logits/chosen": -1.4778506755828857, "logits/rejected": -1.1584922075271606, "logps/chosen": -663.0747680664062, "logps/rejected": -1830.345947265625, "loss": 0.1917, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.136785507202148, "rewards/margins": 11.866098403930664, "rewards/margins_max": 20.245691299438477, "rewards/margins_min": 3.486506223678589, "rewards/margins_std": 11.850533485412598, "rewards/rejected": -16.002885818481445, "step": 2080 }, { "epoch": 0.66, "grad_norm": 0.01287841796875, "learning_rate": 1.576416726030261e-06, "logits/chosen": -1.3806267976760864, "logits/rejected": -1.3429162502288818, "logps/chosen": -581.0066528320312, "logps/rejected": -2020.9459228515625, "loss": 0.13, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.6570827960968018, "rewards/margins": 13.965600967407227, "rewards/margins_max": 22.282276153564453, "rewards/margins_min": 5.648922920227051, "rewards/margins_std": 11.761558532714844, "rewards/rejected": -17.622684478759766, "step": 2090 }, { "epoch": 0.66, "grad_norm": 0.578125, "learning_rate": 1.5509186028222657e-06, "logits/chosen": -1.4945251941680908, "logits/rejected": -1.123812198638916, "logps/chosen": -578.8905639648438, "logps/rejected": -2480.041259765625, "loss": 0.1408, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.3431968688964844, "rewards/margins": 19.43141746520996, "rewards/margins_max": 33.1664924621582, "rewards/margins_min": 5.696345806121826, "rewards/margins_std": 19.424325942993164, "rewards/rejected": -22.774616241455078, "step": 2100 }, { "epoch": 0.66, "grad_norm": 16.125, "learning_rate": 1.5255353167683017e-06, "logits/chosen": -1.5955666303634644, "logits/rejected": -1.2716028690338135, "logps/chosen": -535.6474609375, "logps/rejected": -1721.0478515625, "loss": 0.1891, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.3446204662323, "rewards/margins": 11.946348190307617, "rewards/margins_max": 19.05233383178711, "rewards/margins_min": 4.840361595153809, "rewards/margins_std": 10.049383163452148, "rewards/rejected": -15.290969848632812, "step": 2110 }, { "epoch": 0.67, "grad_norm": 1.40625, "learning_rate": 1.500269939200648e-06, "logits/chosen": -1.5924263000488281, "logits/rejected": -1.3786189556121826, "logps/chosen": -581.8662109375, "logps/rejected": -1422.6884765625, "loss": 0.1991, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.9787490367889404, "rewards/margins": 8.33303165435791, "rewards/margins_max": 13.738202095031738, "rewards/margins_min": 2.9278597831726074, "rewards/margins_std": 7.644065856933594, "rewards/rejected": -12.31178092956543, "step": 2120 }, { "epoch": 0.67, "grad_norm": 10.9375, "learning_rate": 1.4751255271848665e-06, "logits/chosen": -1.540938138961792, "logits/rejected": -1.3453645706176758, "logps/chosen": -561.5753173828125, "logps/rejected": -1571.74169921875, "loss": 0.1027, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.668487071990967, "rewards/margins": 9.975430488586426, "rewards/margins_max": 15.654937744140625, "rewards/margins_min": 4.29592227935791, "rewards/margins_std": 8.032035827636719, "rewards/rejected": -13.643916130065918, "step": 2130 }, { "epoch": 0.67, "grad_norm": 0.6640625, "learning_rate": 1.4501051231499042e-06, "logits/chosen": -1.518028974533081, "logits/rejected": -1.1507012844085693, "logps/chosen": -533.4281005859375, "logps/rejected": -1911.723388671875, "loss": 0.1175, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.922112464904785, "rewards/margins": 13.367526054382324, "rewards/margins_max": 23.11964225769043, "rewards/margins_min": 3.615410566329956, "rewards/margins_std": 13.79157543182373, "rewards/rejected": -16.28963851928711, "step": 2140 }, { "epoch": 0.68, "grad_norm": 1.296875, "learning_rate": 1.4252117545199639e-06, "logits/chosen": -1.3974970579147339, "logits/rejected": -1.500832438468933, "logps/chosen": -433.35089111328125, "logps/rejected": -1680.6429443359375, "loss": 0.274, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0182032585144043, "rewards/margins": 11.945356369018555, "rewards/margins_max": 20.45294761657715, "rewards/margins_min": 3.4377639293670654, "rewards/margins_std": 12.0315523147583, "rewards/rejected": -14.9635591506958, "step": 2150 }, { "epoch": 0.68, "grad_norm": 0.0078125, "learning_rate": 1.4004484333481911e-06, "logits/chosen": -1.5064579248428345, "logits/rejected": -1.2587788105010986, "logps/chosen": -525.6350708007812, "logps/rejected": -1384.8411865234375, "loss": 0.2889, "rewards/accuracies": 0.875, "rewards/chosen": -3.4120688438415527, "rewards/margins": 8.54577922821045, "rewards/margins_max": 13.886482238769531, "rewards/margins_min": 3.205078125, "rewards/margins_std": 7.552893161773682, "rewards/rejected": -11.957849502563477, "step": 2160 }, { "epoch": 0.68, "grad_norm": 4.875, "learning_rate": 1.375818155952222e-06, "logits/chosen": -1.511816382408142, "logits/rejected": -1.2979012727737427, "logps/chosen": -462.0762634277344, "logps/rejected": -1462.7767333984375, "loss": 0.1074, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.6236302852630615, "rewards/margins": 9.779306411743164, "rewards/margins_max": 16.093631744384766, "rewards/margins_min": 3.4649829864501953, "rewards/margins_std": 8.929801940917969, "rewards/rejected": -12.402936935424805, "step": 2170 }, { "epoch": 0.69, "grad_norm": 11.125, "learning_rate": 1.3513239025516312e-06, "logits/chosen": -1.5850474834442139, "logits/rejected": -1.4049034118652344, "logps/chosen": -554.8724975585938, "logps/rejected": -1975.818359375, "loss": 0.1804, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6299610137939453, "rewards/margins": 14.073150634765625, "rewards/margins_max": 23.436885833740234, "rewards/margins_min": 4.709414482116699, "rewards/margins_std": 13.24232292175293, "rewards/rejected": -17.703113555908203, "step": 2180 }, { "epoch": 0.69, "grad_norm": 0.80078125, "learning_rate": 1.3269686369073348e-06, "logits/chosen": -1.547577142715454, "logits/rejected": -1.209017276763916, "logps/chosen": -588.4271850585938, "logps/rejected": -2137.411376953125, "loss": 0.1547, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.2934679985046387, "rewards/margins": 15.912363052368164, "rewards/margins_max": 25.526660919189453, "rewards/margins_min": 6.298065185546875, "rewards/margins_std": 13.596672058105469, "rewards/rejected": -19.20583152770996, "step": 2190 }, { "epoch": 0.69, "grad_norm": 0.47265625, "learning_rate": 1.3027553059629778e-06, "logits/chosen": -1.4048335552215576, "logits/rejected": -1.1918952465057373, "logps/chosen": -498.7955627441406, "logps/rejected": -2174.997802734375, "loss": 0.1584, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.917642116546631, "rewards/margins": 16.495616912841797, "rewards/margins_max": 23.698152542114258, "rewards/margins_min": 9.293082237243652, "rewards/margins_std": 10.185922622680664, "rewards/rejected": -19.41326141357422, "step": 2200 }, { "epoch": 0.7, "grad_norm": 3.984375, "learning_rate": 1.2786868394883617e-06, "logits/chosen": -1.5030672550201416, "logits/rejected": -1.0825165510177612, "logps/chosen": -578.2425537109375, "logps/rejected": -1289.3734130859375, "loss": 0.2603, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.369215488433838, "rewards/margins": 7.822723388671875, "rewards/margins_max": 13.195724487304688, "rewards/margins_min": 2.4497230052948, "rewards/margins_std": 7.598570346832275, "rewards/rejected": -11.191938400268555, "step": 2210 }, { "epoch": 0.7, "grad_norm": 1.15625, "learning_rate": 1.2547661497249424e-06, "logits/chosen": -1.641648530960083, "logits/rejected": -1.2975002527236938, "logps/chosen": -525.0863037109375, "logps/rejected": -1528.902099609375, "loss": 0.1054, "rewards/accuracies": 0.875, "rewards/chosen": -2.701660633087158, "rewards/margins": 10.769542694091797, "rewards/margins_max": 18.1401309967041, "rewards/margins_min": 3.3989555835723877, "rewards/margins_std": 10.423584938049316, "rewards/rejected": -13.471203804016113, "step": 2220 }, { "epoch": 0.7, "grad_norm": 3.625, "learning_rate": 1.230996131033461e-06, "logits/chosen": -1.5188440084457397, "logits/rejected": -1.2162914276123047, "logps/chosen": -498.5997619628906, "logps/rejected": -1448.3214111328125, "loss": 0.1419, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.8506922721862793, "rewards/margins": 9.72119140625, "rewards/margins_max": 14.393712997436523, "rewards/margins_min": 5.04866886138916, "rewards/margins_std": 6.607945442199707, "rewards/rejected": -12.571883201599121, "step": 2230 }, { "epoch": 0.71, "grad_norm": 0.11767578125, "learning_rate": 1.2073796595437262e-06, "logits/chosen": -1.6625356674194336, "logits/rejected": -1.2526956796646118, "logps/chosen": -525.0848388671875, "logps/rejected": -1451.570556640625, "loss": 0.086, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.856308698654175, "rewards/margins": 9.767531394958496, "rewards/margins_max": 15.09489917755127, "rewards/margins_min": 4.440166473388672, "rewards/margins_std": 7.534033298492432, "rewards/rejected": -12.623841285705566, "step": 2240 }, { "epoch": 0.71, "grad_norm": 1.6484375, "learning_rate": 1.1839195928066101e-06, "logits/chosen": -1.7187540531158447, "logits/rejected": -1.3239184617996216, "logps/chosen": -541.1563720703125, "logps/rejected": -1764.908203125, "loss": 0.2656, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.0002570152282715, "rewards/margins": 12.637992858886719, "rewards/margins_max": 20.210735321044922, "rewards/margins_min": 5.065249919891357, "rewards/margins_std": 10.709476470947266, "rewards/rejected": -15.638249397277832, "step": 2250 }, { "epoch": 0.71, "grad_norm": 2.109375, "learning_rate": 1.1606187694482896e-06, "logits/chosen": -1.4333114624023438, "logits/rejected": -1.2259962558746338, "logps/chosen": -660.5386962890625, "logps/rejected": -2046.311767578125, "loss": 0.1141, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.156102180480957, "rewards/margins": 14.356404304504395, "rewards/margins_max": 21.486778259277344, "rewards/margins_min": 7.226031303405762, "rewards/margins_std": 10.083871841430664, "rewards/rejected": -17.512508392333984, "step": 2260 }, { "epoch": 0.72, "grad_norm": 7.1875, "learning_rate": 1.1374800088267768e-06, "logits/chosen": -1.4923628568649292, "logits/rejected": -1.036879539489746, "logps/chosen": -591.9725952148438, "logps/rejected": -1538.219970703125, "loss": 0.1484, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.318856716156006, "rewards/margins": 10.050863265991211, "rewards/margins_max": 16.637432098388672, "rewards/margins_min": 3.4642951488494873, "rewards/margins_std": 9.314813613891602, "rewards/rejected": -13.369720458984375, "step": 2270 }, { "epoch": 0.72, "grad_norm": 0.609375, "learning_rate": 1.1145061106907804e-06, "logits/chosen": -1.5123519897460938, "logits/rejected": -1.381981611251831, "logps/chosen": -510.7267150878906, "logps/rejected": -2111.06396484375, "loss": 0.1678, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.9295575618743896, "rewards/margins": 15.46514892578125, "rewards/margins_max": 24.086877822875977, "rewards/margins_min": 6.843419551849365, "rewards/margins_std": 12.192965507507324, "rewards/rejected": -18.39470672607422, "step": 2280 }, { "epoch": 0.72, "grad_norm": 10.875, "learning_rate": 1.0916998548409449e-06, "logits/chosen": -1.4227778911590576, "logits/rejected": -1.327239751815796, "logps/chosen": -492.4085998535156, "logps/rejected": -2177.32177734375, "loss": 0.1216, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.802885055541992, "rewards/margins": 16.447601318359375, "rewards/margins_max": 25.75341796875, "rewards/margins_min": 7.141783237457275, "rewards/margins_std": 13.160412788391113, "rewards/rejected": -19.250484466552734, "step": 2290 }, { "epoch": 0.72, "grad_norm": 3.71875, "learning_rate": 1.069064000793498e-06, "logits/chosen": -1.5011472702026367, "logits/rejected": -1.0257246494293213, "logps/chosen": -586.6893310546875, "logps/rejected": -1570.8746337890625, "loss": 0.1362, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.1940226554870605, "rewards/margins": 10.318269729614258, "rewards/margins_max": 16.737285614013672, "rewards/margins_min": 3.8992531299591064, "rewards/margins_std": 9.077858924865723, "rewards/rejected": -13.512290954589844, "step": 2300 }, { "epoch": 0.73, "grad_norm": 17.625, "learning_rate": 1.0466012874463508e-06, "logits/chosen": -1.4119497537612915, "logits/rejected": -1.2268279790878296, "logps/chosen": -589.5607299804688, "logps/rejected": -1788.9788818359375, "loss": 0.179, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1833550930023193, "rewards/margins": 12.280914306640625, "rewards/margins_max": 18.877552032470703, "rewards/margins_min": 5.684277534484863, "rewards/margins_std": 9.32905387878418, "rewards/rejected": -15.464271545410156, "step": 2310 }, { "epoch": 0.73, "grad_norm": 1.5, "learning_rate": 1.0243144327477015e-06, "logits/chosen": -1.6249080896377563, "logits/rejected": -1.1828008890151978, "logps/chosen": -485.2342224121094, "logps/rejected": -1421.8602294921875, "loss": 0.1631, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.5701119899749756, "rewards/margins": 9.575872421264648, "rewards/margins_max": 14.847213745117188, "rewards/margins_min": 4.304533004760742, "rewards/margins_std": 7.454801082611084, "rewards/rejected": -12.145986557006836, "step": 2320 }, { "epoch": 0.73, "grad_norm": 0.59375, "learning_rate": 1.0022061333671649e-06, "logits/chosen": -1.4596678018569946, "logits/rejected": -1.163874864578247, "logps/chosen": -494.34228515625, "logps/rejected": -1985.4769287109375, "loss": 0.1316, "rewards/accuracies": 1.0, "rewards/chosen": -2.6837120056152344, "rewards/margins": 15.144342422485352, "rewards/margins_max": 24.748046875, "rewards/margins_min": 5.540638446807861, "rewards/margins_std": 13.581687927246094, "rewards/rejected": -17.828052520751953, "step": 2330 }, { "epoch": 0.74, "grad_norm": 26.25, "learning_rate": 9.802790643694817e-07, "logits/chosen": -1.541244626045227, "logits/rejected": -1.4795812368392944, "logps/chosen": -583.5311889648438, "logps/rejected": -1613.3099365234375, "loss": 0.1727, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.8347973823547363, "rewards/margins": 10.287752151489258, "rewards/margins_max": 16.781322479248047, "rewards/margins_min": 3.7941813468933105, "rewards/margins_std": 9.183296203613281, "rewards/rejected": -14.122549057006836, "step": 2340 }, { "epoch": 0.74, "grad_norm": 2.265625, "learning_rate": 9.585358788908395e-07, "logits/chosen": -1.5370924472808838, "logits/rejected": -1.3602590560913086, "logps/chosen": -525.8290405273438, "logps/rejected": -2051.8203125, "loss": 0.1576, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9414329528808594, "rewards/margins": 15.10389232635498, "rewards/margins_max": 26.046443939208984, "rewards/margins_min": 4.161340236663818, "rewards/margins_std": 15.475103378295898, "rewards/rejected": -18.045324325561523, "step": 2350 }, { "epoch": 0.74, "grad_norm": 3.734375, "learning_rate": 9.369792078178491e-07, "logits/chosen": -1.6488637924194336, "logits/rejected": -1.4893319606781006, "logps/chosen": -568.359375, "logps/rejected": -2058.3525390625, "loss": 0.4887, "rewards/accuracies": 1.0, "rewards/chosen": -3.25250506401062, "rewards/margins": 15.005627632141113, "rewards/margins_max": 20.63724708557129, "rewards/margins_min": 9.374009132385254, "rewards/margins_std": 7.964312553405762, "rewards/rejected": -18.258129119873047, "step": 2360 }, { "epoch": 0.75, "grad_norm": 19.125, "learning_rate": 9.156116594692097e-07, "logits/chosen": -1.618363618850708, "logits/rejected": -1.1712907552719116, "logps/chosen": -550.4625244140625, "logps/rejected": -1597.6376953125, "loss": 0.1817, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.1480445861816406, "rewards/margins": 10.727932929992676, "rewards/margins_max": 17.022151947021484, "rewards/margins_min": 4.433710098266602, "rewards/margins_std": 8.901372909545898, "rewards/rejected": -13.8759765625, "step": 2370 }, { "epoch": 0.75, "grad_norm": 1.8125, "learning_rate": 8.944358192801103e-07, "logits/chosen": -1.6205739974975586, "logits/rejected": -1.2058194875717163, "logps/chosen": -495.60015869140625, "logps/rejected": -1743.065673828125, "loss": 0.1038, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.684584379196167, "rewards/margins": 12.865442276000977, "rewards/margins_max": 19.519107818603516, "rewards/margins_min": 6.211775302886963, "rewards/margins_std": 9.409704208374023, "rewards/rejected": -15.550024032592773, "step": 2380 }, { "epoch": 0.75, "grad_norm": 4.9375, "learning_rate": 8.734542494893955e-07, "logits/chosen": -1.6140270233154297, "logits/rejected": -1.5208282470703125, "logps/chosen": -522.6798095703125, "logps/rejected": -1974.721923828125, "loss": 0.2237, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.987030506134033, "rewards/margins": 14.091033935546875, "rewards/margins_max": 23.622156143188477, "rewards/margins_min": 4.55991268157959, "rewards/margins_std": 13.479040145874023, "rewards/rejected": -17.07806396484375, "step": 2390 }, { "epoch": 0.76, "grad_norm": 7.90625, "learning_rate": 8.526694888295356e-07, "logits/chosen": -1.53464674949646, "logits/rejected": -1.3867006301879883, "logps/chosen": -515.68994140625, "logps/rejected": -1828.5380859375, "loss": 0.1258, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8848297595977783, "rewards/margins": 13.052827835083008, "rewards/margins_max": 18.821996688842773, "rewards/margins_min": 7.283656120300293, "rewards/margins_std": 8.158838272094727, "rewards/rejected": -15.937657356262207, "step": 2400 }, { "epoch": 0.76, "grad_norm": 1.8125, "learning_rate": 8.320840522194507e-07, "logits/chosen": -1.4935245513916016, "logits/rejected": -1.3408067226409912, "logps/chosen": -532.34765625, "logps/rejected": -1617.862548828125, "loss": 0.161, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.95365834236145, "rewards/margins": 10.888130187988281, "rewards/margins_max": 17.86824607849121, "rewards/margins_min": 3.9080147743225098, "rewards/margins_std": 9.871374130249023, "rewards/rejected": -13.841787338256836, "step": 2410 }, { "epoch": 0.76, "grad_norm": 4.4375, "learning_rate": 8.117004304602052e-07, "logits/chosen": -1.562991976737976, "logits/rejected": -1.2025073766708374, "logps/chosen": -648.80029296875, "logps/rejected": -1497.7991943359375, "loss": 0.1952, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.700782060623169, "rewards/margins": 9.072282791137695, "rewards/margins_max": 14.986943244934082, "rewards/margins_min": 3.157620906829834, "rewards/margins_std": 8.364594459533691, "rewards/rejected": -12.773064613342285, "step": 2420 }, { "epoch": 0.77, "grad_norm": 1.2734375, "learning_rate": 7.915210899336284e-07, "logits/chosen": -1.7214120626449585, "logits/rejected": -1.458532691001892, "logps/chosen": -510.141845703125, "logps/rejected": -1868.0325927734375, "loss": 0.1117, "rewards/accuracies": 1.0, "rewards/chosen": -2.908207416534424, "rewards/margins": 13.193037033081055, "rewards/margins_max": 20.153043746948242, "rewards/margins_min": 6.2330322265625, "rewards/margins_std": 9.842935562133789, "rewards/rejected": -16.101245880126953, "step": 2430 }, { "epoch": 0.77, "grad_norm": 0.1953125, "learning_rate": 7.715484723038838e-07, "logits/chosen": -1.3019468784332275, "logits/rejected": -1.161091923713684, "logps/chosen": -487.67724609375, "logps/rejected": -1867.3681640625, "loss": 0.2212, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.646726608276367, "rewards/margins": 13.512460708618164, "rewards/margins_max": 21.958669662475586, "rewards/margins_min": 5.066250324249268, "rewards/margins_std": 11.944744110107422, "rewards/rejected": -16.1591854095459, "step": 2440 }, { "epoch": 0.77, "grad_norm": 0.205078125, "learning_rate": 7.517849942220348e-07, "logits/chosen": -1.38827645778656, "logits/rejected": -1.091671109199524, "logps/chosen": -601.8095092773438, "logps/rejected": -1584.5819091796875, "loss": 0.1188, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.90714693069458, "rewards/margins": 9.80831241607666, "rewards/margins_max": 14.877908706665039, "rewards/margins_min": 4.738718032836914, "rewards/margins_std": 7.169489860534668, "rewards/rejected": -13.715458869934082, "step": 2450 }, { "epoch": 0.77, "grad_norm": 4.5625, "learning_rate": 7.322330470336314e-07, "logits/chosen": -1.420436143875122, "logits/rejected": -1.3260561227798462, "logps/chosen": -494.6438903808594, "logps/rejected": -1588.2587890625, "loss": 0.104, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.879929304122925, "rewards/margins": 11.048105239868164, "rewards/margins_max": 16.661272048950195, "rewards/margins_min": 5.434937953948975, "rewards/margins_std": 7.938216209411621, "rewards/rejected": -13.928033828735352, "step": 2460 }, { "epoch": 0.78, "grad_norm": 2.828125, "learning_rate": 7.128949964893648e-07, "logits/chosen": -1.5485414266586304, "logits/rejected": -1.2311375141143799, "logps/chosen": -573.132080078125, "logps/rejected": -1600.142822265625, "loss": 0.2518, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.2259268760681152, "rewards/margins": 10.486474990844727, "rewards/margins_max": 15.399114608764648, "rewards/margins_min": 5.5738325119018555, "rewards/margins_std": 6.947523593902588, "rewards/rejected": -13.712400436401367, "step": 2470 }, { "epoch": 0.78, "grad_norm": 0.9453125, "learning_rate": 6.937731824588143e-07, "logits/chosen": -1.4667555093765259, "logits/rejected": -1.3924285173416138, "logps/chosen": -531.763916015625, "logps/rejected": -1142.96142578125, "loss": 0.1528, "rewards/accuracies": 0.875, "rewards/chosen": -3.6806187629699707, "rewards/margins": 6.147883892059326, "rewards/margins_max": 9.388947486877441, "rewards/margins_min": 2.9068212509155273, "rewards/margins_std": 4.583555698394775, "rewards/rejected": -9.828502655029297, "step": 2480 }, { "epoch": 0.78, "grad_norm": 1.90625, "learning_rate": 6.74869918647325e-07, "logits/chosen": -1.3583132028579712, "logits/rejected": -1.1237436532974243, "logps/chosen": -545.1788940429688, "logps/rejected": -1591.46044921875, "loss": 0.1153, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9877545833587646, "rewards/margins": 10.712733268737793, "rewards/margins_max": 15.584787368774414, "rewards/margins_min": 5.840681076049805, "rewards/margins_std": 6.890122890472412, "rewards/rejected": -13.700488090515137, "step": 2490 }, { "epoch": 0.79, "grad_norm": 2.765625, "learning_rate": 6.561874923160591e-07, "logits/chosen": -1.5355894565582275, "logits/rejected": -1.1774029731750488, "logps/chosen": -546.0108642578125, "logps/rejected": -1723.448974609375, "loss": 0.0768, "rewards/accuracies": 1.0, "rewards/chosen": -3.2266223430633545, "rewards/margins": 12.48703670501709, "rewards/margins_max": 18.832082748413086, "rewards/margins_min": 6.141989231109619, "rewards/margins_std": 8.97325325012207, "rewards/rejected": -15.713659286499023, "step": 2500 }, { "epoch": 0.79, "grad_norm": 1.390625, "learning_rate": 6.377281640052358e-07, "logits/chosen": -1.7051270008087158, "logits/rejected": -1.4254144430160522, "logps/chosen": -469.79913330078125, "logps/rejected": -1844.483642578125, "loss": 0.1454, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.7268261909484863, "rewards/margins": 13.270780563354492, "rewards/margins_max": 20.773021697998047, "rewards/margins_min": 5.7685441970825195, "rewards/margins_std": 10.60976791381836, "rewards/rejected": -15.99760913848877, "step": 2510 }, { "epoch": 0.79, "grad_norm": 0.2216796875, "learning_rate": 6.194941672606131e-07, "logits/chosen": -1.5394561290740967, "logits/rejected": -1.3444119691848755, "logps/chosen": -449.86639404296875, "logps/rejected": -1816.393310546875, "loss": 0.1036, "rewards/accuracies": 1.0, "rewards/chosen": -2.6189093589782715, "rewards/margins": 13.65289306640625, "rewards/margins_max": 22.08847999572754, "rewards/margins_min": 5.217305660247803, "rewards/margins_std": 11.92972183227539, "rewards/rejected": -16.271804809570312, "step": 2520 }, { "epoch": 0.8, "grad_norm": 0.10498046875, "learning_rate": 6.01487708363232e-07, "logits/chosen": -1.5326162576675415, "logits/rejected": -1.2514145374298096, "logps/chosen": -559.52685546875, "logps/rejected": -1785.101806640625, "loss": 0.1695, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.237330675125122, "rewards/margins": 12.128488540649414, "rewards/margins_max": 17.351224899291992, "rewards/margins_min": 6.905752658843994, "rewards/margins_std": 7.386063575744629, "rewards/rejected": -15.365817070007324, "step": 2530 }, { "epoch": 0.8, "grad_norm": 0.8125, "learning_rate": 5.837109660624607e-07, "logits/chosen": -1.5613453388214111, "logits/rejected": -1.2829174995422363, "logps/chosen": -627.0020141601562, "logps/rejected": -1759.3873291015625, "loss": 0.1456, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.9718761444091797, "rewards/margins": 11.263439178466797, "rewards/margins_max": 16.407276153564453, "rewards/margins_min": 6.119601249694824, "rewards/margins_std": 7.274483680725098, "rewards/rejected": -15.235315322875977, "step": 2540 }, { "epoch": 0.8, "grad_norm": 0.765625, "learning_rate": 5.661660913123673e-07, "logits/chosen": -1.7146743535995483, "logits/rejected": -1.536327600479126, "logps/chosen": -526.9888305664062, "logps/rejected": -1644.693603515625, "loss": 0.0667, "rewards/accuracies": 1.0, "rewards/chosen": -3.136125087738037, "rewards/margins": 11.309582710266113, "rewards/margins_max": 17.983318328857422, "rewards/margins_min": 4.63584566116333, "rewards/margins_std": 9.438088417053223, "rewards/rejected": -14.445707321166992, "step": 2550 }, { "epoch": 0.81, "grad_norm": 1.1875, "learning_rate": 5.488552070114633e-07, "logits/chosen": -1.413991928100586, "logits/rejected": -1.246726155281067, "logps/chosen": -629.1475219726562, "logps/rejected": -2350.851318359375, "loss": 0.1988, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.112383842468262, "rewards/margins": 17.097023010253906, "rewards/margins_max": 28.80392837524414, "rewards/margins_min": 5.390118598937988, "rewards/margins_std": 16.556058883666992, "rewards/rejected": -21.209407806396484, "step": 2560 }, { "epoch": 0.81, "grad_norm": 0.75, "learning_rate": 5.317804077458324e-07, "logits/chosen": -1.5566442012786865, "logits/rejected": -1.168034315109253, "logps/chosen": -623.1751708984375, "logps/rejected": -1723.6129150390625, "loss": 0.1571, "rewards/accuracies": 0.875, "rewards/chosen": -3.3873374462127686, "rewards/margins": 11.160648345947266, "rewards/margins_max": 18.534343719482422, "rewards/margins_min": 3.7869505882263184, "rewards/margins_std": 10.427982330322266, "rewards/rejected": -14.54798698425293, "step": 2570 }, { "epoch": 0.81, "grad_norm": 3.078125, "learning_rate": 5.149437595356902e-07, "logits/chosen": -1.4799290895462036, "logits/rejected": -1.1635984182357788, "logps/chosen": -605.4791259765625, "logps/rejected": -1606.4259033203125, "loss": 0.1173, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.5745937824249268, "rewards/margins": 10.339719772338867, "rewards/margins_max": 16.66402816772461, "rewards/margins_min": 4.015409469604492, "rewards/margins_std": 8.943924903869629, "rewards/rejected": -13.914312362670898, "step": 2580 }, { "epoch": 0.82, "grad_norm": 4.4375, "learning_rate": 4.983472995854003e-07, "logits/chosen": -1.4681260585784912, "logits/rejected": -1.147685170173645, "logps/chosen": -592.9346923828125, "logps/rejected": -1304.0279541015625, "loss": 0.1887, "rewards/accuracies": 0.875, "rewards/chosen": -3.336982250213623, "rewards/margins": 7.98565673828125, "rewards/margins_max": 14.44367504119873, "rewards/margins_min": 1.5276384353637695, "rewards/margins_std": 9.133016586303711, "rewards/rejected": -11.322638511657715, "step": 2590 }, { "epoch": 0.82, "grad_norm": 3.796875, "learning_rate": 4.819930360369762e-07, "logits/chosen": -1.5900928974151611, "logits/rejected": -1.4449065923690796, "logps/chosen": -561.0192260742188, "logps/rejected": -1929.9560546875, "loss": 0.1423, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.454263687133789, "rewards/margins": 13.343027114868164, "rewards/margins_max": 23.15623664855957, "rewards/margins_min": 3.529816150665283, "rewards/margins_std": 13.877975463867188, "rewards/rejected": -16.797290802001953, "step": 2600 }, { "epoch": 0.82, "grad_norm": 1.75, "learning_rate": 4.658829477270996e-07, "logits/chosen": -1.6144227981567383, "logits/rejected": -1.315969467163086, "logps/chosen": -544.9019775390625, "logps/rejected": -1778.4368896484375, "loss": 0.2357, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.350193738937378, "rewards/margins": 11.645623207092285, "rewards/margins_max": 17.396743774414062, "rewards/margins_min": 5.89450216293335, "rewards/margins_std": 8.133313179016113, "rewards/rejected": -14.995816230773926, "step": 2610 }, { "epoch": 0.83, "grad_norm": 2.234375, "learning_rate": 4.5001898394768346e-07, "logits/chosen": -1.5682995319366455, "logits/rejected": -1.4007831811904907, "logps/chosen": -547.2261352539062, "logps/rejected": -1444.755126953125, "loss": 0.2085, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.3274428844451904, "rewards/margins": 9.016876220703125, "rewards/margins_max": 14.168905258178711, "rewards/margins_min": 3.8648476600646973, "rewards/margins_std": 7.286067962646484, "rewards/rejected": -12.344320297241211, "step": 2620 }, { "epoch": 0.83, "grad_norm": 1.0546875, "learning_rate": 4.344030642100133e-07, "logits/chosen": -1.7077690362930298, "logits/rejected": -1.598008632659912, "logps/chosen": -611.0703125, "logps/rejected": -2115.15673828125, "loss": 0.2851, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.431548595428467, "rewards/margins": 15.340169906616211, "rewards/margins_max": 25.29729461669922, "rewards/margins_min": 5.383046627044678, "rewards/margins_std": 14.081500053405762, "rewards/rejected": -18.77172088623047, "step": 2630 }, { "epoch": 0.83, "grad_norm": 1.34375, "learning_rate": 4.190370780124864e-07, "logits/chosen": -1.393789291381836, "logits/rejected": -1.2228848934173584, "logps/chosen": -481.9505310058594, "logps/rejected": -1663.0035400390625, "loss": 0.1675, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.922569513320923, "rewards/margins": 11.292458534240723, "rewards/margins_max": 18.119579315185547, "rewards/margins_min": 4.46533727645874, "rewards/margins_std": 9.655006408691406, "rewards/rejected": -14.215028762817383, "step": 2640 }, { "epoch": 0.83, "grad_norm": 6.3125, "learning_rate": 4.0392288461199053e-07, "logits/chosen": -1.3990025520324707, "logits/rejected": -1.3038866519927979, "logps/chosen": -567.4427490234375, "logps/rejected": -2022.3857421875, "loss": 0.1482, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.3987483978271484, "rewards/margins": 14.682683944702148, "rewards/margins_max": 23.64312744140625, "rewards/margins_min": 5.722237586975098, "rewards/margins_std": 12.671981811523438, "rewards/rejected": -18.081430435180664, "step": 2650 }, { "epoch": 0.84, "grad_norm": 5.625, "learning_rate": 3.8906231279893433e-07, "logits/chosen": -1.3695005178451538, "logits/rejected": -1.2840726375579834, "logps/chosen": -545.4678955078125, "logps/rejected": -2054.255126953125, "loss": 0.2044, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.088050365447998, "rewards/margins": 15.608365058898926, "rewards/margins_max": 25.990718841552734, "rewards/margins_min": 5.226010322570801, "rewards/margins_std": 14.682866096496582, "rewards/rejected": -18.696414947509766, "step": 2660 }, { "epoch": 0.84, "grad_norm": 0.83984375, "learning_rate": 3.7445716067596506e-07, "logits/chosen": -1.4935780763626099, "logits/rejected": -1.2695616483688354, "logps/chosen": -499.6849060058594, "logps/rejected": -1571.749755859375, "loss": 0.1548, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.89444637298584, "rewards/margins": 10.862527847290039, "rewards/margins_max": 18.293880462646484, "rewards/margins_min": 3.4311795234680176, "rewards/margins_std": 10.509515762329102, "rewards/rejected": -13.756976127624512, "step": 2670 }, { "epoch": 0.84, "grad_norm": 2.578125, "learning_rate": 3.601091954404062e-07, "logits/chosen": -1.3411470651626587, "logits/rejected": -1.1427286863327026, "logps/chosen": -603.7765502929688, "logps/rejected": -1911.207763671875, "loss": 0.1676, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6270432472229004, "rewards/margins": 13.066546440124512, "rewards/margins_max": 19.609230041503906, "rewards/margins_min": 6.523860931396484, "rewards/margins_std": 9.252754211425781, "rewards/rejected": -16.693588256835938, "step": 2680 }, { "epoch": 0.85, "grad_norm": 11.625, "learning_rate": 3.460201531704263e-07, "logits/chosen": -1.5328198671340942, "logits/rejected": -1.062266230583191, "logps/chosen": -700.7471313476562, "logps/rejected": -1610.164794921875, "loss": 0.1287, "rewards/accuracies": 1.0, "rewards/chosen": -3.032205104827881, "rewards/margins": 10.639944076538086, "rewards/margins_max": 14.988049507141113, "rewards/margins_min": 6.29184103012085, "rewards/margins_std": 6.149147987365723, "rewards/rejected": -13.672149658203125, "step": 2690 }, { "epoch": 0.85, "grad_norm": 2.96875, "learning_rate": 3.321917386149773e-07, "logits/chosen": -1.56075119972229, "logits/rejected": -1.2586215734481812, "logps/chosen": -492.829345703125, "logps/rejected": -1485.7462158203125, "loss": 0.1458, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.7940220832824707, "rewards/margins": 9.927345275878906, "rewards/margins_max": 17.084228515625, "rewards/margins_min": 2.7704620361328125, "rewards/margins_std": 10.121360778808594, "rewards/rejected": -12.721366882324219, "step": 2700 }, { "epoch": 0.85, "grad_norm": 4.71875, "learning_rate": 3.186256249875236e-07, "logits/chosen": -1.5891391038894653, "logits/rejected": -1.3650810718536377, "logps/chosen": -464.7984313964844, "logps/rejected": -1371.1409912109375, "loss": 0.3338, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.6813507080078125, "rewards/margins": 8.95352554321289, "rewards/margins_max": 13.174215316772461, "rewards/margins_min": 4.732832908630371, "rewards/margins_std": 5.968959808349609, "rewards/rejected": -11.634876251220703, "step": 2710 }, { "epoch": 0.86, "grad_norm": 0.0546875, "learning_rate": 3.0532345376358577e-07, "logits/chosen": -1.654157280921936, "logits/rejected": -1.3361161947250366, "logps/chosen": -496.3033752441406, "logps/rejected": -1795.9544677734375, "loss": 0.1439, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.100607395172119, "rewards/margins": 12.40422248840332, "rewards/margins_max": 21.095714569091797, "rewards/margins_min": 3.7127292156219482, "rewards/margins_std": 12.291626930236816, "rewards/rejected": -15.504827499389648, "step": 2720 }, { "epoch": 0.86, "grad_norm": 1.609375, "learning_rate": 2.922868344821236e-07, "logits/chosen": -1.454085350036621, "logits/rejected": -1.0795207023620605, "logps/chosen": -527.0432739257812, "logps/rejected": -1645.1402587890625, "loss": 0.1602, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.0249111652374268, "rewards/margins": 11.555765151977539, "rewards/margins_max": 18.297351837158203, "rewards/margins_min": 4.814180850982666, "rewards/margins_std": 9.534041404724121, "rewards/rejected": -14.580678939819336, "step": 2730 }, { "epoch": 0.86, "grad_norm": 2.859375, "learning_rate": 2.795173445507879e-07, "logits/chosen": -1.640484094619751, "logits/rejected": -1.2281348705291748, "logps/chosen": -674.6544799804688, "logps/rejected": -2076.06591796875, "loss": 0.4535, "rewards/accuracies": 0.875, "rewards/chosen": -4.167102813720703, "rewards/margins": 13.986814498901367, "rewards/margins_max": 21.08889389038086, "rewards/margins_min": 6.884737968444824, "rewards/margins_std": 10.043853759765625, "rewards/rejected": -18.153919219970703, "step": 2740 }, { "epoch": 0.87, "grad_norm": 1.5625, "learning_rate": 2.670165290550544e-07, "logits/chosen": -1.5252046585083008, "logits/rejected": -1.1835720539093018, "logps/chosen": -551.7774658203125, "logps/rejected": -1764.0374755859375, "loss": 0.1066, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.3922817707061768, "rewards/margins": 12.13778305053711, "rewards/margins_max": 18.606740951538086, "rewards/margins_min": 5.668824195861816, "rewards/margins_std": 9.148488998413086, "rewards/rejected": -15.530064582824707, "step": 2750 }, { "epoch": 0.87, "grad_norm": 1.0703125, "learning_rate": 2.547859005712727e-07, "logits/chosen": -1.5535533428192139, "logits/rejected": -1.2417479753494263, "logps/chosen": -521.0472412109375, "logps/rejected": -1648.343505859375, "loss": 0.1712, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.0611438751220703, "rewards/margins": 11.504739761352539, "rewards/margins_max": 19.227418899536133, "rewards/margins_min": 3.7820606231689453, "rewards/margins_std": 10.921515464782715, "rewards/rejected": -14.565884590148926, "step": 2760 }, { "epoch": 0.87, "grad_norm": 1.0703125, "learning_rate": 2.4282693898364435e-07, "logits/chosen": -1.5507018566131592, "logits/rejected": -1.2115910053253174, "logps/chosen": -471.463134765625, "logps/rejected": -1789.766357421875, "loss": 0.1518, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9074912071228027, "rewards/margins": 13.214210510253906, "rewards/margins_max": 19.92336654663086, "rewards/margins_min": 6.505056858062744, "rewards/margins_std": 9.488176345825195, "rewards/rejected": -16.121702194213867, "step": 2770 }, { "epoch": 0.88, "grad_norm": 10.9375, "learning_rate": 2.3114109130516427e-07, "logits/chosen": -1.488516092300415, "logits/rejected": -1.2162216901779175, "logps/chosen": -559.1824340820312, "logps/rejected": -1675.4085693359375, "loss": 0.2286, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.735757350921631, "rewards/margins": 10.952620506286621, "rewards/margins_max": 17.202655792236328, "rewards/margins_min": 4.702585220336914, "rewards/margins_std": 8.838884353637695, "rewards/rejected": -14.688379287719727, "step": 2780 }, { "epoch": 0.88, "grad_norm": 1.1171875, "learning_rate": 2.1972977150253066e-07, "logits/chosen": -1.6375281810760498, "logits/rejected": -1.2000789642333984, "logps/chosen": -563.3726806640625, "logps/rejected": -2449.58251953125, "loss": 0.0716, "rewards/accuracies": 1.0, "rewards/chosen": -3.116568088531494, "rewards/margins": 18.522953033447266, "rewards/margins_max": 31.61623191833496, "rewards/margins_min": 5.429670810699463, "rewards/margins_std": 18.516695022583008, "rewards/rejected": -21.63951873779297, "step": 2790 }, { "epoch": 0.88, "grad_norm": 1.0390625, "learning_rate": 2.0859436032505954e-07, "logits/chosen": -1.5669145584106445, "logits/rejected": -1.4212372303009033, "logps/chosen": -529.2255859375, "logps/rejected": -2067.349853515625, "loss": 0.2644, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.3644447326660156, "rewards/margins": 15.288610458374023, "rewards/margins_max": 23.236772537231445, "rewards/margins_min": 7.340449333190918, "rewards/margins_std": 11.240400314331055, "rewards/rejected": -18.65305519104004, "step": 2800 }, { "epoch": 0.89, "grad_norm": 1.3984375, "learning_rate": 1.9773620513761582e-07, "logits/chosen": -1.544468641281128, "logits/rejected": -1.2507742643356323, "logps/chosen": -479.28717041015625, "logps/rejected": -1413.136474609375, "loss": 0.1099, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.6804263591766357, "rewards/margins": 9.630419731140137, "rewards/margins_max": 16.187856674194336, "rewards/margins_min": 3.072981834411621, "rewards/margins_std": 9.273618698120117, "rewards/rejected": -12.310846328735352, "step": 2810 }, { "epoch": 0.89, "grad_norm": 5.375, "learning_rate": 1.8715661975758524e-07, "logits/chosen": -1.3692967891693115, "logits/rejected": -1.2908586263656616, "logps/chosen": -471.1861877441406, "logps/rejected": -1936.43359375, "loss": 0.0775, "rewards/accuracies": 1.0, "rewards/chosen": -3.0035228729248047, "rewards/margins": 13.9976167678833, "rewards/margins_max": 23.008455276489258, "rewards/margins_min": 4.9867753982543945, "rewards/margins_std": 12.74325180053711, "rewards/rejected": -17.001136779785156, "step": 2820 }, { "epoch": 0.89, "grad_norm": 0.77734375, "learning_rate": 1.768568842959037e-07, "logits/chosen": -1.56259286403656, "logits/rejected": -1.225875735282898, "logps/chosen": -666.2752685546875, "logps/rejected": -2096.91845703125, "loss": 0.0707, "rewards/accuracies": 1.0, "rewards/chosen": -4.032459735870361, "rewards/margins": 14.605703353881836, "rewards/margins_max": 19.857891082763672, "rewards/margins_min": 9.353516578674316, "rewards/margins_std": 7.4277143478393555, "rewards/rejected": -18.63816261291504, "step": 2830 }, { "epoch": 0.89, "grad_norm": 3.71875, "learning_rate": 1.6683824500216662e-07, "logits/chosen": -1.4535073041915894, "logits/rejected": -1.2454532384872437, "logps/chosen": -480.98455810546875, "logps/rejected": -1676.016845703125, "loss": 0.1018, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.708672285079956, "rewards/margins": 12.352299690246582, "rewards/margins_max": 20.519662857055664, "rewards/margins_min": 4.184937477111816, "rewards/margins_std": 11.550395011901855, "rewards/rejected": -15.060972213745117, "step": 2840 }, { "epoch": 0.9, "grad_norm": 1.5078125, "learning_rate": 1.5710191411383663e-07, "logits/chosen": -1.5043429136276245, "logits/rejected": -1.3213595151901245, "logps/chosen": -480.7269592285156, "logps/rejected": -1419.52197265625, "loss": 0.2321, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.0613503456115723, "rewards/margins": 9.34998607635498, "rewards/margins_max": 15.409167289733887, "rewards/margins_min": 3.2908051013946533, "rewards/margins_std": 8.568977355957031, "rewards/rejected": -12.411336898803711, "step": 2850 }, { "epoch": 0.9, "grad_norm": 4.0625, "learning_rate": 1.4764906970956145e-07, "logits/chosen": -1.5083320140838623, "logits/rejected": -1.259227991104126, "logps/chosen": -497.8409118652344, "logps/rejected": -1558.669677734375, "loss": 0.1545, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.007861375808716, "rewards/margins": 10.634368896484375, "rewards/margins_max": 15.329747200012207, "rewards/margins_min": 5.938992977142334, "rewards/margins_std": 6.640265464782715, "rewards/rejected": -13.642231941223145, "step": 2860 }, { "epoch": 0.9, "grad_norm": 1.2421875, "learning_rate": 1.38480855566632e-07, "logits/chosen": -1.4352874755859375, "logits/rejected": -1.1588048934936523, "logps/chosen": -626.794677734375, "logps/rejected": -1690.177490234375, "loss": 0.1367, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.555635452270508, "rewards/margins": 11.343018531799316, "rewards/margins_max": 18.7399845123291, "rewards/margins_min": 3.9460530281066895, "rewards/margins_std": 10.460888862609863, "rewards/rejected": -14.898653030395508, "step": 2870 }, { "epoch": 0.91, "grad_norm": 11.125, "learning_rate": 1.2959838102258537e-07, "logits/chosen": -1.5202867984771729, "logits/rejected": -1.2198843955993652, "logps/chosen": -686.660888671875, "logps/rejected": -1677.017333984375, "loss": 0.2712, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.959693431854248, "rewards/margins": 10.280670166015625, "rewards/margins_max": 15.942033767700195, "rewards/margins_min": 4.619304656982422, "rewards/margins_std": 8.006379127502441, "rewards/rejected": -14.240364074707031, "step": 2880 }, { "epoch": 0.91, "grad_norm": 0.263671875, "learning_rate": 1.210027208409778e-07, "logits/chosen": -1.4056593179702759, "logits/rejected": -1.3017997741699219, "logps/chosen": -444.84088134765625, "logps/rejected": -2359.341796875, "loss": 0.1585, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.568777084350586, "rewards/margins": 18.554290771484375, "rewards/margins_max": 29.505813598632812, "rewards/margins_min": 7.6027679443359375, "rewards/margins_std": 15.48779296875, "rewards/rejected": -21.123065948486328, "step": 2890 }, { "epoch": 0.91, "grad_norm": 0.578125, "learning_rate": 1.1269491508133945e-07, "logits/chosen": -1.6786344051361084, "logits/rejected": -1.1672778129577637, "logps/chosen": -641.666259765625, "logps/rejected": -1821.4290771484375, "loss": 0.1808, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.2502269744873047, "rewards/margins": 12.766157150268555, "rewards/margins_max": 18.502321243286133, "rewards/margins_min": 7.029994010925293, "rewards/margins_std": 8.112160682678223, "rewards/rejected": -16.01638412475586, "step": 2900 }, { "epoch": 0.92, "grad_norm": 0.94921875, "learning_rate": 1.046759689733301e-07, "logits/chosen": -1.5015394687652588, "logits/rejected": -1.077661156654358, "logps/chosen": -610.1318359375, "logps/rejected": -1724.026123046875, "loss": 0.0675, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.742875337600708, "rewards/margins": 11.29780101776123, "rewards/margins_max": 18.355480194091797, "rewards/margins_min": 4.240123748779297, "rewards/margins_std": 9.981063842773438, "rewards/rejected": -15.040678024291992, "step": 2910 }, { "epoch": 0.92, "grad_norm": 0.419921875, "learning_rate": 9.694685279510674e-08, "logits/chosen": -1.4848477840423584, "logits/rejected": -1.453749656677246, "logps/chosen": -521.6805419921875, "logps/rejected": -1747.800537109375, "loss": 0.2553, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.3396506309509277, "rewards/margins": 11.84017276763916, "rewards/margins_max": 18.36767578125, "rewards/margins_min": 5.312668800354004, "rewards/margins_std": 9.231285095214844, "rewards/rejected": -15.179824829101562, "step": 2920 }, { "epoch": 0.92, "grad_norm": 2.03125, "learning_rate": 8.950850175592329e-08, "logits/chosen": -1.5855342149734497, "logits/rejected": -1.3711344003677368, "logps/chosen": -523.5225219726562, "logps/rejected": -1761.386962890625, "loss": 0.1607, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.866356134414673, "rewards/margins": 12.065530776977539, "rewards/margins_max": 19.174335479736328, "rewards/margins_min": 4.956724166870117, "rewards/margins_std": 10.053369522094727, "rewards/rejected": -14.931884765625, "step": 2930 }, { "epoch": 0.93, "grad_norm": 0.474609375, "learning_rate": 8.236181588297115e-08, "logits/chosen": -1.4377137422561646, "logits/rejected": -1.2142530679702759, "logps/chosen": -591.1087646484375, "logps/rejected": -2312.2509765625, "loss": 0.0935, "rewards/accuracies": 1.0, "rewards/chosen": -3.312687397003174, "rewards/margins": 16.7128963470459, "rewards/margins_max": 25.945785522460938, "rewards/margins_min": 7.480005741119385, "rewards/margins_std": 13.057278633117676, "rewards/rejected": -20.025583267211914, "step": 2940 }, { "epoch": 0.93, "grad_norm": 0.4140625, "learning_rate": 7.550765991247655e-08, "logits/chosen": -1.5337202548980713, "logits/rejected": -1.193544626235962, "logps/chosen": -589.0224609375, "logps/rejected": -1560.7672119140625, "loss": 0.1258, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4798824787139893, "rewards/margins": 9.967602729797363, "rewards/margins_max": 15.469675064086914, "rewards/margins_min": 4.465529918670654, "rewards/margins_std": 7.781105041503906, "rewards/rejected": -13.447484970092773, "step": 2950 }, { "epoch": 0.93, "grad_norm": 2.1875, "learning_rate": 6.894686318507066e-08, "logits/chosen": -1.5093646049499512, "logits/rejected": -1.3388113975524902, "logps/chosen": -513.3501586914062, "logps/rejected": -1921.223388671875, "loss": 0.1717, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.021483898162842, "rewards/margins": 13.66453742980957, "rewards/margins_max": 22.030460357666016, "rewards/margins_min": 5.2986159324646, "rewards/margins_std": 11.831199645996094, "rewards/rejected": -16.68602180480957, "step": 2960 }, { "epoch": 0.94, "grad_norm": 0.1123046875, "learning_rate": 6.268021954544095e-08, "logits/chosen": -1.2859686613082886, "logits/rejected": -1.2988886833190918, "logps/chosen": -521.9822387695312, "logps/rejected": -2064.39013671875, "loss": 0.3054, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.2040743827819824, "rewards/margins": 14.57214069366455, "rewards/margins_max": 24.986371994018555, "rewards/margins_min": 4.15791130065918, "rewards/margins_std": 14.727948188781738, "rewards/rejected": -17.776216506958008, "step": 2970 }, { "epoch": 0.94, "grad_norm": 10.5, "learning_rate": 5.670848724627531e-08, "logits/chosen": -1.6210033893585205, "logits/rejected": -1.280139684677124, "logps/chosen": -725.0421142578125, "logps/rejected": -1452.880859375, "loss": 0.1814, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.209847450256348, "rewards/margins": 8.341692924499512, "rewards/margins_max": 12.187960624694824, "rewards/margins_min": 4.495427131652832, "rewards/margins_std": 5.4394426345825195, "rewards/rejected": -12.55154037475586, "step": 2980 }, { "epoch": 0.94, "grad_norm": 1.765625, "learning_rate": 5.103238885651618e-08, "logits/chosen": -1.5707718133926392, "logits/rejected": -1.2055985927581787, "logps/chosen": -536.3211059570312, "logps/rejected": -1798.921142578125, "loss": 0.1463, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.944525718688965, "rewards/margins": 12.846099853515625, "rewards/margins_max": 19.72370147705078, "rewards/margins_min": 5.968496322631836, "rewards/margins_std": 9.726399421691895, "rewards/rejected": -15.790626525878906, "step": 2990 }, { "epoch": 0.95, "grad_norm": 3.125, "learning_rate": 4.5652611173932495e-08, "logits/chosen": -1.715071439743042, "logits/rejected": -1.393758773803711, "logps/chosen": -574.81689453125, "logps/rejected": -1423.7811279296875, "loss": 0.1692, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.3229072093963623, "rewards/margins": 8.942388534545898, "rewards/margins_max": 14.589694023132324, "rewards/margins_min": 3.2950832843780518, "rewards/margins_std": 7.986494541168213, "rewards/rejected": -12.265295028686523, "step": 3000 }, { "epoch": 0.95, "grad_norm": 0.0322265625, "learning_rate": 4.0569805142014476e-08, "logits/chosen": -1.424754023551941, "logits/rejected": -1.2524802684783936, "logps/chosen": -531.279296875, "logps/rejected": -1886.6539306640625, "loss": 0.0838, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2407214641571045, "rewards/margins": 13.494546890258789, "rewards/margins_max": 21.47197723388672, "rewards/margins_min": 5.517115116119385, "rewards/margins_std": 11.281791687011719, "rewards/rejected": -16.735267639160156, "step": 3010 }, { "epoch": 0.95, "grad_norm": 0.427734375, "learning_rate": 3.578458577121524e-08, "logits/chosen": -1.4714146852493286, "logits/rejected": -1.2563064098358154, "logps/chosen": -528.0980224609375, "logps/rejected": -1789.8597412109375, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": -3.484973192214966, "rewards/margins": 12.645235061645508, "rewards/margins_max": 18.291662216186523, "rewards/margins_min": 6.998806953430176, "rewards/margins_std": 7.985255241394043, "rewards/rejected": -16.13020896911621, "step": 3020 }, { "epoch": 0.95, "grad_norm": 0.31640625, "learning_rate": 3.129753206453201e-08, "logits/chosen": -1.6428391933441162, "logits/rejected": -1.2820537090301514, "logps/chosen": -585.875, "logps/rejected": -1616.100830078125, "loss": 0.1028, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.469578981399536, "rewards/margins": 10.344371795654297, "rewards/margins_max": 15.5939359664917, "rewards/margins_min": 5.094809532165527, "rewards/margins_std": 7.424003601074219, "rewards/rejected": -13.81395149230957, "step": 3030 }, { "epoch": 0.96, "grad_norm": 7.125, "learning_rate": 2.710918694744935e-08, "logits/chosen": -1.6528517007827759, "logits/rejected": -1.471823811531067, "logps/chosen": -563.698486328125, "logps/rejected": -1610.182373046875, "loss": 0.2088, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.7492880821228027, "rewards/margins": 10.301797866821289, "rewards/margins_max": 16.012958526611328, "rewards/margins_min": 4.59063720703125, "rewards/margins_std": 8.076800346374512, "rewards/rejected": -14.05108642578125, "step": 3040 }, { "epoch": 0.96, "grad_norm": 4.5625, "learning_rate": 2.3220057202246183e-08, "logits/chosen": -1.333348035812378, "logits/rejected": -1.0714836120605469, "logps/chosen": -471.2120666503906, "logps/rejected": -1886.3255615234375, "loss": 0.1422, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9049346446990967, "rewards/margins": 13.638298034667969, "rewards/margins_max": 22.545673370361328, "rewards/margins_min": 4.730922698974609, "rewards/margins_std": 12.596931457519531, "rewards/rejected": -16.543231964111328, "step": 3050 }, { "epoch": 0.96, "grad_norm": 0.51953125, "learning_rate": 1.963061340667677e-08, "logits/chosen": -1.5092668533325195, "logits/rejected": -1.361816167831421, "logps/chosen": -466.8147888183594, "logps/rejected": -1187.789794921875, "loss": 0.1517, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5971789360046387, "rewards/margins": 7.532545566558838, "rewards/margins_max": 11.385088920593262, "rewards/margins_min": 3.680001735687256, "rewards/margins_std": 5.448319435119629, "rewards/rejected": -10.129724502563477, "step": 3060 }, { "epoch": 0.97, "grad_norm": 10.9375, "learning_rate": 1.6341289877028488e-08, "logits/chosen": -1.329237699508667, "logits/rejected": -1.1558626890182495, "logps/chosen": -491.35076904296875, "logps/rejected": -2207.8818359375, "loss": 0.0801, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.6653168201446533, "rewards/margins": 17.251855850219727, "rewards/margins_max": 26.020004272460938, "rewards/margins_min": 8.483704566955566, "rewards/margins_std": 12.400036811828613, "rewards/rejected": -19.917171478271484, "step": 3070 }, { "epoch": 0.97, "grad_norm": 0.8359375, "learning_rate": 1.3352484615574701e-08, "logits/chosen": -1.7151950597763062, "logits/rejected": -1.3515779972076416, "logps/chosen": -546.0389404296875, "logps/rejected": -1565.0789794921875, "loss": 0.0923, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7018885612487793, "rewards/margins": 10.648792266845703, "rewards/margins_max": 16.55207061767578, "rewards/margins_min": 4.745510101318359, "rewards/margins_std": 8.348501205444336, "rewards/rejected": -13.350680351257324, "step": 3080 }, { "epoch": 0.97, "grad_norm": 0.36328125, "learning_rate": 1.0664559262413831e-08, "logits/chosen": -1.4656840562820435, "logits/rejected": -1.2670657634735107, "logps/chosen": -474.76611328125, "logps/rejected": -1649.747802734375, "loss": 0.1544, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5341649055480957, "rewards/margins": 12.128092765808105, "rewards/margins_max": 20.278675079345703, "rewards/margins_min": 3.977510452270508, "rewards/margins_std": 11.526663780212402, "rewards/rejected": -14.662257194519043, "step": 3090 }, { "epoch": 0.98, "grad_norm": 1.0625, "learning_rate": 8.2778390517127e-09, "logits/chosen": -1.4516102075576782, "logits/rejected": -1.1890254020690918, "logps/chosen": -617.524658203125, "logps/rejected": -1620.347412109375, "loss": 0.096, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.6001343727111816, "rewards/margins": 10.086786270141602, "rewards/margins_max": 14.390159606933594, "rewards/margins_min": 5.783412933349609, "rewards/margins_std": 6.085890293121338, "rewards/rejected": -13.686922073364258, "step": 3100 }, { "epoch": 0.98, "grad_norm": 0.625, "learning_rate": 6.192612772354944e-09, "logits/chosen": -1.4516470432281494, "logits/rejected": -1.2507392168045044, "logps/chosen": -566.8466186523438, "logps/rejected": -2050.584228515625, "loss": 0.0997, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1200942993164062, "rewards/margins": 14.864712715148926, "rewards/margins_max": 24.433439254760742, "rewards/margins_min": 5.295987129211426, "rewards/margins_std": 13.532221794128418, "rewards/rejected": -17.984806060791016, "step": 3110 }, { "epoch": 0.98, "grad_norm": 0.2578125, "learning_rate": 4.409132732995647e-09, "logits/chosen": -1.5317922830581665, "logits/rejected": -1.300173282623291, "logps/chosen": -554.97509765625, "logps/rejected": -1691.675537109375, "loss": 0.1008, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6252989768981934, "rewards/margins": 11.502375602722168, "rewards/margins_max": 19.262727737426758, "rewards/margins_min": 3.7420215606689453, "rewards/margins_std": 10.9747953414917, "rewards/rejected": -15.127673149108887, "step": 3120 }, { "epoch": 0.99, "grad_norm": 0.33203125, "learning_rate": 2.927614731534356e-09, "logits/chosen": -1.5343540906906128, "logits/rejected": -1.3594461679458618, "logps/chosen": -524.830078125, "logps/rejected": -1884.637939453125, "loss": 0.2285, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.0702600479125977, "rewards/margins": 12.865870475769043, "rewards/margins_max": 19.372177124023438, "rewards/margins_min": 6.359560966491699, "rewards/margins_std": 9.201309204101562, "rewards/rejected": -15.936129570007324, "step": 3130 }, { "epoch": 0.99, "grad_norm": 0.32421875, "learning_rate": 1.7482380290034795e-09, "logits/chosen": -1.6508004665374756, "logits/rejected": -1.2689622640609741, "logps/chosen": -478.16522216796875, "logps/rejected": -1635.46044921875, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": -2.865492343902588, "rewards/margins": 11.581151962280273, "rewards/margins_max": 17.636213302612305, "rewards/margins_min": 5.526092529296875, "rewards/margins_std": 8.563149452209473, "rewards/rejected": -14.44664478302002, "step": 3140 }, { "epoch": 0.99, "grad_norm": 0.52734375, "learning_rate": 8.711453278778537e-10, "logits/chosen": -1.439031720161438, "logits/rejected": -1.0773530006408691, "logps/chosen": -565.1245727539062, "logps/rejected": -1736.3665771484375, "loss": 0.11, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.194706439971924, "rewards/margins": 12.026373863220215, "rewards/margins_max": 17.58086395263672, "rewards/margins_min": 6.471883296966553, "rewards/margins_std": 7.855236053466797, "rewards/rejected": -15.22107982635498, "step": 3150 }, { "epoch": 1.0, "grad_norm": 5.40625, "learning_rate": 2.964427548077242e-10, "logits/chosen": -1.5736453533172607, "logits/rejected": -1.3564589023590088, "logps/chosen": -545.9297485351562, "logps/rejected": -1428.1328125, "loss": 0.1362, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.3369319438934326, "rewards/margins": 9.009324073791504, "rewards/margins_max": 14.16156005859375, "rewards/margins_min": 3.857085704803467, "rewards/margins_std": 7.2863640785217285, "rewards/rejected": -12.3462553024292, "step": 3160 }, { "epoch": 1.0, "grad_norm": 1.25, "learning_rate": 2.419984777790596e-11, "logits/chosen": -1.4611544609069824, "logits/rejected": -1.0981907844543457, "logps/chosen": -558.7552490234375, "logps/rejected": -1721.875732421875, "loss": 0.1926, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.2622387409210205, "rewards/margins": 11.596755027770996, "rewards/margins_max": 18.492107391357422, "rewards/margins_min": 4.701401710510254, "rewards/margins_std": 9.751501083374023, "rewards/rejected": -14.858993530273438, "step": 3170 }, { "epoch": 1.0, "eval_logits/chosen": -1.1338366270065308, "eval_logits/rejected": -1.0143229961395264, "eval_logps/chosen": -640.5291137695312, "eval_logps/rejected": -746.0303955078125, "eval_loss": 0.935178279876709, "eval_rewards/accuracies": 0.6134999990463257, "eval_rewards/chosen": -3.155653715133667, "eval_rewards/margins": 1.1480075120925903, "eval_rewards/margins_max": 6.733797073364258, "eval_rewards/margins_min": -3.165346622467041, "eval_rewards/margins_std": 3.2318670749664307, "eval_rewards/rejected": -4.303661346435547, "eval_runtime": 1443.1499, "eval_samples_per_second": 2.772, "eval_steps_per_second": 0.173, "step": 3174 }, { "epoch": 1.0, "step": 3174, "total_flos": 0.0, "train_loss": 0.2596737848525537, "train_runtime": 26816.1062, "train_samples_per_second": 0.947, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 3174, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }