diff --git "a/checkpoint-303/trainer_state.json" "b/checkpoint-303/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-303/trainer_state.json" @@ -0,0 +1,4578 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6047904191616766, + "eval_steps": 500, + "global_step": 303, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001996007984031936, + "grad_norm": 8.273702760320834, + "learning_rate": 9.803921568627451e-09, + "logits/chosen": -15.345624923706055, + "logits/rejected": -15.43127727508545, + "logps/chosen": -309.0523376464844, + "logps/rejected": -315.7975769042969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.003992015968063872, + "grad_norm": 9.403401939105745, + "learning_rate": 1.9607843137254902e-08, + "logits/chosen": -15.736159324645996, + "logits/rejected": -15.511228561401367, + "logps/chosen": -276.1156311035156, + "logps/rejected": -319.82891845703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.005988023952095809, + "grad_norm": 7.417684586141953, + "learning_rate": 2.941176470588235e-08, + "logits/chosen": -14.069502830505371, + "logits/rejected": -14.952973365783691, + "logps/chosen": -327.70660400390625, + "logps/rejected": -324.401611328125, + "loss": 0.6935, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0006795646040700376, + "rewards/margins": 0.00012883666204288602, + "rewards/rejected": 0.0005507277674041688, + "step": 3 + }, + { + "epoch": 0.007984031936127744, + "grad_norm": 8.870177799116133, + "learning_rate": 3.9215686274509804e-08, + "logits/chosen": -14.754829406738281, + "logits/rejected": -14.156275749206543, + "logps/chosen": -405.5284423828125, + "logps/rejected": -507.5711669921875, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00030017876997590065, + "rewards/margins": -0.00034380401484668255, + "rewards/rejected": 4.362566687632352e-05, + "step": 4 + }, + { + "epoch": 0.00998003992015968, + "grad_norm": 8.607915464157758, + "learning_rate": 4.901960784313725e-08, + "logits/chosen": -15.998005867004395, + "logits/rejected": -15.419865608215332, + "logps/chosen": -334.4444580078125, + "logps/rejected": -347.6990051269531, + "loss": 0.6925, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.003209929447621107, + "rewards/margins": 0.006283964961767197, + "rewards/rejected": -0.0030740355141460896, + "step": 5 + }, + { + "epoch": 0.011976047904191617, + "grad_norm": 7.828043502488399, + "learning_rate": 5.88235294117647e-08, + "logits/chosen": -15.476319313049316, + "logits/rejected": -15.254171371459961, + "logps/chosen": -315.6751708984375, + "logps/rejected": -321.661376953125, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004277873318642378, + "rewards/margins": 0.0027148674707859755, + "rewards/rejected": 0.0015630058478564024, + "step": 6 + }, + { + "epoch": 0.013972055888223553, + "grad_norm": 8.411076253189734, + "learning_rate": 6.862745098039216e-08, + "logits/chosen": -15.430569648742676, + "logits/rejected": -15.730286598205566, + "logps/chosen": -329.1186218261719, + "logps/rejected": -333.16375732421875, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0016522119985893369, + "rewards/margins": 0.0027996539138257504, + "rewards/rejected": -0.004451866261661053, + "step": 7 + }, + { + "epoch": 0.015968063872255488, + "grad_norm": 8.071033025816464, + "learning_rate": 7.843137254901961e-08, + "logits/chosen": -14.63811206817627, + "logits/rejected": -15.146449089050293, + "logps/chosen": -418.5869445800781, + "logps/rejected": -409.2070007324219, + "loss": 0.6926, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0014343691291287541, + "rewards/margins": 0.0001833915594033897, + "rewards/rejected": -0.001617760630324483, + "step": 8 + }, + { + "epoch": 0.017964071856287425, + "grad_norm": 8.296615844679023, + "learning_rate": 8.823529411764706e-08, + "logits/chosen": -16.07230567932129, + "logits/rejected": -15.337064743041992, + "logps/chosen": -500.545166015625, + "logps/rejected": -516.0908813476562, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.000825481372885406, + "rewards/margins": -0.00035296427085995674, + "rewards/rejected": 0.0011784456437453628, + "step": 9 + }, + { + "epoch": 0.01996007984031936, + "grad_norm": 7.947789076143285, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -15.586200714111328, + "logits/rejected": -15.603667259216309, + "logps/chosen": -385.4709777832031, + "logps/rejected": -413.04498291015625, + "loss": 0.6927, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0015043115708976984, + "rewards/margins": -0.0028634597547352314, + "rewards/rejected": 0.001359148183837533, + "step": 10 + }, + { + "epoch": 0.021956087824351298, + "grad_norm": 8.347411190779997, + "learning_rate": 1.0784313725490195e-07, + "logits/chosen": -14.763092994689941, + "logits/rejected": -14.29030990600586, + "logps/chosen": -450.0237731933594, + "logps/rejected": -507.61370849609375, + "loss": 0.6935, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0023990964982658625, + "rewards/margins": -0.003815202508121729, + "rewards/rejected": 0.00141610624268651, + "step": 11 + }, + { + "epoch": 0.023952095808383235, + "grad_norm": 9.064316975676492, + "learning_rate": 1.176470588235294e-07, + "logits/chosen": -15.072341918945312, + "logits/rejected": -14.772862434387207, + "logps/chosen": -335.04266357421875, + "logps/rejected": -371.4832763671875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0030230379197746515, + "rewards/margins": -0.005290627479553223, + "rewards/rejected": 0.0022675893269479275, + "step": 12 + }, + { + "epoch": 0.02594810379241517, + "grad_norm": 7.725863389166651, + "learning_rate": 1.2745098039215685e-07, + "logits/chosen": -16.316631317138672, + "logits/rejected": -16.06757164001465, + "logps/chosen": -419.7452087402344, + "logps/rejected": -390.98773193359375, + "loss": 0.6936, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.001233673538081348, + "rewards/margins": -0.0001909491838887334, + "rewards/rejected": -0.0010427236557006836, + "step": 13 + }, + { + "epoch": 0.027944111776447105, + "grad_norm": 8.42732610297517, + "learning_rate": 1.3725490196078432e-07, + "logits/chosen": -15.35225772857666, + "logits/rejected": -15.612798690795898, + "logps/chosen": -328.98046875, + "logps/rejected": -415.5600280761719, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.001978826941922307, + "rewards/margins": 0.00040232675382867455, + "rewards/rejected": 0.001576499780640006, + "step": 14 + }, + { + "epoch": 0.029940119760479042, + "grad_norm": 7.731931977749619, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": -16.70879554748535, + "logits/rejected": -16.097858428955078, + "logps/chosen": -316.15423583984375, + "logps/rejected": -323.2774353027344, + "loss": 0.6928, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.003133001271635294, + "rewards/margins": -0.0018952846294268966, + "rewards/rejected": -0.0012377167586237192, + "step": 15 + }, + { + "epoch": 0.031936127744510975, + "grad_norm": 7.9669032254132315, + "learning_rate": 1.5686274509803921e-07, + "logits/chosen": -13.231587409973145, + "logits/rejected": -13.3031005859375, + "logps/chosen": -337.7759704589844, + "logps/rejected": -322.94390869140625, + "loss": 0.6933, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0010202788980677724, + "rewards/margins": 0.0009024335886351764, + "rewards/rejected": 0.00011784554226323962, + "step": 16 + }, + { + "epoch": 0.033932135728542916, + "grad_norm": 7.6237992365862075, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -14.528105735778809, + "logits/rejected": -14.738598823547363, + "logps/chosen": -241.29954528808594, + "logps/rejected": -247.58921813964844, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0020066355355083942, + "rewards/margins": 0.0035532282199710608, + "rewards/rejected": -0.0015465925680473447, + "step": 17 + }, + { + "epoch": 0.03592814371257485, + "grad_norm": 11.366099973012018, + "learning_rate": 1.764705882352941e-07, + "logits/chosen": -15.85158920288086, + "logits/rejected": -15.455657958984375, + "logps/chosen": -346.03350830078125, + "logps/rejected": -324.2177734375, + "loss": 0.6933, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0006118249148130417, + "rewards/margins": -0.0007706499891355634, + "rewards/rejected": 0.00015882489969953895, + "step": 18 + }, + { + "epoch": 0.03792415169660679, + "grad_norm": 7.942061772626445, + "learning_rate": 1.8627450980392158e-07, + "logits/chosen": -14.716241836547852, + "logits/rejected": -14.732061386108398, + "logps/chosen": -283.9400939941406, + "logps/rejected": -269.4703674316406, + "loss": 0.6934, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.00019351489027030766, + "rewards/margins": -0.0023846100084483624, + "rewards/rejected": 0.002191095147281885, + "step": 19 + }, + { + "epoch": 0.03992015968063872, + "grad_norm": 8.146329480525193, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": -13.448728561401367, + "logits/rejected": -13.313655853271484, + "logps/chosen": -375.7843933105469, + "logps/rejected": -317.9215393066406, + "loss": 0.6928, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0009911824017763138, + "rewards/margins": -0.0024417974054813385, + "rewards/rejected": 0.001450614770874381, + "step": 20 + }, + { + "epoch": 0.041916167664670656, + "grad_norm": 8.55196124746064, + "learning_rate": 2.0588235294117645e-07, + "logits/chosen": -15.470209121704102, + "logits/rejected": -15.217321395874023, + "logps/chosen": -299.0631103515625, + "logps/rejected": -327.6717529296875, + "loss": 0.6924, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0023876859340816736, + "rewards/margins": -0.0039983270689845085, + "rewards/rejected": 0.006386012304574251, + "step": 21 + }, + { + "epoch": 0.043912175648702596, + "grad_norm": 8.466171545750429, + "learning_rate": 2.156862745098039e-07, + "logits/chosen": -16.77044677734375, + "logits/rejected": -16.241891860961914, + "logps/chosen": -277.0028076171875, + "logps/rejected": -275.6781005859375, + "loss": 0.6915, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006396574899554253, + "rewards/margins": 0.0012766977306455374, + "rewards/rejected": 0.005119876936078072, + "step": 22 + }, + { + "epoch": 0.04590818363273453, + "grad_norm": 8.402876138940963, + "learning_rate": 2.2549019607843137e-07, + "logits/chosen": -15.568875312805176, + "logits/rejected": -15.177936553955078, + "logps/chosen": -430.4210510253906, + "logps/rejected": -396.56219482421875, + "loss": 0.6921, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0003438423154875636, + "rewards/margins": 0.0015764283016324043, + "rewards/rejected": -0.001232585753314197, + "step": 23 + }, + { + "epoch": 0.04790419161676647, + "grad_norm": 12.52829981591039, + "learning_rate": 2.352941176470588e-07, + "logits/chosen": -14.164735794067383, + "logits/rejected": -14.398335456848145, + "logps/chosen": -396.947265625, + "logps/rejected": -385.9594421386719, + "loss": 0.6912, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0060534426011145115, + "rewards/margins": 0.0018496987177059054, + "rewards/rejected": 0.004203743766993284, + "step": 24 + }, + { + "epoch": 0.0499001996007984, + "grad_norm": 8.318983318997933, + "learning_rate": 2.4509803921568627e-07, + "logits/chosen": -15.94021987915039, + "logits/rejected": -15.565300941467285, + "logps/chosen": -312.4967346191406, + "logps/rejected": -291.0953369140625, + "loss": 0.6917, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.008275885134935379, + "rewards/margins": 0.006061806343495846, + "rewards/rejected": 0.0022140787914395332, + "step": 25 + }, + { + "epoch": 0.05189620758483034, + "grad_norm": 8.103398791916263, + "learning_rate": 2.549019607843137e-07, + "logits/chosen": -14.540434837341309, + "logits/rejected": -14.473610877990723, + "logps/chosen": -353.2845153808594, + "logps/rejected": -374.34490966796875, + "loss": 0.6915, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.006215238478034735, + "rewards/margins": -4.699244163930416e-05, + "rewards/rejected": 0.006262229755520821, + "step": 26 + }, + { + "epoch": 0.05389221556886228, + "grad_norm": 7.722965287876503, + "learning_rate": 2.6470588235294114e-07, + "logits/chosen": -16.8541202545166, + "logits/rejected": -16.772695541381836, + "logps/chosen": -333.2151184082031, + "logps/rejected": -381.6868591308594, + "loss": 0.6918, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.011300897225737572, + "rewards/margins": 0.00862040463835001, + "rewards/rejected": 0.002680492587387562, + "step": 27 + }, + { + "epoch": 0.05588822355289421, + "grad_norm": 8.21241123134308, + "learning_rate": 2.7450980392156863e-07, + "logits/chosen": -14.656830787658691, + "logits/rejected": -15.223196983337402, + "logps/chosen": -384.7855529785156, + "logps/rejected": -390.5248718261719, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005614032968878746, + "rewards/margins": 0.0015372277703136206, + "rewards/rejected": 0.004076804965734482, + "step": 28 + }, + { + "epoch": 0.05788423153692615, + "grad_norm": 8.15008749253195, + "learning_rate": 2.8431372549019607e-07, + "logits/chosen": -15.4053955078125, + "logits/rejected": -15.084259986877441, + "logps/chosen": -397.54937744140625, + "logps/rejected": -373.31109619140625, + "loss": 0.6908, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.009957370348274708, + "rewards/margins": 0.0023600957356393337, + "rewards/rejected": 0.007597275078296661, + "step": 29 + }, + { + "epoch": 0.059880239520958084, + "grad_norm": 8.125634119871668, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -14.482078552246094, + "logits/rejected": -14.186015129089355, + "logps/chosen": -271.9766845703125, + "logps/rejected": -284.2981262207031, + "loss": 0.6909, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.011308508925139904, + "rewards/margins": 0.004111303482204676, + "rewards/rejected": 0.007197204511612654, + "step": 30 + }, + { + "epoch": 0.06187624750499002, + "grad_norm": 7.748089103692524, + "learning_rate": 3.0392156862745094e-07, + "logits/chosen": -15.912099838256836, + "logits/rejected": -15.93221664428711, + "logps/chosen": -304.4377136230469, + "logps/rejected": -315.7114562988281, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02117222733795643, + "rewards/margins": 0.006583967246115208, + "rewards/rejected": 0.01458826009184122, + "step": 31 + }, + { + "epoch": 0.06387225548902195, + "grad_norm": 8.060112893776262, + "learning_rate": 3.1372549019607843e-07, + "logits/chosen": -15.711540222167969, + "logits/rejected": -15.569220542907715, + "logps/chosen": -403.3907470703125, + "logps/rejected": -386.66754150390625, + "loss": 0.6901, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0031930492259562016, + "rewards/margins": -0.0004316616104915738, + "rewards/rejected": 0.0036247102543711662, + "step": 32 + }, + { + "epoch": 0.0658682634730539, + "grad_norm": 8.131844895571634, + "learning_rate": 3.2352941176470586e-07, + "logits/chosen": -14.937063217163086, + "logits/rejected": -15.226661682128906, + "logps/chosen": -452.5379638671875, + "logps/rejected": -454.41009521484375, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016051730141043663, + "rewards/margins": 0.009525422938168049, + "rewards/rejected": 0.006526308599859476, + "step": 33 + }, + { + "epoch": 0.06786427145708583, + "grad_norm": 8.419849559119973, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -15.381107330322266, + "logits/rejected": -15.284709930419922, + "logps/chosen": -401.2351989746094, + "logps/rejected": -390.9241027832031, + "loss": 0.6886, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.024615010246634483, + "rewards/margins": 0.018835801631212234, + "rewards/rejected": 0.005779208615422249, + "step": 34 + }, + { + "epoch": 0.06986027944111776, + "grad_norm": 8.437180869369922, + "learning_rate": 3.431372549019608e-07, + "logits/chosen": -15.466768264770508, + "logits/rejected": -15.167000770568848, + "logps/chosen": -352.6540832519531, + "logps/rejected": -326.68682861328125, + "loss": 0.6875, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.02680317685008049, + "rewards/margins": 0.01718745194375515, + "rewards/rejected": 0.00961572676897049, + "step": 35 + }, + { + "epoch": 0.0718562874251497, + "grad_norm": 7.982258318061745, + "learning_rate": 3.529411764705882e-07, + "logits/chosen": -14.632706642150879, + "logits/rejected": -14.941289901733398, + "logps/chosen": -332.2666015625, + "logps/rejected": -354.9202575683594, + "loss": 0.6871, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.03227221965789795, + "rewards/margins": 0.024501098319888115, + "rewards/rejected": 0.007771119941025972, + "step": 36 + }, + { + "epoch": 0.07385229540918163, + "grad_norm": 8.683716423285436, + "learning_rate": 3.6274509803921566e-07, + "logits/chosen": -16.10747528076172, + "logits/rejected": -15.692405700683594, + "logps/chosen": -350.07916259765625, + "logps/rejected": -328.48040771484375, + "loss": 0.6879, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.026277001947164536, + "rewards/margins": 0.0028891037218272686, + "rewards/rejected": 0.023387901484966278, + "step": 37 + }, + { + "epoch": 0.07584830339321358, + "grad_norm": 8.263903944711704, + "learning_rate": 3.7254901960784315e-07, + "logits/chosen": -16.594873428344727, + "logits/rejected": -16.079730987548828, + "logps/chosen": -308.6328125, + "logps/rejected": -305.5195617675781, + "loss": 0.6857, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03257057070732117, + "rewards/margins": 0.01241181418299675, + "rewards/rejected": 0.020158756524324417, + "step": 38 + }, + { + "epoch": 0.07784431137724551, + "grad_norm": 8.511057638069643, + "learning_rate": 3.8235294117647053e-07, + "logits/chosen": -14.873018264770508, + "logits/rejected": -14.647686004638672, + "logps/chosen": -309.0533447265625, + "logps/rejected": -321.43011474609375, + "loss": 0.6828, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.036281879991292953, + "rewards/margins": 0.025011887773871422, + "rewards/rejected": 0.011269993148744106, + "step": 39 + }, + { + "epoch": 0.07984031936127745, + "grad_norm": 8.338517669067725, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": -15.299338340759277, + "logits/rejected": -15.163370132446289, + "logps/chosen": -302.93212890625, + "logps/rejected": -345.40545654296875, + "loss": 0.685, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.044032178819179535, + "rewards/margins": 0.024382634088397026, + "rewards/rejected": 0.01964954286813736, + "step": 40 + }, + { + "epoch": 0.08183632734530938, + "grad_norm": 8.07172349335598, + "learning_rate": 4.019607843137255e-07, + "logits/chosen": -15.369956016540527, + "logits/rejected": -15.433903694152832, + "logps/chosen": -409.83984375, + "logps/rejected": -412.54180908203125, + "loss": 0.6853, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.035033755004405975, + "rewards/margins": 0.01923045516014099, + "rewards/rejected": 0.015803297981619835, + "step": 41 + }, + { + "epoch": 0.08383233532934131, + "grad_norm": 7.920156576030204, + "learning_rate": 4.117647058823529e-07, + "logits/chosen": -14.02873420715332, + "logits/rejected": -13.986605644226074, + "logps/chosen": -354.6033935546875, + "logps/rejected": -359.84014892578125, + "loss": 0.6864, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.034912168979644775, + "rewards/margins": 0.009367440827190876, + "rewards/rejected": 0.025544727221131325, + "step": 42 + }, + { + "epoch": 0.08582834331337326, + "grad_norm": 8.232720229409079, + "learning_rate": 4.215686274509804e-07, + "logits/chosen": -15.009214401245117, + "logits/rejected": -15.307815551757812, + "logps/chosen": -380.186767578125, + "logps/rejected": -415.4902648925781, + "loss": 0.6858, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.018880976364016533, + "rewards/margins": 0.010581063106656075, + "rewards/rejected": 0.008299913257360458, + "step": 43 + }, + { + "epoch": 0.08782435129740519, + "grad_norm": 8.533444238200468, + "learning_rate": 4.313725490196078e-07, + "logits/chosen": -14.22335433959961, + "logits/rejected": -14.953871726989746, + "logps/chosen": -347.45574951171875, + "logps/rejected": -389.9525451660156, + "loss": 0.6835, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.01024315319955349, + "rewards/margins": -0.00465776864439249, + "rewards/rejected": 0.014900922775268555, + "step": 44 + }, + { + "epoch": 0.08982035928143713, + "grad_norm": 8.038118181406828, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": -14.101947784423828, + "logits/rejected": -13.89334774017334, + "logps/chosen": -317.0509033203125, + "logps/rejected": -323.69183349609375, + "loss": 0.6817, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.026917992159724236, + "rewards/margins": 0.008457997813820839, + "rewards/rejected": 0.018459992483258247, + "step": 45 + }, + { + "epoch": 0.09181636726546906, + "grad_norm": 8.149720896485583, + "learning_rate": 4.5098039215686274e-07, + "logits/chosen": -14.710037231445312, + "logits/rejected": -14.176923751831055, + "logps/chosen": -445.82379150390625, + "logps/rejected": -488.3534851074219, + "loss": 0.6809, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.023077696561813354, + "rewards/margins": 0.04250966012477875, + "rewards/rejected": -0.019431961700320244, + "step": 46 + }, + { + "epoch": 0.09381237524950099, + "grad_norm": 7.992010994151138, + "learning_rate": 4.6078431372549013e-07, + "logits/chosen": -15.482625961303711, + "logits/rejected": -14.013190269470215, + "logps/chosen": -367.063232421875, + "logps/rejected": -342.14471435546875, + "loss": 0.6774, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0394100584089756, + "rewards/margins": 0.02275776118040085, + "rewards/rejected": 0.016652297228574753, + "step": 47 + }, + { + "epoch": 0.09580838323353294, + "grad_norm": 7.756878381863422, + "learning_rate": 4.705882352941176e-07, + "logits/chosen": -15.568811416625977, + "logits/rejected": -15.423860549926758, + "logps/chosen": -433.0845031738281, + "logps/rejected": -439.37396240234375, + "loss": 0.6776, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0388474203646183, + "rewards/margins": 0.03113992139697075, + "rewards/rejected": 0.007707500830292702, + "step": 48 + }, + { + "epoch": 0.09780439121756487, + "grad_norm": 8.493689270125792, + "learning_rate": 4.803921568627451e-07, + "logits/chosen": -14.434328079223633, + "logits/rejected": -14.237007141113281, + "logps/chosen": -375.27874755859375, + "logps/rejected": -351.34783935546875, + "loss": 0.6807, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.031233904883265495, + "rewards/margins": 0.011667889542877674, + "rewards/rejected": 0.019566014409065247, + "step": 49 + }, + { + "epoch": 0.0998003992015968, + "grad_norm": 7.9896349499421735, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -15.668423652648926, + "logits/rejected": -15.05298900604248, + "logps/chosen": -307.7395935058594, + "logps/rejected": -320.8984375, + "loss": 0.6748, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06957457959651947, + "rewards/margins": 0.025473352521657944, + "rewards/rejected": 0.044101230800151825, + "step": 50 + }, + { + "epoch": 0.10179640718562874, + "grad_norm": 8.021742888341015, + "learning_rate": 5e-07, + "logits/chosen": -14.430184364318848, + "logits/rejected": -14.525110244750977, + "logps/chosen": -258.0986022949219, + "logps/rejected": -283.91607666015625, + "loss": 0.6743, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.04039158299565315, + "rewards/margins": 0.033271849155426025, + "rewards/rejected": 0.007119735702872276, + "step": 51 + }, + { + "epoch": 0.10379241516966067, + "grad_norm": 8.431429641964243, + "learning_rate": 4.999939076763486e-07, + "logits/chosen": -15.636438369750977, + "logits/rejected": -15.311294555664062, + "logps/chosen": -344.984619140625, + "logps/rejected": -323.0438232421875, + "loss": 0.6688, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.09328603744506836, + "rewards/margins": 0.04766825586557388, + "rewards/rejected": 0.04561777785420418, + "step": 52 + }, + { + "epoch": 0.10578842315369262, + "grad_norm": 8.486118751253972, + "learning_rate": 4.99975631002326e-07, + "logits/chosen": -15.692873001098633, + "logits/rejected": -15.621683120727539, + "logps/chosen": -294.3231201171875, + "logps/rejected": -324.92559814453125, + "loss": 0.6684, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.08665186166763306, + "rewards/margins": 0.0698593482375145, + "rewards/rejected": 0.016792509704828262, + "step": 53 + }, + { + "epoch": 0.10778443113772455, + "grad_norm": 8.584995513046005, + "learning_rate": 4.999451708687113e-07, + "logits/chosen": -13.536327362060547, + "logits/rejected": -14.043939590454102, + "logps/chosen": -316.51129150390625, + "logps/rejected": -340.39251708984375, + "loss": 0.6677, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04377901926636696, + "rewards/margins": 0.07115273177623749, + "rewards/rejected": -0.027373716235160828, + "step": 54 + }, + { + "epoch": 0.10978043912175649, + "grad_norm": 8.410801379788628, + "learning_rate": 4.999025287600885e-07, + "logits/chosen": -15.176254272460938, + "logits/rejected": -14.865735054016113, + "logps/chosen": -347.4294128417969, + "logps/rejected": -362.15496826171875, + "loss": 0.6672, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0652116909623146, + "rewards/margins": 0.05398234352469444, + "rewards/rejected": 0.01122935302555561, + "step": 55 + }, + { + "epoch": 0.11177644710578842, + "grad_norm": 9.191123619800617, + "learning_rate": 4.998477067547739e-07, + "logits/chosen": -14.366271018981934, + "logits/rejected": -13.491727828979492, + "logps/chosen": -294.74420166015625, + "logps/rejected": -313.5377502441406, + "loss": 0.6695, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08104420453310013, + "rewards/margins": 0.06518866121768951, + "rewards/rejected": 0.015855543315410614, + "step": 56 + }, + { + "epoch": 0.11377245508982035, + "grad_norm": 8.36989537276007, + "learning_rate": 4.997807075247145e-07, + "logits/chosen": -15.414962768554688, + "logits/rejected": -15.15987777709961, + "logps/chosen": -329.4682312011719, + "logps/rejected": -355.2568054199219, + "loss": 0.6699, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05671892687678337, + "rewards/margins": 0.05990144982933998, + "rewards/rejected": -0.0031825248152017593, + "step": 57 + }, + { + "epoch": 0.1157684630738523, + "grad_norm": 7.963033814703697, + "learning_rate": 4.997015343353585e-07, + "logits/chosen": -15.05243968963623, + "logits/rejected": -15.26134967803955, + "logps/chosen": -407.4922790527344, + "logps/rejected": -400.91033935546875, + "loss": 0.6667, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.044123757630586624, + "rewards/margins": 0.07534614205360413, + "rewards/rejected": -0.03122239001095295, + "step": 58 + }, + { + "epoch": 0.11776447105788423, + "grad_norm": 7.537729687084787, + "learning_rate": 4.996101910454953e-07, + "logits/chosen": -14.969869613647461, + "logits/rejected": -14.01164436340332, + "logps/chosen": -338.7287292480469, + "logps/rejected": -342.40234375, + "loss": 0.6676, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02124447375535965, + "rewards/margins": 0.0819094106554985, + "rewards/rejected": -0.060664933174848557, + "step": 59 + }, + { + "epoch": 0.11976047904191617, + "grad_norm": 9.279313627982006, + "learning_rate": 4.995066821070679e-07, + "logits/chosen": -13.303523063659668, + "logits/rejected": -14.182500839233398, + "logps/chosen": -362.7261047363281, + "logps/rejected": -333.9490051269531, + "loss": 0.6709, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.003293365240097046, + "rewards/margins": 0.09314459562301636, + "rewards/rejected": -0.08985123038291931, + "step": 60 + }, + { + "epoch": 0.1217564870259481, + "grad_norm": 8.252068285364153, + "learning_rate": 4.99391012564956e-07, + "logits/chosen": -17.23371124267578, + "logits/rejected": -16.242280960083008, + "logps/chosen": -367.806396484375, + "logps/rejected": -337.6153259277344, + "loss": 0.6547, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03489462658762932, + "rewards/margins": 0.1051090657711029, + "rewards/rejected": -0.07021445780992508, + "step": 61 + }, + { + "epoch": 0.12375249500998003, + "grad_norm": 8.15636533439587, + "learning_rate": 4.9926318805673e-07, + "logits/chosen": -15.824554443359375, + "logits/rejected": -15.663161277770996, + "logps/chosen": -282.3473815917969, + "logps/rejected": -311.18994140625, + "loss": 0.6597, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09248015284538269, + "rewards/margins": 0.09078236669301987, + "rewards/rejected": 0.0016977828927338123, + "step": 62 + }, + { + "epoch": 0.12574850299401197, + "grad_norm": 8.910924596109384, + "learning_rate": 4.991232148123761e-07, + "logits/chosen": -16.677001953125, + "logits/rejected": -16.39942741394043, + "logps/chosen": -460.5334167480469, + "logps/rejected": -422.064453125, + "loss": 0.6681, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06818778812885284, + "rewards/margins": 0.01459517702460289, + "rewards/rejected": -0.08278295397758484, + "step": 63 + }, + { + "epoch": 0.1277445109780439, + "grad_norm": 8.580401141003072, + "learning_rate": 4.989710996539925e-07, + "logits/chosen": -15.317991256713867, + "logits/rejected": -15.26541519165039, + "logps/chosen": -424.9542541503906, + "logps/rejected": -399.70989990234375, + "loss": 0.6564, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08943880349397659, + "rewards/margins": 0.0476187989115715, + "rewards/rejected": -0.1370576024055481, + "step": 64 + }, + { + "epoch": 0.12974051896207583, + "grad_norm": 8.429082566678625, + "learning_rate": 4.988068499954577e-07, + "logits/chosen": -16.077041625976562, + "logits/rejected": -15.868209838867188, + "logps/chosen": -316.7906494140625, + "logps/rejected": -335.0377197265625, + "loss": 0.6581, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.016306515783071518, + "rewards/margins": 0.07164555788040161, + "rewards/rejected": -0.08795207738876343, + "step": 65 + }, + { + "epoch": 0.1317365269461078, + "grad_norm": 8.133889266001447, + "learning_rate": 4.986304738420683e-07, + "logits/chosen": -15.085855484008789, + "logits/rejected": -14.784677505493164, + "logps/chosen": -300.3466796875, + "logps/rejected": -314.3912353515625, + "loss": 0.6578, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04595138877630234, + "rewards/margins": 0.06701233983039856, + "rewards/rejected": -0.021060939878225327, + "step": 66 + }, + { + "epoch": 0.13373253493013973, + "grad_norm": 8.721591720129195, + "learning_rate": 4.984419797901491e-07, + "logits/chosen": -15.171606063842773, + "logits/rejected": -15.278976440429688, + "logps/chosen": -465.72027587890625, + "logps/rejected": -483.4525146484375, + "loss": 0.6438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07418551295995712, + "rewards/margins": 0.11256247758865356, + "rewards/rejected": -0.18674799799919128, + "step": 67 + }, + { + "epoch": 0.13572854291417166, + "grad_norm": 8.689476143706786, + "learning_rate": 4.982413770266342e-07, + "logits/chosen": -16.235498428344727, + "logits/rejected": -15.38123893737793, + "logps/chosen": -363.73419189453125, + "logps/rejected": -350.5956726074219, + "loss": 0.6554, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09112317860126495, + "rewards/margins": 0.10634519904851913, + "rewards/rejected": -0.19746838510036469, + "step": 68 + }, + { + "epoch": 0.1377245508982036, + "grad_norm": 9.320906157232027, + "learning_rate": 4.980286753286194e-07, + "logits/chosen": -14.806767463684082, + "logits/rejected": -15.064220428466797, + "logps/chosen": -229.83612060546875, + "logps/rejected": -257.00848388671875, + "loss": 0.649, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08728949725627899, + "rewards/margins": 0.05588501691818237, + "rewards/rejected": -0.14317449927330017, + "step": 69 + }, + { + "epoch": 0.13972055888223553, + "grad_norm": 8.45599582084097, + "learning_rate": 4.978038850628853e-07, + "logits/chosen": -15.640983581542969, + "logits/rejected": -15.810898780822754, + "logps/chosen": -403.06884765625, + "logps/rejected": -411.408203125, + "loss": 0.6461, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07357379794120789, + "rewards/margins": 0.10018520057201385, + "rewards/rejected": -0.17375899851322174, + "step": 70 + }, + { + "epoch": 0.14171656686626746, + "grad_norm": 8.397438393746963, + "learning_rate": 4.975670171853925e-07, + "logits/chosen": -15.794336318969727, + "logits/rejected": -15.847979545593262, + "logps/chosen": -379.279296875, + "logps/rejected": -363.3298645019531, + "loss": 0.6501, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12070687115192413, + "rewards/margins": 0.0922447144985199, + "rewards/rejected": -0.21295160055160522, + "step": 71 + }, + { + "epoch": 0.1437125748502994, + "grad_norm": 8.604521034039575, + "learning_rate": 4.973180832407471e-07, + "logits/chosen": -14.604567527770996, + "logits/rejected": -14.695852279663086, + "logps/chosen": -345.739990234375, + "logps/rejected": -436.0314636230469, + "loss": 0.6454, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07075143605470657, + "rewards/margins": 0.16379314661026, + "rewards/rejected": -0.23454459011554718, + "step": 72 + }, + { + "epoch": 0.14570858283433133, + "grad_norm": 8.431953705553173, + "learning_rate": 4.970570953616382e-07, + "logits/chosen": -14.503868103027344, + "logits/rejected": -15.155458450317383, + "logps/chosen": -326.4110107421875, + "logps/rejected": -384.87322998046875, + "loss": 0.648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15915606915950775, + "rewards/margins": 0.1893479824066162, + "rewards/rejected": -0.34850406646728516, + "step": 73 + }, + { + "epoch": 0.14770459081836326, + "grad_norm": 8.881932037242414, + "learning_rate": 4.96784066268247e-07, + "logits/chosen": -13.779004096984863, + "logits/rejected": -13.429590225219727, + "logps/chosen": -291.79296875, + "logps/rejected": -296.4447937011719, + "loss": 0.643, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.17777323722839355, + "rewards/margins": 0.02108706906437874, + "rewards/rejected": -0.1988603174686432, + "step": 74 + }, + { + "epoch": 0.1497005988023952, + "grad_norm": 8.314122527140741, + "learning_rate": 4.964990092676262e-07, + "logits/chosen": -17.725143432617188, + "logits/rejected": -17.505762100219727, + "logps/chosen": -341.1331787109375, + "logps/rejected": -350.88116455078125, + "loss": 0.6369, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1543130725622177, + "rewards/margins": 0.12310568988323212, + "rewards/rejected": -0.27741876244544983, + "step": 75 + }, + { + "epoch": 0.15169660678642716, + "grad_norm": 8.48077117653257, + "learning_rate": 4.96201938253052e-07, + "logits/chosen": -16.73549461364746, + "logits/rejected": -16.362672805786133, + "logps/chosen": -395.6476135253906, + "logps/rejected": -469.1103515625, + "loss": 0.6252, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19351297616958618, + "rewards/margins": 0.18316538631916046, + "rewards/rejected": -0.37667837738990784, + "step": 76 + }, + { + "epoch": 0.1536926147704591, + "grad_norm": 8.393604155470431, + "learning_rate": 4.958928677033465e-07, + "logits/chosen": -15.707889556884766, + "logits/rejected": -15.537942886352539, + "logps/chosen": -281.0392761230469, + "logps/rejected": -319.1485900878906, + "loss": 0.629, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06948637962341309, + "rewards/margins": 0.21688398718833923, + "rewards/rejected": -0.28637033700942993, + "step": 77 + }, + { + "epoch": 0.15568862275449102, + "grad_norm": 9.067089889668104, + "learning_rate": 4.955718126821722e-07, + "logits/chosen": -16.561952590942383, + "logits/rejected": -15.844078063964844, + "logps/chosen": -364.40185546875, + "logps/rejected": -347.0330505371094, + "loss": 0.628, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.16794858872890472, + "rewards/margins": 0.05846191197633743, + "rewards/rejected": -0.22641049325466156, + "step": 78 + }, + { + "epoch": 0.15768463073852296, + "grad_norm": 8.940066268140832, + "learning_rate": 4.952387888372978e-07, + "logits/chosen": -15.177964210510254, + "logits/rejected": -15.126307487487793, + "logps/chosen": -411.2934265136719, + "logps/rejected": -388.7569274902344, + "loss": 0.6447, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2629951238632202, + "rewards/margins": 0.11192844063043594, + "rewards/rejected": -0.37492355704307556, + "step": 79 + }, + { + "epoch": 0.1596806387225549, + "grad_norm": 8.869063942219517, + "learning_rate": 4.94893812399836e-07, + "logits/chosen": -15.601551055908203, + "logits/rejected": -16.042139053344727, + "logps/chosen": -344.1158752441406, + "logps/rejected": -439.0846862792969, + "loss": 0.6223, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.15164640545845032, + "rewards/margins": 0.31529542803764343, + "rewards/rejected": -0.46694183349609375, + "step": 80 + }, + { + "epoch": 0.16167664670658682, + "grad_norm": 9.779541683117705, + "learning_rate": 4.945369001834514e-07, + "logits/chosen": -16.39380645751953, + "logits/rejected": -15.366002082824707, + "logps/chosen": -427.91278076171875, + "logps/rejected": -421.6066589355469, + "loss": 0.6294, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22397363185882568, + "rewards/margins": 0.07985258847475052, + "rewards/rejected": -0.3038262128829956, + "step": 81 + }, + { + "epoch": 0.16367265469061876, + "grad_norm": 9.681655379739604, + "learning_rate": 4.941680695835419e-07, + "logits/chosen": -16.988998413085938, + "logits/rejected": -16.312742233276367, + "logps/chosen": -392.65618896484375, + "logps/rejected": -409.4969787597656, + "loss": 0.6472, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3562784492969513, + "rewards/margins": 0.05219919979572296, + "rewards/rejected": -0.40847766399383545, + "step": 82 + }, + { + "epoch": 0.1656686626746507, + "grad_norm": 9.018644132426164, + "learning_rate": 4.937873385763907e-07, + "logits/chosen": -18.049556732177734, + "logits/rejected": -16.927623748779297, + "logps/chosen": -329.4657287597656, + "logps/rejected": -301.9615783691406, + "loss": 0.617, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3110809922218323, + "rewards/margins": -0.030765339732170105, + "rewards/rejected": -0.280315637588501, + "step": 83 + }, + { + "epoch": 0.16766467065868262, + "grad_norm": 9.744502049109531, + "learning_rate": 4.9339472571829e-07, + "logits/chosen": -16.83614730834961, + "logits/rejected": -15.979333877563477, + "logps/chosen": -302.4591979980469, + "logps/rejected": -308.2034606933594, + "loss": 0.6372, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029211895540356636, + "rewards/margins": 0.14211447536945343, + "rewards/rejected": -0.17132636904716492, + "step": 84 + }, + { + "epoch": 0.16966067864271456, + "grad_norm": 10.062959151859104, + "learning_rate": 4.929902501446366e-07, + "logits/chosen": -16.791458129882812, + "logits/rejected": -16.60369300842285, + "logps/chosen": -301.0384521484375, + "logps/rejected": -344.17340087890625, + "loss": 0.6229, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.21959218382835388, + "rewards/margins": 0.10680700838565826, + "rewards/rejected": -0.32639920711517334, + "step": 85 + }, + { + "epoch": 0.17165668662674652, + "grad_norm": 8.899211999721798, + "learning_rate": 4.925739315689991e-07, + "logits/chosen": -17.578712463378906, + "logits/rejected": -17.100988388061523, + "logps/chosen": -392.40399169921875, + "logps/rejected": -408.375244140625, + "loss": 0.5926, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3149687647819519, + "rewards/margins": 0.22621186077594757, + "rewards/rejected": -0.5411806106567383, + "step": 86 + }, + { + "epoch": 0.17365269461077845, + "grad_norm": 9.058158084220052, + "learning_rate": 4.921457902821578e-07, + "logits/chosen": -18.370519638061523, + "logits/rejected": -18.149328231811523, + "logps/chosen": -506.34832763671875, + "logps/rejected": -465.40228271484375, + "loss": 0.6228, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5572055578231812, + "rewards/margins": 0.24167928099632263, + "rewards/rejected": -0.7988848686218262, + "step": 87 + }, + { + "epoch": 0.17564870259481039, + "grad_norm": 9.924143893065036, + "learning_rate": 4.917058471511148e-07, + "logits/chosen": -17.789403915405273, + "logits/rejected": -17.73256492614746, + "logps/chosen": -473.0888977050781, + "logps/rejected": -505.4393310546875, + "loss": 0.622, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.562219500541687, + "rewards/margins": 0.14176039397716522, + "rewards/rejected": -0.7039799690246582, + "step": 88 + }, + { + "epoch": 0.17764471057884232, + "grad_norm": 9.455855889835503, + "learning_rate": 4.912541236180778e-07, + "logits/chosen": -15.555730819702148, + "logits/rejected": -16.45915985107422, + "logps/chosen": -359.06304931640625, + "logps/rejected": -448.340087890625, + "loss": 0.6132, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1561770737171173, + "rewards/margins": 0.43770644068717957, + "rewards/rejected": -0.5938835144042969, + "step": 89 + }, + { + "epoch": 0.17964071856287425, + "grad_norm": 9.198885333259362, + "learning_rate": 4.907906416994145e-07, + "logits/chosen": -16.712560653686523, + "logits/rejected": -15.981279373168945, + "logps/chosen": -379.13067626953125, + "logps/rejected": -461.3275451660156, + "loss": 0.621, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15404972434043884, + "rewards/margins": 0.31564387679100037, + "rewards/rejected": -0.4696936309337616, + "step": 90 + }, + { + "epoch": 0.18163672654690619, + "grad_norm": 9.490922115703798, + "learning_rate": 4.903154239845797e-07, + "logits/chosen": -16.743335723876953, + "logits/rejected": -16.130474090576172, + "logps/chosen": -375.4258728027344, + "logps/rejected": -382.15667724609375, + "loss": 0.6204, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.46173471212387085, + "rewards/margins": 0.16935935616493225, + "rewards/rejected": -0.6310940980911255, + "step": 91 + }, + { + "epoch": 0.18363273453093812, + "grad_norm": 9.888840866130433, + "learning_rate": 4.898284936350143e-07, + "logits/chosen": -15.539291381835938, + "logits/rejected": -15.749015808105469, + "logps/chosen": -375.1903991699219, + "logps/rejected": -397.3068542480469, + "loss": 0.6055, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3286465108394623, + "rewards/margins": 0.2511574327945709, + "rewards/rejected": -0.5798039436340332, + "step": 92 + }, + { + "epoch": 0.18562874251497005, + "grad_norm": 9.534547072466589, + "learning_rate": 4.893298743830167e-07, + "logits/chosen": -17.950407028198242, + "logits/rejected": -17.723539352416992, + "logps/chosen": -570.85693359375, + "logps/rejected": -546.0053100585938, + "loss": 0.6082, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6157274842262268, + "rewards/margins": 0.40511518716812134, + "rewards/rejected": -1.0208425521850586, + "step": 93 + }, + { + "epoch": 0.18762475049900199, + "grad_norm": 9.722881859804074, + "learning_rate": 4.888195905305859e-07, + "logits/chosen": -17.128742218017578, + "logits/rejected": -16.86199951171875, + "logps/chosen": -364.7801513671875, + "logps/rejected": -412.03985595703125, + "loss": 0.6212, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35856327414512634, + "rewards/margins": 0.058156758546829224, + "rewards/rejected": -0.4167200028896332, + "step": 94 + }, + { + "epoch": 0.18962075848303392, + "grad_norm": 9.687485216578887, + "learning_rate": 4.882976669482367e-07, + "logits/chosen": -16.509841918945312, + "logits/rejected": -17.33835220336914, + "logps/chosen": -401.19586181640625, + "logps/rejected": -437.3283386230469, + "loss": 0.6115, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.44053059816360474, + "rewards/margins": 0.5870952606201172, + "rewards/rejected": -1.0276257991790771, + "step": 95 + }, + { + "epoch": 0.19161676646706588, + "grad_norm": 9.298831520670507, + "learning_rate": 4.877641290737883e-07, + "logits/chosen": -16.204208374023438, + "logits/rejected": -16.041362762451172, + "logps/chosen": -377.23699951171875, + "logps/rejected": -422.1312561035156, + "loss": 0.594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2721441388130188, + "rewards/margins": 0.31349363923072815, + "rewards/rejected": -0.5856378078460693, + "step": 96 + }, + { + "epoch": 0.1936127744510978, + "grad_norm": 11.267444543343759, + "learning_rate": 4.872190029111241e-07, + "logits/chosen": -17.558420181274414, + "logits/rejected": -17.034278869628906, + "logps/chosen": -502.9958190917969, + "logps/rejected": -536.4631958007812, + "loss": 0.593, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6253632307052612, + "rewards/margins": 0.29862356185913086, + "rewards/rejected": -0.9239866733551025, + "step": 97 + }, + { + "epoch": 0.19560878243512975, + "grad_norm": 9.47144132073207, + "learning_rate": 4.866623150289241e-07, + "logits/chosen": -17.21906852722168, + "logits/rejected": -16.40659523010254, + "logps/chosen": -308.1177062988281, + "logps/rejected": -365.7485656738281, + "loss": 0.5896, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20060190558433533, + "rewards/margins": 0.40862858295440674, + "rewards/rejected": -0.6092304587364197, + "step": 98 + }, + { + "epoch": 0.19760479041916168, + "grad_norm": 11.742965565854453, + "learning_rate": 4.860940925593702e-07, + "logits/chosen": -18.529356002807617, + "logits/rejected": -18.07916259765625, + "logps/chosen": -302.5202331542969, + "logps/rejected": -296.69964599609375, + "loss": 0.566, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18639332056045532, + "rewards/margins": 0.23985722661018372, + "rewards/rejected": -0.42625054717063904, + "step": 99 + }, + { + "epoch": 0.1996007984031936, + "grad_norm": 9.838131303202186, + "learning_rate": 4.855143631968242e-07, + "logits/chosen": -16.533798217773438, + "logits/rejected": -17.409114837646484, + "logps/chosen": -452.28173828125, + "logps/rejected": -523.7857055664062, + "loss": 0.576, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.42187410593032837, + "rewards/margins": 0.3787775933742523, + "rewards/rejected": -0.8006517887115479, + "step": 100 + }, + { + "epoch": 0.20159680638722555, + "grad_norm": 9.784365965262033, + "learning_rate": 4.849231551964771e-07, + "logits/chosen": -17.89838218688965, + "logits/rejected": -17.24924087524414, + "logps/chosen": -389.7870178222656, + "logps/rejected": -441.84857177734375, + "loss": 0.6049, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5550696849822998, + "rewards/margins": 0.31255972385406494, + "rewards/rejected": -0.86762934923172, + "step": 101 + }, + { + "epoch": 0.20359281437125748, + "grad_norm": 10.254231166703345, + "learning_rate": 4.843204973729728e-07, + "logits/chosen": -17.67897605895996, + "logits/rejected": -16.87425994873047, + "logps/chosen": -350.6794128417969, + "logps/rejected": -377.69073486328125, + "loss": 0.5967, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4413577914237976, + "rewards/margins": 0.24204562604427338, + "rewards/rejected": -0.6834034323692322, + "step": 102 + }, + { + "epoch": 0.2055888223552894, + "grad_norm": 10.177731859208695, + "learning_rate": 4.837064190990036e-07, + "logits/chosen": -17.95716094970703, + "logits/rejected": -18.56848907470703, + "logps/chosen": -363.04754638671875, + "logps/rejected": -426.90582275390625, + "loss": 0.5939, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4591674506664276, + "rewards/margins": 0.3569161593914032, + "rewards/rejected": -0.8160836100578308, + "step": 103 + }, + { + "epoch": 0.20758483033932135, + "grad_norm": 11.665978228628967, + "learning_rate": 4.830809503038781e-07, + "logits/chosen": -18.042129516601562, + "logits/rejected": -18.134384155273438, + "logps/chosen": -371.7164611816406, + "logps/rejected": -375.3335876464844, + "loss": 0.6234, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6777195334434509, + "rewards/margins": 0.03881584107875824, + "rewards/rejected": -0.7165352702140808, + "step": 104 + }, + { + "epoch": 0.20958083832335328, + "grad_norm": 10.307421231401023, + "learning_rate": 4.824441214720628e-07, + "logits/chosen": -17.154827117919922, + "logits/rejected": -16.737272262573242, + "logps/chosen": -415.67706298828125, + "logps/rejected": -465.8774719238281, + "loss": 0.5871, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7462588548660278, + "rewards/margins": 0.246952623128891, + "rewards/rejected": -0.9932115077972412, + "step": 105 + }, + { + "epoch": 0.21157684630738524, + "grad_norm": 10.785145631272416, + "learning_rate": 4.817959636416969e-07, + "logits/chosen": -16.967071533203125, + "logits/rejected": -17.13130760192871, + "logps/chosen": -345.673095703125, + "logps/rejected": -378.43780517578125, + "loss": 0.5532, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4800865054130554, + "rewards/margins": 0.3331327438354492, + "rewards/rejected": -0.8132193684577942, + "step": 106 + }, + { + "epoch": 0.21357285429141717, + "grad_norm": 10.599087532304829, + "learning_rate": 4.811365084030783e-07, + "logits/chosen": -15.638040542602539, + "logits/rejected": -16.557126998901367, + "logps/chosen": -444.3518371582031, + "logps/rejected": -541.2994995117188, + "loss": 0.5506, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5941789150238037, + "rewards/margins": 0.6498495936393738, + "rewards/rejected": -1.2440284490585327, + "step": 107 + }, + { + "epoch": 0.2155688622754491, + "grad_norm": 11.345714476429055, + "learning_rate": 4.804657878971251e-07, + "logits/chosen": -17.96299934387207, + "logits/rejected": -18.472097396850586, + "logps/chosen": -431.6180114746094, + "logps/rejected": -497.6167297363281, + "loss": 0.5714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7720993757247925, + "rewards/margins": 0.20353072881698608, + "rewards/rejected": -0.9756300449371338, + "step": 108 + }, + { + "epoch": 0.21756487025948104, + "grad_norm": 10.98517018534807, + "learning_rate": 4.797838348138086e-07, + "logits/chosen": -16.523500442504883, + "logits/rejected": -16.56661605834961, + "logps/chosen": -433.0921325683594, + "logps/rejected": -492.59112548828125, + "loss": 0.5571, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8152545690536499, + "rewards/margins": 0.39452555775642395, + "rewards/rejected": -1.2097800970077515, + "step": 109 + }, + { + "epoch": 0.21956087824351297, + "grad_norm": 10.837354651005112, + "learning_rate": 4.790906823905599e-07, + "logits/chosen": -17.36256980895996, + "logits/rejected": -16.92915153503418, + "logps/chosen": -410.4229431152344, + "logps/rejected": -412.46435546875, + "loss": 0.5787, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5949603319168091, + "rewards/margins": 0.22644329071044922, + "rewards/rejected": -0.8214036226272583, + "step": 110 + }, + { + "epoch": 0.2215568862275449, + "grad_norm": 10.320787241206938, + "learning_rate": 4.783863644106502e-07, + "logits/chosen": -16.421478271484375, + "logits/rejected": -16.64776039123535, + "logps/chosen": -483.7657165527344, + "logps/rejected": -481.812744140625, + "loss": 0.5564, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6031550765037537, + "rewards/margins": 0.19912101328372955, + "rewards/rejected": -0.8022760152816772, + "step": 111 + }, + { + "epoch": 0.22355289421157684, + "grad_norm": 11.107038600485746, + "learning_rate": 4.776709152015442e-07, + "logits/chosen": -18.22406005859375, + "logits/rejected": -18.18329429626465, + "logps/chosen": -334.51348876953125, + "logps/rejected": -383.4963684082031, + "loss": 0.598, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5061721205711365, + "rewards/margins": 0.23278102278709412, + "rewards/rejected": -0.7389531135559082, + "step": 112 + }, + { + "epoch": 0.22554890219560877, + "grad_norm": 10.862434019726457, + "learning_rate": 4.769443696332272e-07, + "logits/chosen": -16.520822525024414, + "logits/rejected": -17.432579040527344, + "logps/chosen": -359.88568115234375, + "logps/rejected": -433.949462890625, + "loss": 0.5686, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4676639437675476, + "rewards/margins": 0.6773457527160645, + "rewards/rejected": -1.1450097560882568, + "step": 113 + }, + { + "epoch": 0.2275449101796407, + "grad_norm": 10.633885886185016, + "learning_rate": 4.762067631165049e-07, + "logits/chosen": -16.13068962097168, + "logits/rejected": -16.44469451904297, + "logps/chosen": -433.1568298339844, + "logps/rejected": -488.2666931152344, + "loss": 0.5095, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5838625431060791, + "rewards/margins": 0.51032954454422, + "rewards/rejected": -1.0941921472549438, + "step": 114 + }, + { + "epoch": 0.22954091816367264, + "grad_norm": 11.311189673193253, + "learning_rate": 4.7545813160127845e-07, + "logits/chosen": -16.38141632080078, + "logits/rejected": -17.378694534301758, + "logps/chosen": -542.2471313476562, + "logps/rejected": -644.93115234375, + "loss": 0.5264, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7417050004005432, + "rewards/margins": 1.1188050508499146, + "rewards/rejected": -1.860509991645813, + "step": 115 + }, + { + "epoch": 0.2315369261477046, + "grad_norm": 11.760352337564447, + "learning_rate": 4.746985115747917e-07, + "logits/chosen": -17.631946563720703, + "logits/rejected": -18.053573608398438, + "logps/chosen": -447.10943603515625, + "logps/rejected": -478.54083251953125, + "loss": 0.5718, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8603832125663757, + "rewards/margins": 0.30809980630874634, + "rewards/rejected": -1.168483018875122, + "step": 116 + }, + { + "epoch": 0.23353293413173654, + "grad_norm": 13.092392132476672, + "learning_rate": 4.739279400598532e-07, + "logits/chosen": -17.92958641052246, + "logits/rejected": -17.64777946472168, + "logps/chosen": -566.686279296875, + "logps/rejected": -626.3263549804688, + "loss": 0.5499, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8065224885940552, + "rewards/margins": 0.33996978402137756, + "rewards/rejected": -1.1464921236038208, + "step": 117 + }, + { + "epoch": 0.23552894211576847, + "grad_norm": 11.695233651191389, + "learning_rate": 4.731464546130314e-07, + "logits/chosen": -18.524166107177734, + "logits/rejected": -17.927783966064453, + "logps/chosen": -480.1812744140625, + "logps/rejected": -493.7731628417969, + "loss": 0.5755, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9772423505783081, + "rewards/margins": 0.1880900263786316, + "rewards/rejected": -1.1653324365615845, + "step": 118 + }, + { + "epoch": 0.2375249500998004, + "grad_norm": 14.981531373880904, + "learning_rate": 4.7235409332282436e-07, + "logits/chosen": -18.190269470214844, + "logits/rejected": -18.184982299804688, + "logps/chosen": -418.0079345703125, + "logps/rejected": -410.58599853515625, + "loss": 0.5681, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8476977944374084, + "rewards/margins": 0.24571648240089417, + "rewards/rejected": -1.093414306640625, + "step": 119 + }, + { + "epoch": 0.23952095808383234, + "grad_norm": 12.8327990641794, + "learning_rate": 4.7155089480780365e-07, + "logits/chosen": -17.30999183654785, + "logits/rejected": -17.51451301574707, + "logps/chosen": -473.12420654296875, + "logps/rejected": -552.434814453125, + "loss": 0.5624, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9688943028450012, + "rewards/margins": 0.5823672413825989, + "rewards/rejected": -1.5512614250183105, + "step": 120 + }, + { + "epoch": 0.24151696606786427, + "grad_norm": 12.280307782693178, + "learning_rate": 4.707368982147317e-07, + "logits/chosen": -17.097469329833984, + "logits/rejected": -16.843915939331055, + "logps/chosen": -452.419189453125, + "logps/rejected": -520.343505859375, + "loss": 0.505, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6861757040023804, + "rewards/margins": 0.5018269419670105, + "rewards/rejected": -1.188002586364746, + "step": 121 + }, + { + "epoch": 0.2435129740518962, + "grad_norm": 12.97976538848426, + "learning_rate": 4.6991214321665414e-07, + "logits/chosen": -17.63509750366211, + "logits/rejected": -17.495311737060547, + "logps/chosen": -431.20465087890625, + "logps/rejected": -465.6344299316406, + "loss": 0.5491, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7622246742248535, + "rewards/margins": 0.2962070107460022, + "rewards/rejected": -1.0584317445755005, + "step": 122 + }, + { + "epoch": 0.24550898203592814, + "grad_norm": 11.496195773328525, + "learning_rate": 4.6907667001096585e-07, + "logits/chosen": -17.77838897705078, + "logits/rejected": -18.234045028686523, + "logps/chosen": -474.1757507324219, + "logps/rejected": -698.40478515625, + "loss": 0.5397, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8501319289207458, + "rewards/margins": 1.1107265949249268, + "rewards/rejected": -1.9608584642410278, + "step": 123 + }, + { + "epoch": 0.24750499001996007, + "grad_norm": 12.44934209996877, + "learning_rate": 4.6823051931745237e-07, + "logits/chosen": -18.704727172851562, + "logits/rejected": -18.47835350036621, + "logps/chosen": -354.03076171875, + "logps/rejected": -454.5083312988281, + "loss": 0.5607, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5832939147949219, + "rewards/margins": 0.8197604417800903, + "rewards/rejected": -1.4030543565750122, + "step": 124 + }, + { + "epoch": 0.249500998003992, + "grad_norm": 11.46676444609218, + "learning_rate": 4.6737373237630473e-07, + "logits/chosen": -17.123502731323242, + "logits/rejected": -17.59217071533203, + "logps/chosen": -421.08380126953125, + "logps/rejected": -530.366455078125, + "loss": 0.5211, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0707979202270508, + "rewards/margins": 0.6613157391548157, + "rewards/rejected": -1.7321135997772217, + "step": 125 + }, + { + "epoch": 0.25149700598802394, + "grad_norm": 12.183293436350857, + "learning_rate": 4.6650635094610966e-07, + "logits/chosen": -18.38361358642578, + "logits/rejected": -18.456607818603516, + "logps/chosen": -464.5298156738281, + "logps/rejected": -517.3018798828125, + "loss": 0.5149, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0340361595153809, + "rewards/margins": 0.34383463859558105, + "rewards/rejected": -1.3778706789016724, + "step": 126 + }, + { + "epoch": 0.25349301397205587, + "grad_norm": 11.999072355533364, + "learning_rate": 4.6562841730181435e-07, + "logits/chosen": -19.123811721801758, + "logits/rejected": -18.296850204467773, + "logps/chosen": -476.5393371582031, + "logps/rejected": -521.8916015625, + "loss": 0.5389, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2099838256835938, + "rewards/margins": 0.3222024738788605, + "rewards/rejected": -1.532186508178711, + "step": 127 + }, + { + "epoch": 0.2554890219560878, + "grad_norm": 12.045125886996352, + "learning_rate": 4.647399742326661e-07, + "logits/chosen": -17.34141731262207, + "logits/rejected": -17.46158218383789, + "logps/chosen": -414.46832275390625, + "logps/rejected": -451.8553466796875, + "loss": 0.5338, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9311251640319824, + "rewards/margins": 0.434777170419693, + "rewards/rejected": -1.365902304649353, + "step": 128 + }, + { + "epoch": 0.25748502994011974, + "grad_norm": 11.977860936951346, + "learning_rate": 4.6384106504012665e-07, + "logits/chosen": -17.99986457824707, + "logits/rejected": -17.715444564819336, + "logps/chosen": -358.08837890625, + "logps/rejected": -396.7147521972656, + "loss": 0.4943, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7580198049545288, + "rewards/margins": 0.34733161330223083, + "rewards/rejected": -1.105351448059082, + "step": 129 + }, + { + "epoch": 0.25948103792415167, + "grad_norm": 11.874157690404008, + "learning_rate": 4.6293173353576186e-07, + "logits/chosen": -19.319414138793945, + "logits/rejected": -19.246450424194336, + "logps/chosen": -460.84991455078125, + "logps/rejected": -561.8507690429688, + "loss": 0.5235, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8761596083641052, + "rewards/margins": 0.9677722454071045, + "rewards/rejected": -1.843931794166565, + "step": 130 + }, + { + "epoch": 0.26147704590818366, + "grad_norm": 11.642857128737496, + "learning_rate": 4.6201202403910643e-07, + "logits/chosen": -18.393766403198242, + "logits/rejected": -18.55428695678711, + "logps/chosen": -434.67999267578125, + "logps/rejected": -488.51824951171875, + "loss": 0.5162, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6767091751098633, + "rewards/margins": 0.5004065632820129, + "rewards/rejected": -1.1771156787872314, + "step": 131 + }, + { + "epoch": 0.2634730538922156, + "grad_norm": 11.491974226815117, + "learning_rate": 4.6108198137550377e-07, + "logits/chosen": -19.24776268005371, + "logits/rejected": -18.658472061157227, + "logps/chosen": -440.7890625, + "logps/rejected": -503.10595703125, + "loss": 0.4921, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9644001722335815, + "rewards/margins": 0.6125253438949585, + "rewards/rejected": -1.57692551612854, + "step": 132 + }, + { + "epoch": 0.2654690618762475, + "grad_norm": 12.899140291726132, + "learning_rate": 4.6014165087392105e-07, + "logits/chosen": -19.86013412475586, + "logits/rejected": -19.67055320739746, + "logps/chosen": -401.9028625488281, + "logps/rejected": -435.5128479003906, + "loss": 0.5094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9691441655158997, + "rewards/margins": 0.3254424035549164, + "rewards/rejected": -1.294586420059204, + "step": 133 + }, + { + "epoch": 0.26746506986027946, + "grad_norm": 16.439092593676218, + "learning_rate": 4.591910783647404e-07, + "logits/chosen": -18.951297760009766, + "logits/rejected": -19.075408935546875, + "logps/chosen": -498.13165283203125, + "logps/rejected": -586.1142578125, + "loss": 0.5117, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8596202731132507, + "rewards/margins": 0.8309850096702576, + "rewards/rejected": -1.6906054019927979, + "step": 134 + }, + { + "epoch": 0.2694610778443114, + "grad_norm": 16.957511057498678, + "learning_rate": 4.582303101775248e-07, + "logits/chosen": -18.295085906982422, + "logits/rejected": -17.897132873535156, + "logps/chosen": -457.41119384765625, + "logps/rejected": -517.1851196289062, + "loss": 0.5318, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8927091956138611, + "rewards/margins": 0.5656099319458008, + "rewards/rejected": -1.458319067955017, + "step": 135 + }, + { + "epoch": 0.2714570858283433, + "grad_norm": 12.506389333337408, + "learning_rate": 4.572593931387604e-07, + "logits/chosen": -18.31269073486328, + "logits/rejected": -18.61883544921875, + "logps/chosen": -440.43206787109375, + "logps/rejected": -573.1978759765625, + "loss": 0.4875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9546418190002441, + "rewards/margins": 0.8689519762992859, + "rewards/rejected": -1.8235938549041748, + "step": 136 + }, + { + "epoch": 0.27345309381237526, + "grad_norm": 14.756504766621427, + "learning_rate": 4.5627837456957374e-07, + "logits/chosen": -17.93488121032715, + "logits/rejected": -18.0272274017334, + "logps/chosen": -484.2279052734375, + "logps/rejected": -520.9596557617188, + "loss": 0.5817, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9371182322502136, + "rewards/margins": 0.3456663191318512, + "rewards/rejected": -1.2827844619750977, + "step": 137 + }, + { + "epoch": 0.2754491017964072, + "grad_norm": 13.948355377700631, + "learning_rate": 4.55287302283426e-07, + "logits/chosen": -18.253450393676758, + "logits/rejected": -17.966075897216797, + "logps/chosen": -420.6814880371094, + "logps/rejected": -508.517333984375, + "loss": 0.5273, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.033136010169983, + "rewards/margins": 0.7523062825202942, + "rewards/rejected": -1.7854422330856323, + "step": 138 + }, + { + "epoch": 0.2774451097804391, + "grad_norm": 12.827093040680634, + "learning_rate": 4.542862245837821e-07, + "logits/chosen": -18.383729934692383, + "logits/rejected": -18.069950103759766, + "logps/chosen": -398.27105712890625, + "logps/rejected": -538.0073852539062, + "loss": 0.4845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9848905205726624, + "rewards/margins": 0.9015528559684753, + "rewards/rejected": -1.8864431381225586, + "step": 139 + }, + { + "epoch": 0.27944111776447106, + "grad_norm": 16.782793626489322, + "learning_rate": 4.5327519026175686e-07, + "logits/chosen": -19.082073211669922, + "logits/rejected": -18.443805694580078, + "logps/chosen": -403.085693359375, + "logps/rejected": -444.4017639160156, + "loss": 0.5388, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9073647260665894, + "rewards/margins": 0.35679662227630615, + "rewards/rejected": -1.264161467552185, + "step": 140 + }, + { + "epoch": 0.281437125748503, + "grad_norm": 11.997391966133737, + "learning_rate": 4.5225424859373684e-07, + "logits/chosen": -18.931427001953125, + "logits/rejected": -18.6968936920166, + "logps/chosen": -479.6912841796875, + "logps/rejected": -543.6511840820312, + "loss": 0.5032, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1790649890899658, + "rewards/margins": 0.5665948987007141, + "rewards/rejected": -1.7456599473953247, + "step": 141 + }, + { + "epoch": 0.2834331337325349, + "grad_norm": 16.73284107773462, + "learning_rate": 4.512234493389785e-07, + "logits/chosen": -18.50774383544922, + "logits/rejected": -17.77086067199707, + "logps/chosen": -400.50604248046875, + "logps/rejected": -467.09259033203125, + "loss": 0.5364, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9760237336158752, + "rewards/margins": 0.4449756145477295, + "rewards/rejected": -1.42099928855896, + "step": 142 + }, + { + "epoch": 0.28542914171656686, + "grad_norm": 14.572499779841467, + "learning_rate": 4.501828427371833e-07, + "logits/chosen": -17.65532684326172, + "logits/rejected": -18.490184783935547, + "logps/chosen": -442.3606872558594, + "logps/rejected": -481.2702941894531, + "loss": 0.5458, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.434018850326538, + "rewards/margins": 0.32890772819519043, + "rewards/rejected": -1.7629268169403076, + "step": 143 + }, + { + "epoch": 0.2874251497005988, + "grad_norm": 12.077196392810203, + "learning_rate": 4.4913247950604903e-07, + "logits/chosen": -17.562110900878906, + "logits/rejected": -18.58519744873047, + "logps/chosen": -514.4218139648438, + "logps/rejected": -602.9424438476562, + "loss": 0.5105, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.461451768875122, + "rewards/margins": 0.6788410544395447, + "rewards/rejected": -2.1402928829193115, + "step": 144 + }, + { + "epoch": 0.2894211576846307, + "grad_norm": 12.508567044518502, + "learning_rate": 4.4807241083879764e-07, + "logits/chosen": -17.865825653076172, + "logits/rejected": -17.985769271850586, + "logps/chosen": -434.7288513183594, + "logps/rejected": -482.5139465332031, + "loss": 0.5639, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4301695823669434, + "rewards/margins": 0.34268102049827576, + "rewards/rejected": -1.772850751876831, + "step": 145 + }, + { + "epoch": 0.29141716566866266, + "grad_norm": 14.199498491697744, + "learning_rate": 4.470026884016804e-07, + "logits/chosen": -19.102584838867188, + "logits/rejected": -19.021583557128906, + "logps/chosen": -458.40802001953125, + "logps/rejected": -512.4900512695312, + "loss": 0.524, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1264287233352661, + "rewards/margins": 0.5926605463027954, + "rewards/rejected": -1.7190892696380615, + "step": 146 + }, + { + "epoch": 0.2934131736526946, + "grad_norm": 11.094319848631054, + "learning_rate": 4.459233643314599e-07, + "logits/chosen": -17.470544815063477, + "logits/rejected": -17.389942169189453, + "logps/chosen": -371.8180236816406, + "logps/rejected": -449.5108947753906, + "loss": 0.4906, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0187515020370483, + "rewards/margins": 0.49795636534690857, + "rewards/rejected": -1.5167080163955688, + "step": 147 + }, + { + "epoch": 0.2954091816367265, + "grad_norm": 11.94286172302262, + "learning_rate": 4.4483449123286855e-07, + "logits/chosen": -17.679344177246094, + "logits/rejected": -18.112321853637695, + "logps/chosen": -568.5883178710938, + "logps/rejected": -588.3480834960938, + "loss": 0.5598, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4919590950012207, + "rewards/margins": 0.22017580270767212, + "rewards/rejected": -1.7121349573135376, + "step": 148 + }, + { + "epoch": 0.29740518962075846, + "grad_norm": 13.921819284966194, + "learning_rate": 4.437361221760449e-07, + "logits/chosen": -19.270662307739258, + "logits/rejected": -19.816879272460938, + "logps/chosen": -478.0981140136719, + "logps/rejected": -570.726318359375, + "loss": 0.4884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8832843899726868, + "rewards/margins": 0.8125098347663879, + "rewards/rejected": -1.6957943439483643, + "step": 149 + }, + { + "epoch": 0.2994011976047904, + "grad_norm": 13.963951037655912, + "learning_rate": 4.426283106939473e-07, + "logits/chosen": -18.801692962646484, + "logits/rejected": -18.507511138916016, + "logps/chosen": -404.40924072265625, + "logps/rejected": -461.0819091796875, + "loss": 0.5169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9502855539321899, + "rewards/margins": 0.5388966798782349, + "rewards/rejected": -1.4891822338104248, + "step": 150 + }, + { + "epoch": 0.3013972055888224, + "grad_norm": 11.751403000253367, + "learning_rate": 4.415111107797445e-07, + "logits/chosen": -17.666833877563477, + "logits/rejected": -18.496967315673828, + "logps/chosen": -437.1878967285156, + "logps/rejected": -508.55517578125, + "loss": 0.4824, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1933588981628418, + "rewards/margins": 0.5103777050971985, + "rewards/rejected": -1.703736662864685, + "step": 151 + }, + { + "epoch": 0.3033932135728543, + "grad_norm": 15.664706001428366, + "learning_rate": 4.403845768841842e-07, + "logits/chosen": -19.15441131591797, + "logits/rejected": -19.0245418548584, + "logps/chosen": -517.4494018554688, + "logps/rejected": -583.9824829101562, + "loss": 0.5042, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1839749813079834, + "rewards/margins": 0.7368112206459045, + "rewards/rejected": -1.9207862615585327, + "step": 152 + }, + { + "epoch": 0.30538922155688625, + "grad_norm": 12.964369532636645, + "learning_rate": 4.392487639129391e-07, + "logits/chosen": -17.844913482666016, + "logits/rejected": -18.007299423217773, + "logps/chosen": -430.0899353027344, + "logps/rejected": -520.009033203125, + "loss": 0.4851, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9588636755943298, + "rewards/margins": 0.9035049080848694, + "rewards/rejected": -1.8623687028884888, + "step": 153 + }, + { + "epoch": 0.3073852295409182, + "grad_norm": 12.352390828674043, + "learning_rate": 4.3810372722393106e-07, + "logits/chosen": -18.35409927368164, + "logits/rejected": -18.485759735107422, + "logps/chosen": -433.47747802734375, + "logps/rejected": -456.7944030761719, + "loss": 0.508, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0077626705169678, + "rewards/margins": 0.3574880361557007, + "rewards/rejected": -1.3652507066726685, + "step": 154 + }, + { + "epoch": 0.3093812375249501, + "grad_norm": 12.60101053011718, + "learning_rate": 4.36949522624633e-07, + "logits/chosen": -18.486366271972656, + "logits/rejected": -18.509090423583984, + "logps/chosen": -511.802734375, + "logps/rejected": -631.6337890625, + "loss": 0.483, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3301135301589966, + "rewards/margins": 0.8965414762496948, + "rewards/rejected": -2.2266550064086914, + "step": 155 + }, + { + "epoch": 0.31137724550898205, + "grad_norm": 12.076316766290963, + "learning_rate": 4.357862063693485e-07, + "logits/chosen": -18.479555130004883, + "logits/rejected": -18.734378814697266, + "logps/chosen": -412.6598815917969, + "logps/rejected": -505.7449951171875, + "loss": 0.4914, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.813165545463562, + "rewards/margins": 0.7614642381668091, + "rewards/rejected": -1.574629783630371, + "step": 156 + }, + { + "epoch": 0.313373253493014, + "grad_norm": 14.618551602047281, + "learning_rate": 4.34613835156471e-07, + "logits/chosen": -18.515804290771484, + "logits/rejected": -18.076066970825195, + "logps/chosen": -518.2728271484375, + "logps/rejected": -609.0603637695312, + "loss": 0.475, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4048892259597778, + "rewards/margins": 0.7887292504310608, + "rewards/rejected": -2.1936185359954834, + "step": 157 + }, + { + "epoch": 0.3153692614770459, + "grad_norm": 14.08049532394846, + "learning_rate": 4.3343246612571905e-07, + "logits/chosen": -18.047651290893555, + "logits/rejected": -18.692232131958008, + "logps/chosen": -412.463623046875, + "logps/rejected": -513.392822265625, + "loss": 0.4961, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0158097743988037, + "rewards/margins": 0.7373302578926086, + "rewards/rejected": -1.753139853477478, + "step": 158 + }, + { + "epoch": 0.31736526946107785, + "grad_norm": 12.549764922498149, + "learning_rate": 4.3224215685535287e-07, + "logits/chosen": -18.217510223388672, + "logits/rejected": -18.150440216064453, + "logps/chosen": -432.1754455566406, + "logps/rejected": -474.58477783203125, + "loss": 0.4799, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9655523896217346, + "rewards/margins": 0.6841700673103333, + "rewards/rejected": -1.6497223377227783, + "step": 159 + }, + { + "epoch": 0.3193612774451098, + "grad_norm": 19.934375633780295, + "learning_rate": 4.310429653593669e-07, + "logits/chosen": -19.979171752929688, + "logits/rejected": -20.37800407409668, + "logps/chosen": -465.45416259765625, + "logps/rejected": -549.4246826171875, + "loss": 0.5618, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0206208229064941, + "rewards/margins": 0.7802413702011108, + "rewards/rejected": -1.800862193107605, + "step": 160 + }, + { + "epoch": 0.3213572854291417, + "grad_norm": 14.01791810960553, + "learning_rate": 4.2983495008466273e-07, + "logits/chosen": -19.8514404296875, + "logits/rejected": -19.740142822265625, + "logps/chosen": -557.4789428710938, + "logps/rejected": -551.2369995117188, + "loss": 0.4997, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4763171672821045, + "rewards/margins": 0.34967875480651855, + "rewards/rejected": -1.825995922088623, + "step": 161 + }, + { + "epoch": 0.32335329341317365, + "grad_norm": 12.51564581989701, + "learning_rate": 4.286181699082008e-07, + "logits/chosen": -19.269309997558594, + "logits/rejected": -19.268407821655273, + "logps/chosen": -471.971923828125, + "logps/rejected": -514.5819702148438, + "loss": 0.4595, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1184983253479004, + "rewards/margins": 0.3826027512550354, + "rewards/rejected": -1.5011011362075806, + "step": 162 + }, + { + "epoch": 0.3253493013972056, + "grad_norm": 15.542357689555216, + "learning_rate": 4.273926841341302e-07, + "logits/chosen": -20.759143829345703, + "logits/rejected": -20.3962345123291, + "logps/chosen": -425.91998291015625, + "logps/rejected": -551.6376953125, + "loss": 0.4915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8230355978012085, + "rewards/margins": 1.2822701930999756, + "rewards/rejected": -2.1053059101104736, + "step": 163 + }, + { + "epoch": 0.3273453093812375, + "grad_norm": 11.931760390211124, + "learning_rate": 4.2615855249089867e-07, + "logits/chosen": -19.185287475585938, + "logits/rejected": -19.267969131469727, + "logps/chosen": -475.9891662597656, + "logps/rejected": -579.8348388671875, + "loss": 0.5064, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2812844514846802, + "rewards/margins": 0.9683108329772949, + "rewards/rejected": -2.2495951652526855, + "step": 164 + }, + { + "epoch": 0.32934131736526945, + "grad_norm": 14.325662785483841, + "learning_rate": 4.249158351283413e-07, + "logits/chosen": -18.518449783325195, + "logits/rejected": -18.800861358642578, + "logps/chosen": -423.35260009765625, + "logps/rejected": -591.2802734375, + "loss": 0.4822, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.156442642211914, + "rewards/margins": 1.339384913444519, + "rewards/rejected": -2.4958276748657227, + "step": 165 + }, + { + "epoch": 0.3313373253493014, + "grad_norm": 13.209121169799278, + "learning_rate": 4.236645926147493e-07, + "logits/chosen": -18.23113250732422, + "logits/rejected": -17.86396598815918, + "logps/chosen": -509.241943359375, + "logps/rejected": -584.48193359375, + "loss": 0.5113, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3406342267990112, + "rewards/margins": 0.4660688638687134, + "rewards/rejected": -1.8067032098770142, + "step": 166 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 12.679678337512888, + "learning_rate": 4.224048859339174e-07, + "logits/chosen": -20.044212341308594, + "logits/rejected": -19.84770965576172, + "logps/chosen": -474.1540222167969, + "logps/rejected": -576.6797485351562, + "loss": 0.4782, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5084277391433716, + "rewards/margins": 0.8898066878318787, + "rewards/rejected": -2.3982343673706055, + "step": 167 + }, + { + "epoch": 0.33532934131736525, + "grad_norm": 12.803928092289635, + "learning_rate": 4.2113677648217216e-07, + "logits/chosen": -19.095563888549805, + "logits/rejected": -18.801067352294922, + "logps/chosen": -410.49737548828125, + "logps/rejected": -484.6048278808594, + "loss": 0.4629, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1591637134552002, + "rewards/margins": 0.666135847568512, + "rewards/rejected": -1.8252995014190674, + "step": 168 + }, + { + "epoch": 0.3373253493013972, + "grad_norm": 14.174173721204978, + "learning_rate": 4.1986032606537916e-07, + "logits/chosen": -16.58905792236328, + "logits/rejected": -16.84238624572754, + "logps/chosen": -569.0613403320312, + "logps/rejected": -614.7280883789062, + "loss": 0.5008, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4528391361236572, + "rewards/margins": 0.6195281744003296, + "rewards/rejected": -2.0723674297332764, + "step": 169 + }, + { + "epoch": 0.3393213572854291, + "grad_norm": 13.080053722751183, + "learning_rate": 4.1857559689593083e-07, + "logits/chosen": -18.27602767944336, + "logits/rejected": -17.57232666015625, + "logps/chosen": -453.475830078125, + "logps/rejected": -515.5191040039062, + "loss": 0.4821, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3552886247634888, + "rewards/margins": 0.832838773727417, + "rewards/rejected": -2.1881275177001953, + "step": 170 + }, + { + "epoch": 0.3413173652694611, + "grad_norm": 12.364971785532722, + "learning_rate": 4.172826515897145e-07, + "logits/chosen": -18.919944763183594, + "logits/rejected": -19.389076232910156, + "logps/chosen": -388.2543029785156, + "logps/rejected": -470.0887451171875, + "loss": 0.4563, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9409610033035278, + "rewards/margins": 0.7529804706573486, + "rewards/rejected": -1.6939414739608765, + "step": 171 + }, + { + "epoch": 0.34331337325349304, + "grad_norm": 13.246390983471949, + "learning_rate": 4.1598155316306037e-07, + "logits/chosen": -18.05150032043457, + "logits/rejected": -18.103609085083008, + "logps/chosen": -473.530029296875, + "logps/rejected": -540.929443359375, + "loss": 0.4981, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.117943525314331, + "rewards/margins": 0.5308763384819031, + "rewards/rejected": -1.6488198041915894, + "step": 172 + }, + { + "epoch": 0.34530938123752497, + "grad_norm": 13.04681058875418, + "learning_rate": 4.146723650296701e-07, + "logits/chosen": -20.524463653564453, + "logits/rejected": -20.365554809570312, + "logps/chosen": -426.28448486328125, + "logps/rejected": -516.8076782226562, + "loss": 0.5333, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.146423101425171, + "rewards/margins": 0.9261961579322815, + "rewards/rejected": -2.0726191997528076, + "step": 173 + }, + { + "epoch": 0.3473053892215569, + "grad_norm": 13.558902213534159, + "learning_rate": 4.133551509975264e-07, + "logits/chosen": -18.120737075805664, + "logits/rejected": -17.530546188354492, + "logps/chosen": -374.5181884765625, + "logps/rejected": -488.8734130859375, + "loss": 0.478, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8048744201660156, + "rewards/margins": 1.0980815887451172, + "rewards/rejected": -1.9029561281204224, + "step": 174 + }, + { + "epoch": 0.34930139720558884, + "grad_norm": 12.399640912045163, + "learning_rate": 4.120299752657827e-07, + "logits/chosen": -20.241302490234375, + "logits/rejected": -19.89463233947754, + "logps/chosen": -435.4852294921875, + "logps/rejected": -600.3646240234375, + "loss": 0.4833, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0055052042007446, + "rewards/margins": 1.095568299293518, + "rewards/rejected": -2.1010735034942627, + "step": 175 + }, + { + "epoch": 0.35129740518962077, + "grad_norm": 18.67991630316, + "learning_rate": 4.106969024216348e-07, + "logits/chosen": -20.45185661315918, + "logits/rejected": -20.16585922241211, + "logps/chosen": -495.82159423828125, + "logps/rejected": -615.032958984375, + "loss": 0.5435, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4768478870391846, + "rewards/margins": 0.9564065933227539, + "rewards/rejected": -2.4332542419433594, + "step": 176 + }, + { + "epoch": 0.3532934131736527, + "grad_norm": 14.433114513707654, + "learning_rate": 4.0935599743717244e-07, + "logits/chosen": -19.229232788085938, + "logits/rejected": -19.95961570739746, + "logps/chosen": -481.91278076171875, + "logps/rejected": -587.878662109375, + "loss": 0.478, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3701449632644653, + "rewards/margins": 1.1796942949295044, + "rewards/rejected": -2.549839496612549, + "step": 177 + }, + { + "epoch": 0.35528942115768464, + "grad_norm": 13.188342082066123, + "learning_rate": 4.080073256662127e-07, + "logits/chosen": -18.213958740234375, + "logits/rejected": -19.102828979492188, + "logps/chosen": -580.2965087890625, + "logps/rejected": -703.9386596679688, + "loss": 0.4952, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4226490259170532, + "rewards/margins": 1.1172631978988647, + "rewards/rejected": -2.539912223815918, + "step": 178 + }, + { + "epoch": 0.35728542914171657, + "grad_norm": 13.727220401466234, + "learning_rate": 4.066509528411151e-07, + "logits/chosen": -19.163558959960938, + "logits/rejected": -20.03252601623535, + "logps/chosen": -416.621826171875, + "logps/rejected": -556.1578979492188, + "loss": 0.4664, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9182752370834351, + "rewards/margins": 1.0827980041503906, + "rewards/rejected": -2.001073122024536, + "step": 179 + }, + { + "epoch": 0.3592814371257485, + "grad_norm": 13.429269056852185, + "learning_rate": 4.0528694506957754e-07, + "logits/chosen": -19.609607696533203, + "logits/rejected": -19.0802059173584, + "logps/chosen": -463.4999694824219, + "logps/rejected": -571.7774047851562, + "loss": 0.4641, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1410191059112549, + "rewards/margins": 0.9015007019042969, + "rewards/rejected": -2.0425198078155518, + "step": 180 + }, + { + "epoch": 0.36127744510978044, + "grad_norm": 13.035911390029865, + "learning_rate": 4.039153688314145e-07, + "logits/chosen": -19.555572509765625, + "logits/rejected": -19.886554718017578, + "logps/chosen": -429.10003662109375, + "logps/rejected": -526.60888671875, + "loss": 0.4332, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4029077291488647, + "rewards/margins": 0.8520881533622742, + "rewards/rejected": -2.254995822906494, + "step": 181 + }, + { + "epoch": 0.36327345309381237, + "grad_norm": 15.508960229044732, + "learning_rate": 4.025362909753169e-07, + "logits/chosen": -18.01523208618164, + "logits/rejected": -18.14191246032715, + "logps/chosen": -425.21063232421875, + "logps/rejected": -510.28546142578125, + "loss": 0.4844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3480522632598877, + "rewards/margins": 0.6659371852874756, + "rewards/rejected": -2.0139894485473633, + "step": 182 + }, + { + "epoch": 0.3652694610778443, + "grad_norm": 16.333189353880638, + "learning_rate": 4.0114977871559377e-07, + "logits/chosen": -19.889019012451172, + "logits/rejected": -20.158649444580078, + "logps/chosen": -388.4324645996094, + "logps/rejected": -496.0431823730469, + "loss": 0.502, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3997801542282104, + "rewards/margins": 0.8247696161270142, + "rewards/rejected": -2.2245497703552246, + "step": 183 + }, + { + "epoch": 0.36726546906187624, + "grad_norm": 12.698608735181692, + "learning_rate": 3.997558996288964e-07, + "logits/chosen": -19.303394317626953, + "logits/rejected": -18.998262405395508, + "logps/chosen": -573.5017700195312, + "logps/rejected": -647.8325805664062, + "loss": 0.4686, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7628663778305054, + "rewards/margins": 0.7538273334503174, + "rewards/rejected": -2.5166938304901123, + "step": 184 + }, + { + "epoch": 0.36926147704590817, + "grad_norm": 14.891574400285474, + "learning_rate": 3.983547216509254e-07, + "logits/chosen": -19.312734603881836, + "logits/rejected": -19.04522132873535, + "logps/chosen": -529.5467529296875, + "logps/rejected": -636.4886474609375, + "loss": 0.4513, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.68673574924469, + "rewards/margins": 0.7229608297348022, + "rewards/rejected": -2.409696578979492, + "step": 185 + }, + { + "epoch": 0.3712574850299401, + "grad_norm": 14.363890009536421, + "learning_rate": 3.9694631307311825e-07, + "logits/chosen": -20.22901153564453, + "logits/rejected": -19.24595832824707, + "logps/chosen": -568.1968994140625, + "logps/rejected": -576.79052734375, + "loss": 0.4763, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8447073698043823, + "rewards/margins": 0.5339704751968384, + "rewards/rejected": -2.3786778450012207, + "step": 186 + }, + { + "epoch": 0.37325349301397204, + "grad_norm": 26.393758306411566, + "learning_rate": 3.9553074253932233e-07, + "logits/chosen": -19.26047134399414, + "logits/rejected": -19.362327575683594, + "logps/chosen": -527.1236572265625, + "logps/rejected": -585.4468994140625, + "loss": 0.5022, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4415079355239868, + "rewards/margins": 0.6697221994400024, + "rewards/rejected": -2.1112301349639893, + "step": 187 + }, + { + "epoch": 0.37524950099800397, + "grad_norm": 13.852299099165053, + "learning_rate": 3.941080790424483e-07, + "logits/chosen": -19.18405532836914, + "logits/rejected": -19.502866744995117, + "logps/chosen": -496.4898376464844, + "logps/rejected": -598.0606079101562, + "loss": 0.5085, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5746562480926514, + "rewards/margins": 0.7243920564651489, + "rewards/rejected": -2.2990481853485107, + "step": 188 + }, + { + "epoch": 0.3772455089820359, + "grad_norm": 12.900703996378226, + "learning_rate": 3.9267839192110797e-07, + "logits/chosen": -18.6906795501709, + "logits/rejected": -19.030370712280273, + "logps/chosen": -507.127197265625, + "logps/rejected": -523.0045776367188, + "loss": 0.4568, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4412367343902588, + "rewards/margins": 0.3045232892036438, + "rewards/rejected": -1.7457599639892578, + "step": 189 + }, + { + "epoch": 0.37924151696606784, + "grad_norm": 14.68383696725854, + "learning_rate": 3.912417508562345e-07, + "logits/chosen": -19.418498992919922, + "logits/rejected": -19.330608367919922, + "logps/chosen": -421.99456787109375, + "logps/rejected": -503.4772033691406, + "loss": 0.4865, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.183841347694397, + "rewards/margins": 0.7099670171737671, + "rewards/rejected": -1.8938082456588745, + "step": 190 + }, + { + "epoch": 0.3812375249500998, + "grad_norm": 11.75575189656368, + "learning_rate": 3.8979822586768666e-07, + "logits/chosen": -21.16440200805664, + "logits/rejected": -20.485124588012695, + "logps/chosen": -514.5684204101562, + "logps/rejected": -571.970703125, + "loss": 0.4889, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4731186628341675, + "rewards/margins": 0.6870366930961609, + "rewards/rejected": -2.1601555347442627, + "step": 191 + }, + { + "epoch": 0.38323353293413176, + "grad_norm": 14.448756790216004, + "learning_rate": 3.88347887310836e-07, + "logits/chosen": -20.147769927978516, + "logits/rejected": -19.916521072387695, + "logps/chosen": -447.382080078125, + "logps/rejected": -597.453125, + "loss": 0.457, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4994301795959473, + "rewards/margins": 1.272138237953186, + "rewards/rejected": -2.771568536758423, + "step": 192 + }, + { + "epoch": 0.3852295409181637, + "grad_norm": 12.23374123752803, + "learning_rate": 3.8689080587313755e-07, + "logits/chosen": -19.599998474121094, + "logits/rejected": -19.84237289428711, + "logps/chosen": -470.7962646484375, + "logps/rejected": -576.8146362304688, + "loss": 0.4514, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.35133695602417, + "rewards/margins": 0.9762927293777466, + "rewards/rejected": -2.327629804611206, + "step": 193 + }, + { + "epoch": 0.3872255489021956, + "grad_norm": 14.698946134402291, + "learning_rate": 3.85427052570685e-07, + "logits/chosen": -19.12531280517578, + "logits/rejected": -18.746444702148438, + "logps/chosen": -468.0195007324219, + "logps/rejected": -533.82177734375, + "loss": 0.483, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.414006233215332, + "rewards/margins": 0.6618885397911072, + "rewards/rejected": -2.075894832611084, + "step": 194 + }, + { + "epoch": 0.38922155688622756, + "grad_norm": 15.811852032735173, + "learning_rate": 3.839566987447491e-07, + "logits/chosen": -19.964893341064453, + "logits/rejected": -19.358760833740234, + "logps/chosen": -411.2624816894531, + "logps/rejected": -456.06390380859375, + "loss": 0.4831, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0391451120376587, + "rewards/margins": 0.6880395412445068, + "rewards/rejected": -1.7271846532821655, + "step": 195 + }, + { + "epoch": 0.3912175648702595, + "grad_norm": 12.305213877004382, + "learning_rate": 3.824798160583012e-07, + "logits/chosen": -19.77585220336914, + "logits/rejected": -19.821577072143555, + "logps/chosen": -391.9801025390625, + "logps/rejected": -514.4705810546875, + "loss": 0.4478, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8564898371696472, + "rewards/margins": 1.0837904214859009, + "rewards/rejected": -1.9402803182601929, + "step": 196 + }, + { + "epoch": 0.3932135728542914, + "grad_norm": 14.833698621408654, + "learning_rate": 3.809964764925198e-07, + "logits/chosen": -19.650461196899414, + "logits/rejected": -18.813337326049805, + "logps/chosen": -556.4791259765625, + "logps/rejected": -641.11083984375, + "loss": 0.4637, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5146292448043823, + "rewards/margins": 1.2214813232421875, + "rewards/rejected": -2.7361104488372803, + "step": 197 + }, + { + "epoch": 0.39520958083832336, + "grad_norm": 13.357898707821814, + "learning_rate": 3.7950675234328256e-07, + "logits/chosen": -20.979297637939453, + "logits/rejected": -20.838703155517578, + "logps/chosen": -502.97607421875, + "logps/rejected": -646.2918090820312, + "loss": 0.4448, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1799731254577637, + "rewards/margins": 1.185584545135498, + "rewards/rejected": -3.3655576705932617, + "step": 198 + }, + { + "epoch": 0.3972055888223553, + "grad_norm": 15.671442333902643, + "learning_rate": 3.780107162176429e-07, + "logits/chosen": -19.549230575561523, + "logits/rejected": -20.148639678955078, + "logps/chosen": -540.7632446289062, + "logps/rejected": -629.897705078125, + "loss": 0.4734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1680128574371338, + "rewards/margins": 0.9218576550483704, + "rewards/rejected": -2.0898704528808594, + "step": 199 + }, + { + "epoch": 0.3992015968063872, + "grad_norm": 14.718831099618205, + "learning_rate": 3.765084410302909e-07, + "logits/chosen": -21.243850708007812, + "logits/rejected": -21.027610778808594, + "logps/chosen": -570.447998046875, + "logps/rejected": -706.76220703125, + "loss": 0.4543, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8626515865325928, + "rewards/margins": 1.4691358804702759, + "rewards/rejected": -3.331787347793579, + "step": 200 + }, + { + "epoch": 0.40119760479041916, + "grad_norm": 17.61224276039994, + "learning_rate": 3.75e-07, + "logits/chosen": -19.843793869018555, + "logits/rejected": -19.291797637939453, + "logps/chosen": -513.7886962890625, + "logps/rejected": -493.4097900390625, + "loss": 0.4852, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8454079627990723, + "rewards/margins": 0.3778110444545746, + "rewards/rejected": -2.223219156265259, + "step": 201 + }, + { + "epoch": 0.4031936127744511, + "grad_norm": 14.06450720707383, + "learning_rate": 3.734854666460577e-07, + "logits/chosen": -19.530370712280273, + "logits/rejected": -19.210784912109375, + "logps/chosen": -490.2061767578125, + "logps/rejected": -568.5958862304688, + "loss": 0.4496, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6006358861923218, + "rewards/margins": 0.7596541047096252, + "rewards/rejected": -2.360290050506592, + "step": 202 + }, + { + "epoch": 0.405189620758483, + "grad_norm": 14.859823057968415, + "learning_rate": 3.7196491478468316e-07, + "logits/chosen": -18.494264602661133, + "logits/rejected": -18.339521408081055, + "logps/chosen": -529.4029541015625, + "logps/rejected": -648.5390625, + "loss": 0.4919, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5195198059082031, + "rewards/margins": 0.6871219873428345, + "rewards/rejected": -2.206641674041748, + "step": 203 + }, + { + "epoch": 0.40718562874251496, + "grad_norm": 13.639472563583217, + "learning_rate": 3.704384185254288e-07, + "logits/chosen": -19.484989166259766, + "logits/rejected": -19.255311965942383, + "logps/chosen": -525.3623046875, + "logps/rejected": -636.157958984375, + "loss": 0.4336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5300272703170776, + "rewards/margins": 0.6417598724365234, + "rewards/rejected": -2.1717870235443115, + "step": 204 + }, + { + "epoch": 0.4091816367265469, + "grad_norm": 12.901826312150071, + "learning_rate": 3.689060522675688e-07, + "logits/chosen": -20.066303253173828, + "logits/rejected": -20.39885139465332, + "logps/chosen": -607.7792358398438, + "logps/rejected": -796.8943481445312, + "loss": 0.4076, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6671760082244873, + "rewards/margins": 1.1472505331039429, + "rewards/rejected": -2.8144266605377197, + "step": 205 + }, + { + "epoch": 0.4111776447105788, + "grad_norm": 16.04548062088645, + "learning_rate": 3.673678906964727e-07, + "logits/chosen": -20.172260284423828, + "logits/rejected": -19.925201416015625, + "logps/chosen": -427.82568359375, + "logps/rejected": -501.93438720703125, + "loss": 0.4985, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4109199047088623, + "rewards/margins": 0.7991017699241638, + "rewards/rejected": -2.210021734237671, + "step": 206 + }, + { + "epoch": 0.41317365269461076, + "grad_norm": 13.933597686095297, + "learning_rate": 3.658240087799654e-07, + "logits/chosen": -21.581363677978516, + "logits/rejected": -21.107837677001953, + "logps/chosen": -417.2480163574219, + "logps/rejected": -577.9209594726562, + "loss": 0.4376, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2277367115020752, + "rewards/margins": 1.1889203786849976, + "rewards/rejected": -2.4166574478149414, + "step": 207 + }, + { + "epoch": 0.4151696606786427, + "grad_norm": 17.56870232794954, + "learning_rate": 3.6427448176467357e-07, + "logits/chosen": -18.793655395507812, + "logits/rejected": -19.212615966796875, + "logps/chosen": -690.8632202148438, + "logps/rejected": -731.3242797851562, + "loss": 0.4648, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.026750087738037, + "rewards/margins": 0.9612019062042236, + "rewards/rejected": -2.9879517555236816, + "step": 208 + }, + { + "epoch": 0.4171656686626746, + "grad_norm": 12.43534426487344, + "learning_rate": 3.6271938517235765e-07, + "logits/chosen": -19.1177921295166, + "logits/rejected": -18.949975967407227, + "logps/chosen": -414.0655517578125, + "logps/rejected": -504.7939147949219, + "loss": 0.4392, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1112933158874512, + "rewards/margins": 0.9417324066162109, + "rewards/rejected": -2.053025484085083, + "step": 209 + }, + { + "epoch": 0.41916167664670656, + "grad_norm": 14.047345814245144, + "learning_rate": 3.6115879479623183e-07, + "logits/chosen": -20.57330322265625, + "logits/rejected": -20.117856979370117, + "logps/chosen": -516.2587890625, + "logps/rejected": -601.1445922851562, + "loss": 0.446, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6481542587280273, + "rewards/margins": 1.0199745893478394, + "rewards/rejected": -2.6681289672851562, + "step": 210 + }, + { + "epoch": 0.42115768463073855, + "grad_norm": 12.856885130231753, + "learning_rate": 3.595927866972693e-07, + "logits/chosen": -20.086902618408203, + "logits/rejected": -20.204526901245117, + "logps/chosen": -525.0537109375, + "logps/rejected": -619.6348266601562, + "loss": 0.4773, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8990871906280518, + "rewards/margins": 0.745017409324646, + "rewards/rejected": -2.644104480743408, + "step": 211 + }, + { + "epoch": 0.4231536926147705, + "grad_norm": 12.98210694751887, + "learning_rate": 3.580214372004956e-07, + "logits/chosen": -19.950450897216797, + "logits/rejected": -20.33487319946289, + "logps/chosen": -394.83404541015625, + "logps/rejected": -497.79345703125, + "loss": 0.4549, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2214103937149048, + "rewards/margins": 0.9347801804542542, + "rewards/rejected": -2.156190872192383, + "step": 212 + }, + { + "epoch": 0.4251497005988024, + "grad_norm": 19.31248906417738, + "learning_rate": 3.5644482289126813e-07, + "logits/chosen": -18.889362335205078, + "logits/rejected": -19.420923233032227, + "logps/chosen": -548.869873046875, + "logps/rejected": -656.6920166015625, + "loss": 0.515, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7871437072753906, + "rewards/margins": 0.8735736608505249, + "rewards/rejected": -2.660717487335205, + "step": 213 + }, + { + "epoch": 0.42714570858283435, + "grad_norm": 15.762431059083589, + "learning_rate": 3.548630206115443e-07, + "logits/chosen": -20.15378189086914, + "logits/rejected": -20.292001724243164, + "logps/chosen": -404.6817626953125, + "logps/rejected": -477.21563720703125, + "loss": 0.4503, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3599151372909546, + "rewards/margins": 0.5870508551597595, + "rewards/rejected": -1.946966290473938, + "step": 214 + }, + { + "epoch": 0.4291417165668663, + "grad_norm": 11.197156920062987, + "learning_rate": 3.5327610745613546e-07, + "logits/chosen": -19.350055694580078, + "logits/rejected": -19.167869567871094, + "logps/chosen": -528.2733154296875, + "logps/rejected": -597.0432739257812, + "loss": 0.4185, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5914320945739746, + "rewards/margins": 0.8656637668609619, + "rewards/rejected": -2.4570956230163574, + "step": 215 + }, + { + "epoch": 0.4311377245508982, + "grad_norm": 15.40471464271596, + "learning_rate": 3.516841607689501e-07, + "logits/chosen": -19.665361404418945, + "logits/rejected": -19.29046058654785, + "logps/chosen": -510.60125732421875, + "logps/rejected": -581.7286376953125, + "loss": 0.4988, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5553991794586182, + "rewards/margins": 1.0072236061096191, + "rewards/rejected": -2.5626227855682373, + "step": 216 + }, + { + "epoch": 0.43313373253493015, + "grad_norm": 11.966288977941108, + "learning_rate": 3.500872581392238e-07, + "logits/chosen": -19.7708740234375, + "logits/rejected": -20.453022003173828, + "logps/chosen": -386.0968017578125, + "logps/rejected": -516.030029296875, + "loss": 0.4256, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.455343246459961, + "rewards/margins": 0.9787195920944214, + "rewards/rejected": -2.434062957763672, + "step": 217 + }, + { + "epoch": 0.4351297405189621, + "grad_norm": 12.989461741224307, + "learning_rate": 3.4848547739773774e-07, + "logits/chosen": -20.3544864654541, + "logits/rejected": -20.054367065429688, + "logps/chosen": -562.2991943359375, + "logps/rejected": -694.3490600585938, + "loss": 0.4304, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8581453561782837, + "rewards/margins": 1.252414345741272, + "rewards/rejected": -3.1105597019195557, + "step": 218 + }, + { + "epoch": 0.437125748502994, + "grad_norm": 13.490425487755859, + "learning_rate": 3.468788966130257e-07, + "logits/chosen": -19.16351318359375, + "logits/rejected": -18.119314193725586, + "logps/chosen": -550.3583984375, + "logps/rejected": -596.575927734375, + "loss": 0.4787, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.73578679561615, + "rewards/margins": 0.7520574927330017, + "rewards/rejected": -2.487844467163086, + "step": 219 + }, + { + "epoch": 0.43912175648702595, + "grad_norm": 19.70339532784469, + "learning_rate": 3.4526759408756857e-07, + "logits/chosen": -18.91830825805664, + "logits/rejected": -19.164518356323242, + "logps/chosen": -669.055419921875, + "logps/rejected": -708.6876220703125, + "loss": 0.469, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.031832695007324, + "rewards/margins": 0.7691054940223694, + "rewards/rejected": -2.8009378910064697, + "step": 220 + }, + { + "epoch": 0.4411177644710579, + "grad_norm": 13.350681461714572, + "learning_rate": 3.43651648353978e-07, + "logits/chosen": -18.520729064941406, + "logits/rejected": -18.25364875793457, + "logps/chosen": -519.2975463867188, + "logps/rejected": -589.70263671875, + "loss": 0.4508, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2545933723449707, + "rewards/margins": 0.9282989501953125, + "rewards/rejected": -2.182892322540283, + "step": 221 + }, + { + "epoch": 0.4431137724550898, + "grad_norm": 12.985951445968679, + "learning_rate": 3.4203113817116953e-07, + "logits/chosen": -18.722192764282227, + "logits/rejected": -18.830791473388672, + "logps/chosen": -471.1470947265625, + "logps/rejected": -526.8262329101562, + "loss": 0.4195, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1359221935272217, + "rewards/margins": 0.6822535991668701, + "rewards/rejected": -1.8181757926940918, + "step": 222 + }, + { + "epoch": 0.44510978043912175, + "grad_norm": 25.299702854421888, + "learning_rate": 3.40406142520523e-07, + "logits/chosen": -19.63787078857422, + "logits/rejected": -19.73871421813965, + "logps/chosen": -336.0172424316406, + "logps/rejected": -433.79217529296875, + "loss": 0.4428, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9037049412727356, + "rewards/margins": 0.909833550453186, + "rewards/rejected": -1.8135385513305664, + "step": 223 + }, + { + "epoch": 0.4471057884231537, + "grad_norm": 13.981874950399485, + "learning_rate": 3.387767406020343e-07, + "logits/chosen": -20.34281349182129, + "logits/rejected": -20.043916702270508, + "logps/chosen": -561.2789916992188, + "logps/rejected": -721.3209838867188, + "loss": 0.4499, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8252278566360474, + "rewards/margins": 1.3606287240982056, + "rewards/rejected": -3.185856819152832, + "step": 224 + }, + { + "epoch": 0.4491017964071856, + "grad_norm": 13.226395290628279, + "learning_rate": 3.371430118304538e-07, + "logits/chosen": -19.773094177246094, + "logits/rejected": -19.672040939331055, + "logps/chosen": -587.82177734375, + "logps/rejected": -659.1246337890625, + "loss": 0.4739, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.080528736114502, + "rewards/margins": 0.47968238592147827, + "rewards/rejected": -2.560211181640625, + "step": 225 + }, + { + "epoch": 0.45109780439121755, + "grad_norm": 21.18297840943059, + "learning_rate": 3.355050358314172e-07, + "logits/chosen": -18.7939510345459, + "logits/rejected": -19.188337326049805, + "logps/chosen": -548.7808837890625, + "logps/rejected": -600.9039916992188, + "loss": 0.4465, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9692116975784302, + "rewards/margins": 0.954479992389679, + "rewards/rejected": -2.923691749572754, + "step": 226 + }, + { + "epoch": 0.4530938123752495, + "grad_norm": 16.166215514551812, + "learning_rate": 3.338628924375638e-07, + "logits/chosen": -20.488510131835938, + "logits/rejected": -20.845476150512695, + "logps/chosen": -427.76348876953125, + "logps/rejected": -595.1253051757812, + "loss": 0.4742, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2822277545928955, + "rewards/margins": 1.3352444171905518, + "rewards/rejected": -2.6174721717834473, + "step": 227 + }, + { + "epoch": 0.4550898203592814, + "grad_norm": 14.590880103789674, + "learning_rate": 3.322166616846458e-07, + "logits/chosen": -20.881481170654297, + "logits/rejected": -20.86432647705078, + "logps/chosen": -444.5021057128906, + "logps/rejected": -491.39251708984375, + "loss": 0.4437, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5504581928253174, + "rewards/margins": 0.4571852684020996, + "rewards/rejected": -2.007643699645996, + "step": 228 + }, + { + "epoch": 0.45708582834331335, + "grad_norm": 12.92186538925852, + "learning_rate": 3.305664238076278e-07, + "logits/chosen": -20.525470733642578, + "logits/rejected": -19.75753402709961, + "logps/chosen": -395.13714599609375, + "logps/rejected": -545.7861938476562, + "loss": 0.4846, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4842098951339722, + "rewards/margins": 1.059401512145996, + "rewards/rejected": -2.543611526489258, + "step": 229 + }, + { + "epoch": 0.4590818363273453, + "grad_norm": 14.842723434798721, + "learning_rate": 3.289122592367756e-07, + "logits/chosen": -19.929006576538086, + "logits/rejected": -19.978878021240234, + "logps/chosen": -541.9116821289062, + "logps/rejected": -634.8336181640625, + "loss": 0.4491, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7504769563674927, + "rewards/margins": 1.1156678199768066, + "rewards/rejected": -2.866144895553589, + "step": 230 + }, + { + "epoch": 0.46107784431137727, + "grad_norm": 14.4942978574615, + "learning_rate": 3.272542485937368e-07, + "logits/chosen": -19.461790084838867, + "logits/rejected": -19.085315704345703, + "logps/chosen": -499.6000061035156, + "logps/rejected": -576.95849609375, + "loss": 0.4646, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5696638822555542, + "rewards/margins": 0.9026892185211182, + "rewards/rejected": -2.472352981567383, + "step": 231 + }, + { + "epoch": 0.4630738522954092, + "grad_norm": 15.223995717750787, + "learning_rate": 3.2559247268761114e-07, + "logits/chosen": -18.554140090942383, + "logits/rejected": -18.188430786132812, + "logps/chosen": -400.5260009765625, + "logps/rejected": -457.10906982421875, + "loss": 0.4356, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.025871992111206, + "rewards/margins": 0.5790168642997742, + "rewards/rejected": -1.604888916015625, + "step": 232 + }, + { + "epoch": 0.46506986027944114, + "grad_norm": 13.813707701215902, + "learning_rate": 3.2392701251101167e-07, + "logits/chosen": -18.753156661987305, + "logits/rejected": -19.0374755859375, + "logps/chosen": -587.1029663085938, + "logps/rejected": -672.4597778320312, + "loss": 0.466, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9925709962844849, + "rewards/margins": 0.7200486660003662, + "rewards/rejected": -2.7126200199127197, + "step": 233 + }, + { + "epoch": 0.46706586826347307, + "grad_norm": 16.032336387205383, + "learning_rate": 3.222579492361179e-07, + "logits/chosen": -18.763916015625, + "logits/rejected": -19.015911102294922, + "logps/chosen": -566.2586059570312, + "logps/rejected": -674.83642578125, + "loss": 0.4563, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4943814277648926, + "rewards/margins": 1.242883563041687, + "rewards/rejected": -2.73726487159729, + "step": 234 + }, + { + "epoch": 0.469061876247505, + "grad_norm": 18.953609466395125, + "learning_rate": 3.2058536421071914e-07, + "logits/chosen": -18.818294525146484, + "logits/rejected": -18.358341217041016, + "logps/chosen": -554.9633178710938, + "logps/rejected": -607.111328125, + "loss": 0.5233, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9789049625396729, + "rewards/margins": 0.6192256212234497, + "rewards/rejected": -2.598130464553833, + "step": 235 + }, + { + "epoch": 0.47105788423153694, + "grad_norm": 16.02285470658808, + "learning_rate": 3.1890933895424976e-07, + "logits/chosen": -18.656795501708984, + "logits/rejected": -18.9710636138916, + "logps/chosen": -535.3847045898438, + "logps/rejected": -593.01708984375, + "loss": 0.5067, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8721699714660645, + "rewards/margins": 0.48781105875968933, + "rewards/rejected": -2.359980821609497, + "step": 236 + }, + { + "epoch": 0.47305389221556887, + "grad_norm": 14.053925538557477, + "learning_rate": 3.172299551538164e-07, + "logits/chosen": -18.687612533569336, + "logits/rejected": -18.866008758544922, + "logps/chosen": -703.1234130859375, + "logps/rejected": -821.7389526367188, + "loss": 0.4206, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.976850986480713, + "rewards/margins": 0.9709327220916748, + "rewards/rejected": -2.9477834701538086, + "step": 237 + }, + { + "epoch": 0.4750499001996008, + "grad_norm": 13.888076003211115, + "learning_rate": 3.155472946602162e-07, + "logits/chosen": -20.23816680908203, + "logits/rejected": -19.793060302734375, + "logps/chosen": -548.01025390625, + "logps/rejected": -643.1630249023438, + "loss": 0.4279, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6879640817642212, + "rewards/margins": 0.6913233399391174, + "rewards/rejected": -2.3792872428894043, + "step": 238 + }, + { + "epoch": 0.47704590818363274, + "grad_norm": 13.837763164300675, + "learning_rate": 3.1386143948394763e-07, + "logits/chosen": -20.161375045776367, + "logits/rejected": -20.032562255859375, + "logps/chosen": -454.22332763671875, + "logps/rejected": -519.0992431640625, + "loss": 0.4894, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2950642108917236, + "rewards/margins": 0.6701027750968933, + "rewards/rejected": -1.9651669263839722, + "step": 239 + }, + { + "epoch": 0.47904191616766467, + "grad_norm": 17.528357068059776, + "learning_rate": 3.121724717912138e-07, + "logits/chosen": -20.390954971313477, + "logits/rejected": -19.70508575439453, + "logps/chosen": -586.51708984375, + "logps/rejected": -618.0750732421875, + "loss": 0.4483, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9998843669891357, + "rewards/margins": 0.6734704375267029, + "rewards/rejected": -2.6733546257019043, + "step": 240 + }, + { + "epoch": 0.4810379241516966, + "grad_norm": 14.196704035292845, + "learning_rate": 3.104804738999169e-07, + "logits/chosen": -18.2707576751709, + "logits/rejected": -18.1250057220459, + "logps/chosen": -601.682861328125, + "logps/rejected": -642.5757446289062, + "loss": 0.4509, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0958642959594727, + "rewards/margins": 0.6518892049789429, + "rewards/rejected": -2.747753381729126, + "step": 241 + }, + { + "epoch": 0.48303393213572854, + "grad_norm": 16.89149711735207, + "learning_rate": 3.087855282756475e-07, + "logits/chosen": -19.373920440673828, + "logits/rejected": -19.25455093383789, + "logps/chosen": -558.441650390625, + "logps/rejected": -685.5426635742188, + "loss": 0.4812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7334048748016357, + "rewards/margins": 1.1145367622375488, + "rewards/rejected": -2.8479413986206055, + "step": 242 + }, + { + "epoch": 0.48502994011976047, + "grad_norm": 17.843513648960016, + "learning_rate": 3.0708771752766395e-07, + "logits/chosen": -20.650711059570312, + "logits/rejected": -19.873567581176758, + "logps/chosen": -551.3946533203125, + "logps/rejected": -695.5105590820312, + "loss": 0.4222, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.690913200378418, + "rewards/margins": 1.3804028034210205, + "rewards/rejected": -3.0713162422180176, + "step": 243 + }, + { + "epoch": 0.4870259481037924, + "grad_norm": 14.705398106965339, + "learning_rate": 3.053871244048669e-07, + "logits/chosen": -19.685924530029297, + "logits/rejected": -20.568681716918945, + "logps/chosen": -460.550537109375, + "logps/rejected": -524.4147338867188, + "loss": 0.4803, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4355939626693726, + "rewards/margins": 0.5603222846984863, + "rewards/rejected": -1.9959162473678589, + "step": 244 + }, + { + "epoch": 0.48902195608782434, + "grad_norm": 15.46176286439623, + "learning_rate": 3.036838317917658e-07, + "logits/chosen": -18.501953125, + "logits/rejected": -19.013107299804688, + "logps/chosen": -574.2833251953125, + "logps/rejected": -710.7667236328125, + "loss": 0.4301, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5793968439102173, + "rewards/margins": 1.549849033355713, + "rewards/rejected": -3.1292459964752197, + "step": 245 + }, + { + "epoch": 0.49101796407185627, + "grad_norm": 15.399821902790524, + "learning_rate": 3.0197792270443976e-07, + "logits/chosen": -20.143672943115234, + "logits/rejected": -19.312414169311523, + "logps/chosen": -465.90557861328125, + "logps/rejected": -530.7208251953125, + "loss": 0.4609, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.567827820777893, + "rewards/margins": 0.7017680406570435, + "rewards/rejected": -2.2695958614349365, + "step": 246 + }, + { + "epoch": 0.4930139720558882, + "grad_norm": 13.365354324692614, + "learning_rate": 3.002694802864912e-07, + "logits/chosen": -19.2493896484375, + "logits/rejected": -19.1527099609375, + "logps/chosen": -371.802490234375, + "logps/rejected": -477.4065246582031, + "loss": 0.414, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1133661270141602, + "rewards/margins": 0.8846907019615173, + "rewards/rejected": -1.9980566501617432, + "step": 247 + }, + { + "epoch": 0.49500998003992014, + "grad_norm": 13.171743969163947, + "learning_rate": 2.98558587804993e-07, + "logits/chosen": -20.076560974121094, + "logits/rejected": -20.700763702392578, + "logps/chosen": -525.328125, + "logps/rejected": -625.5068359375, + "loss": 0.3772, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.616844654083252, + "rewards/margins": 1.0423572063446045, + "rewards/rejected": -2.6592018604278564, + "step": 248 + }, + { + "epoch": 0.49700598802395207, + "grad_norm": 13.46509740936646, + "learning_rate": 2.968453286464312e-07, + "logits/chosen": -20.32817840576172, + "logits/rejected": -19.739133834838867, + "logps/chosen": -538.3781127929688, + "logps/rejected": -731.992431640625, + "loss": 0.3985, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7721741199493408, + "rewards/margins": 1.9263598918914795, + "rewards/rejected": -3.6985340118408203, + "step": 249 + }, + { + "epoch": 0.499001996007984, + "grad_norm": 13.647026896565563, + "learning_rate": 2.9512978631264e-07, + "logits/chosen": -19.286752700805664, + "logits/rejected": -19.01239013671875, + "logps/chosen": -533.054931640625, + "logps/rejected": -629.1688232421875, + "loss": 0.4277, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2967782020568848, + "rewards/margins": 1.074138879776001, + "rewards/rejected": -2.3709170818328857, + "step": 250 + }, + { + "epoch": 0.500998003992016, + "grad_norm": 12.433462674366762, + "learning_rate": 2.934120444167326e-07, + "logits/chosen": -19.06900978088379, + "logits/rejected": -19.819610595703125, + "logps/chosen": -459.6295166015625, + "logps/rejected": -622.0855102539062, + "loss": 0.4119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2915412187576294, + "rewards/margins": 1.6004011631011963, + "rewards/rejected": -2.8919425010681152, + "step": 251 + }, + { + "epoch": 0.5029940119760479, + "grad_norm": 13.330997971956275, + "learning_rate": 2.916921866790256e-07, + "logits/chosen": -19.483095169067383, + "logits/rejected": -19.165523529052734, + "logps/chosen": -636.4345703125, + "logps/rejected": -667.2681884765625, + "loss": 0.4478, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.135575294494629, + "rewards/margins": 0.516217827796936, + "rewards/rejected": -2.6517930030822754, + "step": 252 + }, + { + "epoch": 0.5049900199600799, + "grad_norm": 12.284544480667769, + "learning_rate": 2.899702969229587e-07, + "logits/chosen": -19.874605178833008, + "logits/rejected": -20.1215763092041, + "logps/chosen": -472.7792663574219, + "logps/rejected": -599.262939453125, + "loss": 0.3955, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4192883968353271, + "rewards/margins": 1.275083065032959, + "rewards/rejected": -2.694371461868286, + "step": 253 + }, + { + "epoch": 0.5069860279441117, + "grad_norm": 15.336351403927143, + "learning_rate": 2.8824645907100955e-07, + "logits/chosen": -20.512561798095703, + "logits/rejected": -19.61855697631836, + "logps/chosen": -571.308349609375, + "logps/rejected": -582.1721801757812, + "loss": 0.4667, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6224905252456665, + "rewards/margins": 0.5911039710044861, + "rewards/rejected": -2.213594436645508, + "step": 254 + }, + { + "epoch": 0.5089820359281437, + "grad_norm": 14.878520589776237, + "learning_rate": 2.865207571406029e-07, + "logits/chosen": -19.37316131591797, + "logits/rejected": -19.89968490600586, + "logps/chosen": -524.258056640625, + "logps/rejected": -798.5974731445312, + "loss": 0.4355, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7529133558273315, + "rewards/margins": 2.135753870010376, + "rewards/rejected": -3.888667345046997, + "step": 255 + }, + { + "epoch": 0.5109780439121756, + "grad_norm": 16.354875819925365, + "learning_rate": 2.8479327524001633e-07, + "logits/chosen": -20.016483306884766, + "logits/rejected": -20.36672019958496, + "logps/chosen": -499.89056396484375, + "logps/rejected": -636.91943359375, + "loss": 0.4246, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5554924011230469, + "rewards/margins": 1.422688364982605, + "rewards/rejected": -2.9781811237335205, + "step": 256 + }, + { + "epoch": 0.5129740518962076, + "grad_norm": 20.065788599542866, + "learning_rate": 2.830640975642806e-07, + "logits/chosen": -21.746936798095703, + "logits/rejected": -20.40896987915039, + "logps/chosen": -654.2992553710938, + "logps/rejected": -704.3209838867188, + "loss": 0.4569, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4051055908203125, + "rewards/margins": 0.7097451090812683, + "rewards/rejected": -3.1148507595062256, + "step": 257 + }, + { + "epoch": 0.5149700598802395, + "grad_norm": 15.50568263353288, + "learning_rate": 2.8133330839107604e-07, + "logits/chosen": -19.940874099731445, + "logits/rejected": -19.60299301147461, + "logps/chosen": -545.5746459960938, + "logps/rejected": -667.3504638671875, + "loss": 0.4183, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9626977443695068, + "rewards/margins": 1.1021788120269775, + "rewards/rejected": -3.0648765563964844, + "step": 258 + }, + { + "epoch": 0.5169660678642715, + "grad_norm": 12.835594737631492, + "learning_rate": 2.796009920766253e-07, + "logits/chosen": -19.640548706054688, + "logits/rejected": -19.253395080566406, + "logps/chosen": -555.4740600585938, + "logps/rejected": -731.1700439453125, + "loss": 0.4308, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6881296634674072, + "rewards/margins": 1.8761582374572754, + "rewards/rejected": -3.5642879009246826, + "step": 259 + }, + { + "epoch": 0.5189620758483033, + "grad_norm": 13.23490536958255, + "learning_rate": 2.7786723305158135e-07, + "logits/chosen": -19.513334274291992, + "logits/rejected": -19.781173706054688, + "logps/chosen": -438.3570556640625, + "logps/rejected": -544.1898193359375, + "loss": 0.3835, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.510940432548523, + "rewards/margins": 0.972527027130127, + "rewards/rejected": -2.4834673404693604, + "step": 260 + }, + { + "epoch": 0.5209580838323353, + "grad_norm": 14.163744381342129, + "learning_rate": 2.761321158169134e-07, + "logits/chosen": -19.43235206604004, + "logits/rejected": -19.761728286743164, + "logps/chosen": -659.931396484375, + "logps/rejected": -784.9754028320312, + "loss": 0.4556, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1872830390930176, + "rewards/margins": 1.1087684631347656, + "rewards/rejected": -3.296051502227783, + "step": 261 + }, + { + "epoch": 0.5229540918163673, + "grad_norm": 15.369097445080758, + "learning_rate": 2.7439572493978737e-07, + "logits/chosen": -20.19454002380371, + "logits/rejected": -20.246570587158203, + "logps/chosen": -498.49896240234375, + "logps/rejected": -600.5203247070312, + "loss": 0.4218, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4001764059066772, + "rewards/margins": 1.2271099090576172, + "rewards/rejected": -2.627286434173584, + "step": 262 + }, + { + "epoch": 0.5249500998003992, + "grad_norm": 13.540074391202866, + "learning_rate": 2.726581450494451e-07, + "logits/chosen": -19.287321090698242, + "logits/rejected": -19.06431770324707, + "logps/chosen": -585.2114868164062, + "logps/rejected": -710.250732421875, + "loss": 0.4398, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.719393014907837, + "rewards/margins": 1.5113741159439087, + "rewards/rejected": -3.230767250061035, + "step": 263 + }, + { + "epoch": 0.5269461077844312, + "grad_norm": 16.14836459040014, + "learning_rate": 2.709194608330789e-07, + "logits/chosen": -19.105375289916992, + "logits/rejected": -19.293088912963867, + "logps/chosen": -679.0133056640625, + "logps/rejected": -880.2947998046875, + "loss": 0.4793, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0702013969421387, + "rewards/margins": 1.9545456171035767, + "rewards/rejected": -4.024746894836426, + "step": 264 + }, + { + "epoch": 0.5289421157684631, + "grad_norm": 13.93014612652964, + "learning_rate": 2.6917975703170465e-07, + "logits/chosen": -20.631940841674805, + "logits/rejected": -20.239978790283203, + "logps/chosen": -551.091064453125, + "logps/rejected": -643.3578491210938, + "loss": 0.4321, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8941808938980103, + "rewards/margins": 1.0745964050292969, + "rewards/rejected": -2.9687774181365967, + "step": 265 + }, + { + "epoch": 0.530938123752495, + "grad_norm": 13.940773373889629, + "learning_rate": 2.674391184360313e-07, + "logits/chosen": -20.835819244384766, + "logits/rejected": -20.55775260925293, + "logps/chosen": -579.5352783203125, + "logps/rejected": -668.495361328125, + "loss": 0.4576, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4945411682128906, + "rewards/margins": 0.9422330856323242, + "rewards/rejected": -3.436774730682373, + "step": 266 + }, + { + "epoch": 0.5329341317365269, + "grad_norm": 15.347566297152046, + "learning_rate": 2.6569762988232837e-07, + "logits/chosen": -20.884387969970703, + "logits/rejected": -20.510915756225586, + "logps/chosen": -554.3668823242188, + "logps/rejected": -760.4378051757812, + "loss": 0.4118, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9943656921386719, + "rewards/margins": 1.6108964681625366, + "rewards/rejected": -3.605262041091919, + "step": 267 + }, + { + "epoch": 0.5349301397205589, + "grad_norm": 15.083964912398525, + "learning_rate": 2.63955376248291e-07, + "logits/chosen": -20.903093338012695, + "logits/rejected": -20.814924240112305, + "logps/chosen": -468.0397644042969, + "logps/rejected": -557.53173828125, + "loss": 0.4452, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.621910572052002, + "rewards/margins": 0.8261189460754395, + "rewards/rejected": -2.4480295181274414, + "step": 268 + }, + { + "epoch": 0.5369261477045908, + "grad_norm": 21.070254941679767, + "learning_rate": 2.6221244244890336e-07, + "logits/chosen": -19.996227264404297, + "logits/rejected": -19.785118103027344, + "logps/chosen": -508.9876708984375, + "logps/rejected": -666.5833740234375, + "loss": 0.5038, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7276027202606201, + "rewards/margins": 1.1843074560165405, + "rewards/rejected": -2.91191029548645, + "step": 269 + }, + { + "epoch": 0.5389221556886228, + "grad_norm": 13.987780368181504, + "learning_rate": 2.6046891343229986e-07, + "logits/chosen": -19.805574417114258, + "logits/rejected": -20.00739288330078, + "logps/chosen": -516.5226440429688, + "logps/rejected": -636.6571044921875, + "loss": 0.4289, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.874422311782837, + "rewards/margins": 1.1742898225784302, + "rewards/rejected": -3.0487117767333984, + "step": 270 + }, + { + "epoch": 0.5409181636726547, + "grad_norm": 16.160524902073316, + "learning_rate": 2.5872487417562527e-07, + "logits/chosen": -18.61543083190918, + "logits/rejected": -19.55666732788086, + "logps/chosen": -656.6258544921875, + "logps/rejected": -729.110595703125, + "loss": 0.4508, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.253310203552246, + "rewards/margins": 1.0833779573440552, + "rewards/rejected": -3.33668851852417, + "step": 271 + }, + { + "epoch": 0.5429141716566867, + "grad_norm": 16.13794066326181, + "learning_rate": 2.569804096808922e-07, + "logits/chosen": -19.252918243408203, + "logits/rejected": -19.608856201171875, + "logps/chosen": -544.688232421875, + "logps/rejected": -677.6744384765625, + "loss": 0.4402, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7599573135375977, + "rewards/margins": 1.116705060005188, + "rewards/rejected": -2.876662492752075, + "step": 272 + }, + { + "epoch": 0.5449101796407185, + "grad_norm": 15.251714021976639, + "learning_rate": 2.5523560497083924e-07, + "logits/chosen": -19.35736083984375, + "logits/rejected": -19.229040145874023, + "logps/chosen": -520.7670288085938, + "logps/rejected": -589.5751953125, + "loss": 0.4691, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5088708400726318, + "rewards/margins": 0.7264382839202881, + "rewards/rejected": -2.23530912399292, + "step": 273 + }, + { + "epoch": 0.5469061876247505, + "grad_norm": 14.735102764186708, + "learning_rate": 2.5349054508478635e-07, + "logits/chosen": -20.558042526245117, + "logits/rejected": -20.376361846923828, + "logps/chosen": -581.2306518554688, + "logps/rejected": -683.8033447265625, + "loss": 0.4404, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7012970447540283, + "rewards/margins": 1.082260012626648, + "rewards/rejected": -2.783557176589966, + "step": 274 + }, + { + "epoch": 0.5489021956087824, + "grad_norm": 15.97902029283524, + "learning_rate": 2.5174531507449037e-07, + "logits/chosen": -19.68293571472168, + "logits/rejected": -19.427963256835938, + "logps/chosen": -575.7290649414062, + "logps/rejected": -604.75634765625, + "loss": 0.4362, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.912692666053772, + "rewards/margins": 0.48665523529052734, + "rewards/rejected": -2.3993477821350098, + "step": 275 + }, + { + "epoch": 0.5508982035928144, + "grad_norm": 13.384588758075404, + "learning_rate": 2.5e-07, + "logits/chosen": -19.830730438232422, + "logits/rejected": -20.07389259338379, + "logps/chosen": -524.1917114257812, + "logps/rejected": -626.604736328125, + "loss": 0.4024, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6113872528076172, + "rewards/margins": 1.069969654083252, + "rewards/rejected": -2.681356906890869, + "step": 276 + }, + { + "epoch": 0.5528942115768463, + "grad_norm": 14.356201234303862, + "learning_rate": 2.482546849255096e-07, + "logits/chosen": -20.20763397216797, + "logits/rejected": -19.939579010009766, + "logps/chosen": -454.4245300292969, + "logps/rejected": -588.796630859375, + "loss": 0.4057, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4646457433700562, + "rewards/margins": 1.3196232318878174, + "rewards/rejected": -2.784268617630005, + "step": 277 + }, + { + "epoch": 0.5548902195608783, + "grad_norm": 14.969828094442494, + "learning_rate": 2.465094549152137e-07, + "logits/chosen": -19.216341018676758, + "logits/rejected": -19.786149978637695, + "logps/chosen": -622.8614501953125, + "logps/rejected": -837.9845581054688, + "loss": 0.3914, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7940454483032227, + "rewards/margins": 1.845615267753601, + "rewards/rejected": -3.6396608352661133, + "step": 278 + }, + { + "epoch": 0.5568862275449101, + "grad_norm": 14.687976893363183, + "learning_rate": 2.447643950291608e-07, + "logits/chosen": -18.467819213867188, + "logits/rejected": -18.3898983001709, + "logps/chosen": -511.738037109375, + "logps/rejected": -543.1464233398438, + "loss": 0.4176, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5452011823654175, + "rewards/margins": 0.6293449997901917, + "rewards/rejected": -2.174546241760254, + "step": 279 + }, + { + "epoch": 0.5588822355289421, + "grad_norm": 13.84652065752045, + "learning_rate": 2.430195903191078e-07, + "logits/chosen": -19.583114624023438, + "logits/rejected": -19.34761619567871, + "logps/chosen": -517.0689697265625, + "logps/rejected": -610.4008178710938, + "loss": 0.4747, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4040230512619019, + "rewards/margins": 1.110020399093628, + "rewards/rejected": -2.5140433311462402, + "step": 280 + }, + { + "epoch": 0.5608782435129741, + "grad_norm": 16.303319683710235, + "learning_rate": 2.412751258243748e-07, + "logits/chosen": -19.688095092773438, + "logits/rejected": -19.690338134765625, + "logps/chosen": -671.603271484375, + "logps/rejected": -790.10205078125, + "loss": 0.4407, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3799023628234863, + "rewards/margins": 0.8681031465530396, + "rewards/rejected": -3.2480056285858154, + "step": 281 + }, + { + "epoch": 0.562874251497006, + "grad_norm": 14.15798233649853, + "learning_rate": 2.395310865677001e-07, + "logits/chosen": -19.338281631469727, + "logits/rejected": -18.604310989379883, + "logps/chosen": -572.2766723632812, + "logps/rejected": -635.2128295898438, + "loss": 0.4257, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1379201412200928, + "rewards/margins": 0.7467140555381775, + "rewards/rejected": -2.884634017944336, + "step": 282 + }, + { + "epoch": 0.564870259481038, + "grad_norm": 40.962955478565384, + "learning_rate": 2.3778755755109667e-07, + "logits/chosen": -19.42969512939453, + "logits/rejected": -19.07590675354004, + "logps/chosen": -734.955810546875, + "logps/rejected": -692.2105102539062, + "loss": 0.5513, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.987337350845337, + "rewards/margins": -0.13384895026683807, + "rewards/rejected": -2.8534886837005615, + "step": 283 + }, + { + "epoch": 0.5668662674650699, + "grad_norm": 13.413480487115413, + "learning_rate": 2.3604462375170903e-07, + "logits/chosen": -20.070852279663086, + "logits/rejected": -20.444026947021484, + "logps/chosen": -625.17236328125, + "logps/rejected": -736.5753173828125, + "loss": 0.3945, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.200939655303955, + "rewards/margins": 1.2015022039413452, + "rewards/rejected": -3.4024417400360107, + "step": 284 + }, + { + "epoch": 0.5688622754491018, + "grad_norm": 15.606238042142097, + "learning_rate": 2.3430237011767164e-07, + "logits/chosen": -20.74646759033203, + "logits/rejected": -20.663923263549805, + "logps/chosen": -483.0484313964844, + "logps/rejected": -695.172119140625, + "loss": 0.4272, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.707973599433899, + "rewards/margins": 1.6547538042068481, + "rewards/rejected": -3.362727165222168, + "step": 285 + }, + { + "epoch": 0.5708582834331337, + "grad_norm": 14.723616978428565, + "learning_rate": 2.3256088156396868e-07, + "logits/chosen": -20.511024475097656, + "logits/rejected": -20.550968170166016, + "logps/chosen": -386.5881042480469, + "logps/rejected": -672.30224609375, + "loss": 0.4103, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3118516206741333, + "rewards/margins": 2.176816940307617, + "rewards/rejected": -3.488668918609619, + "step": 286 + }, + { + "epoch": 0.5728542914171657, + "grad_norm": 17.104594605209122, + "learning_rate": 2.3082024296829532e-07, + "logits/chosen": -20.359363555908203, + "logits/rejected": -19.974714279174805, + "logps/chosen": -381.4059143066406, + "logps/rejected": -490.9525146484375, + "loss": 0.4721, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4457169771194458, + "rewards/margins": 0.9019349813461304, + "rewards/rejected": -2.347651958465576, + "step": 287 + }, + { + "epoch": 0.5748502994011976, + "grad_norm": 12.348244088976356, + "learning_rate": 2.2908053916692116e-07, + "logits/chosen": -19.375926971435547, + "logits/rejected": -19.011024475097656, + "logps/chosen": -460.38519287109375, + "logps/rejected": -664.692626953125, + "loss": 0.3529, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5625884532928467, + "rewards/margins": 1.8966760635375977, + "rewards/rejected": -3.4592647552490234, + "step": 288 + }, + { + "epoch": 0.5768463073852296, + "grad_norm": 18.43492666732753, + "learning_rate": 2.2734185495055498e-07, + "logits/chosen": -19.75726318359375, + "logits/rejected": -20.084196090698242, + "logps/chosen": -486.993408203125, + "logps/rejected": -599.3756103515625, + "loss": 0.4596, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9165252447128296, + "rewards/margins": 0.9002697467803955, + "rewards/rejected": -2.8167953491210938, + "step": 289 + }, + { + "epoch": 0.5788423153692615, + "grad_norm": 21.054407486526834, + "learning_rate": 2.2560427506021264e-07, + "logits/chosen": -21.07491111755371, + "logits/rejected": -20.739715576171875, + "logps/chosen": -507.4473571777344, + "logps/rejected": -605.4733276367188, + "loss": 0.4921, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7558261156082153, + "rewards/margins": 0.8664282560348511, + "rewards/rejected": -2.6222543716430664, + "step": 290 + }, + { + "epoch": 0.5808383233532934, + "grad_norm": 14.716476253084066, + "learning_rate": 2.2386788418308665e-07, + "logits/chosen": -19.136695861816406, + "logits/rejected": -19.36062240600586, + "logps/chosen": -466.1701354980469, + "logps/rejected": -597.2568359375, + "loss": 0.4432, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9274959564208984, + "rewards/margins": 0.9277093410491943, + "rewards/rejected": -2.8552052974700928, + "step": 291 + }, + { + "epoch": 0.5828343313373253, + "grad_norm": 14.446885493365698, + "learning_rate": 2.2213276694841865e-07, + "logits/chosen": -19.835556030273438, + "logits/rejected": -19.720577239990234, + "logps/chosen": -451.08441162109375, + "logps/rejected": -530.003662109375, + "loss": 0.4187, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7320913076400757, + "rewards/margins": 0.7514263987541199, + "rewards/rejected": -2.483517646789551, + "step": 292 + }, + { + "epoch": 0.5848303393213573, + "grad_norm": 17.330269076521912, + "learning_rate": 2.2039900792337474e-07, + "logits/chosen": -21.39358901977539, + "logits/rejected": -21.413705825805664, + "logps/chosen": -463.1463623046875, + "logps/rejected": -583.657470703125, + "loss": 0.4415, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7131471633911133, + "rewards/margins": 1.0683386325836182, + "rewards/rejected": -2.7814857959747314, + "step": 293 + }, + { + "epoch": 0.5868263473053892, + "grad_norm": 14.007129468785923, + "learning_rate": 2.1866669160892389e-07, + "logits/chosen": -18.738121032714844, + "logits/rejected": -19.75026512145996, + "logps/chosen": -575.805419921875, + "logps/rejected": -824.0968017578125, + "loss": 0.4254, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4536679983139038, + "rewards/margins": 2.2131245136260986, + "rewards/rejected": -3.666792392730713, + "step": 294 + }, + { + "epoch": 0.5888223552894212, + "grad_norm": 16.60108694572921, + "learning_rate": 2.1693590243571935e-07, + "logits/chosen": -19.9157657623291, + "logits/rejected": -20.169784545898438, + "logps/chosen": -573.23046875, + "logps/rejected": -665.9788818359375, + "loss": 0.3684, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9368046522140503, + "rewards/margins": 0.7889972925186157, + "rewards/rejected": -2.725801706314087, + "step": 295 + }, + { + "epoch": 0.590818363273453, + "grad_norm": 14.64139947604633, + "learning_rate": 2.152067247599837e-07, + "logits/chosen": -20.16104507446289, + "logits/rejected": -20.16781997680664, + "logps/chosen": -652.9393920898438, + "logps/rejected": -674.0302734375, + "loss": 0.4555, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5257301330566406, + "rewards/margins": 0.5331373810768127, + "rewards/rejected": -3.0588674545288086, + "step": 296 + }, + { + "epoch": 0.592814371257485, + "grad_norm": 15.25905759535438, + "learning_rate": 2.1347924285939712e-07, + "logits/chosen": -20.704172134399414, + "logits/rejected": -19.96450424194336, + "logps/chosen": -507.278564453125, + "logps/rejected": -505.9412841796875, + "loss": 0.4724, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6988550424575806, + "rewards/margins": 0.4168252944946289, + "rewards/rejected": -2.115680456161499, + "step": 297 + }, + { + "epoch": 0.5948103792415169, + "grad_norm": 19.931215357752848, + "learning_rate": 2.117535409289905e-07, + "logits/chosen": -21.177928924560547, + "logits/rejected": -21.209348678588867, + "logps/chosen": -413.62567138671875, + "logps/rejected": -724.2362060546875, + "loss": 0.4562, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4525749683380127, + "rewards/margins": 2.526947259902954, + "rewards/rejected": -3.979522228240967, + "step": 298 + }, + { + "epoch": 0.5968063872255489, + "grad_norm": 15.466070557361297, + "learning_rate": 2.100297030770413e-07, + "logits/chosen": -20.977407455444336, + "logits/rejected": -20.545534133911133, + "logps/chosen": -453.90838623046875, + "logps/rejected": -543.0181884765625, + "loss": 0.4821, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7299668788909912, + "rewards/margins": 0.792263388633728, + "rewards/rejected": -2.5222301483154297, + "step": 299 + }, + { + "epoch": 0.5988023952095808, + "grad_norm": 15.538644854464293, + "learning_rate": 2.0830781332097445e-07, + "logits/chosen": -20.28766441345215, + "logits/rejected": -19.859888076782227, + "logps/chosen": -495.1536560058594, + "logps/rejected": -636.5488891601562, + "loss": 0.4248, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4328924417495728, + "rewards/margins": 1.6545376777648926, + "rewards/rejected": -3.087430000305176, + "step": 300 + }, + { + "epoch": 0.6007984031936128, + "grad_norm": 15.125688011588869, + "learning_rate": 2.065879555832674e-07, + "logits/chosen": -19.8198299407959, + "logits/rejected": -19.781383514404297, + "logps/chosen": -614.9022216796875, + "logps/rejected": -763.259033203125, + "loss": 0.3997, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6993727684020996, + "rewards/margins": 1.1797199249267578, + "rewards/rejected": -3.8790926933288574, + "step": 301 + }, + { + "epoch": 0.6027944111776448, + "grad_norm": 16.044735764949653, + "learning_rate": 2.0487021368736002e-07, + "logits/chosen": -19.610458374023438, + "logits/rejected": -19.683618545532227, + "logps/chosen": -607.406005859375, + "logps/rejected": -767.1768798828125, + "loss": 0.421, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3819403648376465, + "rewards/margins": 1.3107446432113647, + "rewards/rejected": -3.692685127258301, + "step": 302 + }, + { + "epoch": 0.6047904191616766, + "grad_norm": 23.962926461003274, + "learning_rate": 2.0315467135356878e-07, + "logits/chosen": -20.334712982177734, + "logits/rejected": -20.212602615356445, + "logps/chosen": -501.9886779785156, + "logps/rejected": -551.6432495117188, + "loss": 0.4439, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7299950122833252, + "rewards/margins": 0.6993128061294556, + "rewards/rejected": -2.4293079376220703, + "step": 303 + } + ], + "logging_steps": 1, + "max_steps": 501, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 101, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}