diff --git "a/checkpoint-404/trainer_state.json" "b/checkpoint-404/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-404/trainer_state.json" @@ -0,0 +1,6093 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8063872255489022, + "eval_steps": 500, + "global_step": 404, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001996007984031936, + "grad_norm": 8.273702760320834, + "learning_rate": 9.803921568627451e-09, + "logits/chosen": -15.345624923706055, + "logits/rejected": -15.43127727508545, + "logps/chosen": -309.0523376464844, + "logps/rejected": -315.7975769042969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.003992015968063872, + "grad_norm": 9.403401939105745, + "learning_rate": 1.9607843137254902e-08, + "logits/chosen": -15.736159324645996, + "logits/rejected": -15.511228561401367, + "logps/chosen": -276.1156311035156, + "logps/rejected": -319.82891845703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.005988023952095809, + "grad_norm": 7.417684586141953, + "learning_rate": 2.941176470588235e-08, + "logits/chosen": -14.069502830505371, + "logits/rejected": -14.952973365783691, + "logps/chosen": -327.70660400390625, + "logps/rejected": -324.401611328125, + "loss": 0.6935, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0006795646040700376, + "rewards/margins": 0.00012883666204288602, + "rewards/rejected": 0.0005507277674041688, + "step": 3 + }, + { + "epoch": 0.007984031936127744, + "grad_norm": 8.870177799116133, + "learning_rate": 3.9215686274509804e-08, + "logits/chosen": -14.754829406738281, + "logits/rejected": -14.156275749206543, + "logps/chosen": -405.5284423828125, + "logps/rejected": -507.5711669921875, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00030017876997590065, + "rewards/margins": -0.00034380401484668255, + "rewards/rejected": 4.362566687632352e-05, + "step": 4 + }, + { + "epoch": 0.00998003992015968, + "grad_norm": 8.607915464157758, + "learning_rate": 4.901960784313725e-08, + "logits/chosen": -15.998005867004395, + "logits/rejected": -15.419865608215332, + "logps/chosen": -334.4444580078125, + "logps/rejected": -347.6990051269531, + "loss": 0.6925, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.003209929447621107, + "rewards/margins": 0.006283964961767197, + "rewards/rejected": -0.0030740355141460896, + "step": 5 + }, + { + "epoch": 0.011976047904191617, + "grad_norm": 7.828043502488399, + "learning_rate": 5.88235294117647e-08, + "logits/chosen": -15.476319313049316, + "logits/rejected": -15.254171371459961, + "logps/chosen": -315.6751708984375, + "logps/rejected": -321.661376953125, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004277873318642378, + "rewards/margins": 0.0027148674707859755, + "rewards/rejected": 0.0015630058478564024, + "step": 6 + }, + { + "epoch": 0.013972055888223553, + "grad_norm": 8.411076253189734, + "learning_rate": 6.862745098039216e-08, + "logits/chosen": -15.430569648742676, + "logits/rejected": -15.730286598205566, + "logps/chosen": -329.1186218261719, + "logps/rejected": -333.16375732421875, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0016522119985893369, + "rewards/margins": 
0.0027996539138257504, + "rewards/rejected": -0.004451866261661053, + "step": 7 + }, + { + "epoch": 0.015968063872255488, + "grad_norm": 8.071033025816464, + "learning_rate": 7.843137254901961e-08, + "logits/chosen": -14.63811206817627, + "logits/rejected": -15.146449089050293, + "logps/chosen": -418.5869445800781, + "logps/rejected": -409.2070007324219, + "loss": 0.6926, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0014343691291287541, + "rewards/margins": 0.0001833915594033897, + "rewards/rejected": -0.001617760630324483, + "step": 8 + }, + { + "epoch": 0.017964071856287425, + "grad_norm": 8.296615844679023, + "learning_rate": 8.823529411764706e-08, + "logits/chosen": -16.07230567932129, + "logits/rejected": -15.337064743041992, + "logps/chosen": -500.545166015625, + "logps/rejected": -516.0908813476562, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.000825481372885406, + "rewards/margins": -0.00035296427085995674, + "rewards/rejected": 0.0011784456437453628, + "step": 9 + }, + { + "epoch": 0.01996007984031936, + "grad_norm": 7.947789076143285, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -15.586200714111328, + "logits/rejected": -15.603667259216309, + "logps/chosen": -385.4709777832031, + "logps/rejected": -413.04498291015625, + "loss": 0.6927, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0015043115708976984, + "rewards/margins": -0.0028634597547352314, + "rewards/rejected": 0.001359148183837533, + "step": 10 + }, + { + "epoch": 0.021956087824351298, + "grad_norm": 8.347411190779997, + "learning_rate": 1.0784313725490195e-07, + "logits/chosen": -14.763092994689941, + "logits/rejected": -14.29030990600586, + "logps/chosen": -450.0237731933594, + "logps/rejected": -507.61370849609375, + "loss": 0.6935, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0023990964982658625, + "rewards/margins": -0.003815202508121729, + "rewards/rejected": 0.00141610624268651, + "step": 11 + }, + { + "epoch": 0.023952095808383235, + "grad_norm": 9.064316975676492, + "learning_rate": 1.176470588235294e-07, + "logits/chosen": -15.072341918945312, + "logits/rejected": -14.772862434387207, + "logps/chosen": -335.04266357421875, + "logps/rejected": -371.4832763671875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0030230379197746515, + "rewards/margins": -0.005290627479553223, + "rewards/rejected": 0.0022675893269479275, + "step": 12 + }, + { + "epoch": 0.02594810379241517, + "grad_norm": 7.725863389166651, + "learning_rate": 1.2745098039215685e-07, + "logits/chosen": -16.316631317138672, + "logits/rejected": -16.06757164001465, + "logps/chosen": -419.7452087402344, + "logps/rejected": -390.98773193359375, + "loss": 0.6936, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.001233673538081348, + "rewards/margins": -0.0001909491838887334, + "rewards/rejected": -0.0010427236557006836, + "step": 13 + }, + { + "epoch": 0.027944111776447105, + "grad_norm": 8.42732610297517, + "learning_rate": 1.3725490196078432e-07, + "logits/chosen": -15.35225772857666, + "logits/rejected": -15.612798690795898, + "logps/chosen": -328.98046875, + "logps/rejected": -415.5600280761719, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.001978826941922307, + "rewards/margins": 0.00040232675382867455, + "rewards/rejected": 0.001576499780640006, + "step": 14 + }, + { + "epoch": 0.029940119760479042, + "grad_norm": 7.731931977749619, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": -16.70879554748535, + "logits/rejected": 
-16.097858428955078, + "logps/chosen": -316.15423583984375, + "logps/rejected": -323.2774353027344, + "loss": 0.6928, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.003133001271635294, + "rewards/margins": -0.0018952846294268966, + "rewards/rejected": -0.0012377167586237192, + "step": 15 + }, + { + "epoch": 0.031936127744510975, + "grad_norm": 7.9669032254132315, + "learning_rate": 1.5686274509803921e-07, + "logits/chosen": -13.231587409973145, + "logits/rejected": -13.3031005859375, + "logps/chosen": -337.7759704589844, + "logps/rejected": -322.94390869140625, + "loss": 0.6933, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0010202788980677724, + "rewards/margins": 0.0009024335886351764, + "rewards/rejected": 0.00011784554226323962, + "step": 16 + }, + { + "epoch": 0.033932135728542916, + "grad_norm": 7.6237992365862075, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -14.528105735778809, + "logits/rejected": -14.738598823547363, + "logps/chosen": -241.29954528808594, + "logps/rejected": -247.58921813964844, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0020066355355083942, + "rewards/margins": 0.0035532282199710608, + "rewards/rejected": -0.0015465925680473447, + "step": 17 + }, + { + "epoch": 0.03592814371257485, + "grad_norm": 11.366099973012018, + "learning_rate": 1.764705882352941e-07, + "logits/chosen": -15.85158920288086, + "logits/rejected": -15.455657958984375, + "logps/chosen": -346.03350830078125, + "logps/rejected": -324.2177734375, + "loss": 0.6933, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0006118249148130417, + "rewards/margins": -0.0007706499891355634, + "rewards/rejected": 0.00015882489969953895, + "step": 18 + }, + { + "epoch": 0.03792415169660679, + "grad_norm": 7.942061772626445, + "learning_rate": 1.8627450980392158e-07, + "logits/chosen": -14.716241836547852, + "logits/rejected": -14.732061386108398, + "logps/chosen": -283.9400939941406, + "logps/rejected": -269.4703674316406, + "loss": 0.6934, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.00019351489027030766, + "rewards/margins": -0.0023846100084483624, + "rewards/rejected": 0.002191095147281885, + "step": 19 + }, + { + "epoch": 0.03992015968063872, + "grad_norm": 8.146329480525193, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": -13.448728561401367, + "logits/rejected": -13.313655853271484, + "logps/chosen": -375.7843933105469, + "logps/rejected": -317.9215393066406, + "loss": 0.6928, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0009911824017763138, + "rewards/margins": -0.0024417974054813385, + "rewards/rejected": 0.001450614770874381, + "step": 20 + }, + { + "epoch": 0.041916167664670656, + "grad_norm": 8.55196124746064, + "learning_rate": 2.0588235294117645e-07, + "logits/chosen": -15.470209121704102, + "logits/rejected": -15.217321395874023, + "logps/chosen": -299.0631103515625, + "logps/rejected": -327.6717529296875, + "loss": 0.6924, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0023876859340816736, + "rewards/margins": -0.0039983270689845085, + "rewards/rejected": 0.006386012304574251, + "step": 21 + }, + { + "epoch": 0.043912175648702596, + "grad_norm": 8.466171545750429, + "learning_rate": 2.156862745098039e-07, + "logits/chosen": -16.77044677734375, + "logits/rejected": -16.241891860961914, + "logps/chosen": -277.0028076171875, + "logps/rejected": -275.6781005859375, + "loss": 0.6915, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006396574899554253, + "rewards/margins": 0.0012766977306455374, + 
"rewards/rejected": 0.005119876936078072, + "step": 22 + }, + { + "epoch": 0.04590818363273453, + "grad_norm": 8.402876138940963, + "learning_rate": 2.2549019607843137e-07, + "logits/chosen": -15.568875312805176, + "logits/rejected": -15.177936553955078, + "logps/chosen": -430.4210510253906, + "logps/rejected": -396.56219482421875, + "loss": 0.6921, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0003438423154875636, + "rewards/margins": 0.0015764283016324043, + "rewards/rejected": -0.001232585753314197, + "step": 23 + }, + { + "epoch": 0.04790419161676647, + "grad_norm": 12.52829981591039, + "learning_rate": 2.352941176470588e-07, + "logits/chosen": -14.164735794067383, + "logits/rejected": -14.398335456848145, + "logps/chosen": -396.947265625, + "logps/rejected": -385.9594421386719, + "loss": 0.6912, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0060534426011145115, + "rewards/margins": 0.0018496987177059054, + "rewards/rejected": 0.004203743766993284, + "step": 24 + }, + { + "epoch": 0.0499001996007984, + "grad_norm": 8.318983318997933, + "learning_rate": 2.4509803921568627e-07, + "logits/chosen": -15.94021987915039, + "logits/rejected": -15.565300941467285, + "logps/chosen": -312.4967346191406, + "logps/rejected": -291.0953369140625, + "loss": 0.6917, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.008275885134935379, + "rewards/margins": 0.006061806343495846, + "rewards/rejected": 0.0022140787914395332, + "step": 25 + }, + { + "epoch": 0.05189620758483034, + "grad_norm": 8.103398791916263, + "learning_rate": 2.549019607843137e-07, + "logits/chosen": -14.540434837341309, + "logits/rejected": -14.473610877990723, + "logps/chosen": -353.2845153808594, + "logps/rejected": -374.34490966796875, + "loss": 0.6915, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.006215238478034735, + "rewards/margins": -4.699244163930416e-05, + "rewards/rejected": 0.006262229755520821, + "step": 26 + }, + { + "epoch": 0.05389221556886228, + "grad_norm": 7.722965287876503, + "learning_rate": 2.6470588235294114e-07, + "logits/chosen": -16.8541202545166, + "logits/rejected": -16.772695541381836, + "logps/chosen": -333.2151184082031, + "logps/rejected": -381.6868591308594, + "loss": 0.6918, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.011300897225737572, + "rewards/margins": 0.00862040463835001, + "rewards/rejected": 0.002680492587387562, + "step": 27 + }, + { + "epoch": 0.05588822355289421, + "grad_norm": 8.21241123134308, + "learning_rate": 2.7450980392156863e-07, + "logits/chosen": -14.656830787658691, + "logits/rejected": -15.223196983337402, + "logps/chosen": -384.7855529785156, + "logps/rejected": -390.5248718261719, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005614032968878746, + "rewards/margins": 0.0015372277703136206, + "rewards/rejected": 0.004076804965734482, + "step": 28 + }, + { + "epoch": 0.05788423153692615, + "grad_norm": 8.15008749253195, + "learning_rate": 2.8431372549019607e-07, + "logits/chosen": -15.4053955078125, + "logits/rejected": -15.084259986877441, + "logps/chosen": -397.54937744140625, + "logps/rejected": -373.31109619140625, + "loss": 0.6908, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.009957370348274708, + "rewards/margins": 0.0023600957356393337, + "rewards/rejected": 0.007597275078296661, + "step": 29 + }, + { + "epoch": 0.059880239520958084, + "grad_norm": 8.125634119871668, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -14.482078552246094, + "logits/rejected": -14.186015129089355, + 
"logps/chosen": -271.9766845703125, + "logps/rejected": -284.2981262207031, + "loss": 0.6909, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.011308508925139904, + "rewards/margins": 0.004111303482204676, + "rewards/rejected": 0.007197204511612654, + "step": 30 + }, + { + "epoch": 0.06187624750499002, + "grad_norm": 7.748089103692524, + "learning_rate": 3.0392156862745094e-07, + "logits/chosen": -15.912099838256836, + "logits/rejected": -15.93221664428711, + "logps/chosen": -304.4377136230469, + "logps/rejected": -315.7114562988281, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02117222733795643, + "rewards/margins": 0.006583967246115208, + "rewards/rejected": 0.01458826009184122, + "step": 31 + }, + { + "epoch": 0.06387225548902195, + "grad_norm": 8.060112893776262, + "learning_rate": 3.1372549019607843e-07, + "logits/chosen": -15.711540222167969, + "logits/rejected": -15.569220542907715, + "logps/chosen": -403.3907470703125, + "logps/rejected": -386.66754150390625, + "loss": 0.6901, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0031930492259562016, + "rewards/margins": -0.0004316616104915738, + "rewards/rejected": 0.0036247102543711662, + "step": 32 + }, + { + "epoch": 0.0658682634730539, + "grad_norm": 8.131844895571634, + "learning_rate": 3.2352941176470586e-07, + "logits/chosen": -14.937063217163086, + "logits/rejected": -15.226661682128906, + "logps/chosen": -452.5379638671875, + "logps/rejected": -454.41009521484375, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016051730141043663, + "rewards/margins": 0.009525422938168049, + "rewards/rejected": 0.006526308599859476, + "step": 33 + }, + { + "epoch": 0.06786427145708583, + "grad_norm": 8.419849559119973, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -15.381107330322266, + "logits/rejected": -15.284709930419922, + "logps/chosen": -401.2351989746094, + "logps/rejected": -390.9241027832031, + "loss": 0.6886, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.024615010246634483, + "rewards/margins": 0.018835801631212234, + "rewards/rejected": 0.005779208615422249, + "step": 34 + }, + { + "epoch": 0.06986027944111776, + "grad_norm": 8.437180869369922, + "learning_rate": 3.431372549019608e-07, + "logits/chosen": -15.466768264770508, + "logits/rejected": -15.167000770568848, + "logps/chosen": -352.6540832519531, + "logps/rejected": -326.68682861328125, + "loss": 0.6875, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.02680317685008049, + "rewards/margins": 0.01718745194375515, + "rewards/rejected": 0.00961572676897049, + "step": 35 + }, + { + "epoch": 0.0718562874251497, + "grad_norm": 7.982258318061745, + "learning_rate": 3.529411764705882e-07, + "logits/chosen": -14.632706642150879, + "logits/rejected": -14.941289901733398, + "logps/chosen": -332.2666015625, + "logps/rejected": -354.9202575683594, + "loss": 0.6871, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.03227221965789795, + "rewards/margins": 0.024501098319888115, + "rewards/rejected": 0.007771119941025972, + "step": 36 + }, + { + "epoch": 0.07385229540918163, + "grad_norm": 8.683716423285436, + "learning_rate": 3.6274509803921566e-07, + "logits/chosen": -16.10747528076172, + "logits/rejected": -15.692405700683594, + "logps/chosen": -350.07916259765625, + "logps/rejected": -328.48040771484375, + "loss": 0.6879, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.026277001947164536, + "rewards/margins": 0.0028891037218272686, + "rewards/rejected": 0.023387901484966278, + "step": 37 + }, + { + 
"epoch": 0.07584830339321358, + "grad_norm": 8.263903944711704, + "learning_rate": 3.7254901960784315e-07, + "logits/chosen": -16.594873428344727, + "logits/rejected": -16.079730987548828, + "logps/chosen": -308.6328125, + "logps/rejected": -305.5195617675781, + "loss": 0.6857, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03257057070732117, + "rewards/margins": 0.01241181418299675, + "rewards/rejected": 0.020158756524324417, + "step": 38 + }, + { + "epoch": 0.07784431137724551, + "grad_norm": 8.511057638069643, + "learning_rate": 3.8235294117647053e-07, + "logits/chosen": -14.873018264770508, + "logits/rejected": -14.647686004638672, + "logps/chosen": -309.0533447265625, + "logps/rejected": -321.43011474609375, + "loss": 0.6828, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.036281879991292953, + "rewards/margins": 0.025011887773871422, + "rewards/rejected": 0.011269993148744106, + "step": 39 + }, + { + "epoch": 0.07984031936127745, + "grad_norm": 8.338517669067725, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": -15.299338340759277, + "logits/rejected": -15.163370132446289, + "logps/chosen": -302.93212890625, + "logps/rejected": -345.40545654296875, + "loss": 0.685, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.044032178819179535, + "rewards/margins": 0.024382634088397026, + "rewards/rejected": 0.01964954286813736, + "step": 40 + }, + { + "epoch": 0.08183632734530938, + "grad_norm": 8.07172349335598, + "learning_rate": 4.019607843137255e-07, + "logits/chosen": -15.369956016540527, + "logits/rejected": -15.433903694152832, + "logps/chosen": -409.83984375, + "logps/rejected": -412.54180908203125, + "loss": 0.6853, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.035033755004405975, + "rewards/margins": 0.01923045516014099, + "rewards/rejected": 0.015803297981619835, + "step": 41 + }, + { + "epoch": 0.08383233532934131, + "grad_norm": 7.920156576030204, + "learning_rate": 4.117647058823529e-07, + "logits/chosen": -14.02873420715332, + "logits/rejected": -13.986605644226074, + "logps/chosen": -354.6033935546875, + "logps/rejected": -359.84014892578125, + "loss": 0.6864, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.034912168979644775, + "rewards/margins": 0.009367440827190876, + "rewards/rejected": 0.025544727221131325, + "step": 42 + }, + { + "epoch": 0.08582834331337326, + "grad_norm": 8.232720229409079, + "learning_rate": 4.215686274509804e-07, + "logits/chosen": -15.009214401245117, + "logits/rejected": -15.307815551757812, + "logps/chosen": -380.186767578125, + "logps/rejected": -415.4902648925781, + "loss": 0.6858, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.018880976364016533, + "rewards/margins": 0.010581063106656075, + "rewards/rejected": 0.008299913257360458, + "step": 43 + }, + { + "epoch": 0.08782435129740519, + "grad_norm": 8.533444238200468, + "learning_rate": 4.313725490196078e-07, + "logits/chosen": -14.22335433959961, + "logits/rejected": -14.953871726989746, + "logps/chosen": -347.45574951171875, + "logps/rejected": -389.9525451660156, + "loss": 0.6835, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.01024315319955349, + "rewards/margins": -0.00465776864439249, + "rewards/rejected": 0.014900922775268555, + "step": 44 + }, + { + "epoch": 0.08982035928143713, + "grad_norm": 8.038118181406828, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": -14.101947784423828, + "logits/rejected": -13.89334774017334, + "logps/chosen": -317.0509033203125, + "logps/rejected": -323.69183349609375, + "loss": 0.6817, + 
"rewards/accuracies": 0.5625, + "rewards/chosen": 0.026917992159724236, + "rewards/margins": 0.008457997813820839, + "rewards/rejected": 0.018459992483258247, + "step": 45 + }, + { + "epoch": 0.09181636726546906, + "grad_norm": 8.149720896485583, + "learning_rate": 4.5098039215686274e-07, + "logits/chosen": -14.710037231445312, + "logits/rejected": -14.176923751831055, + "logps/chosen": -445.82379150390625, + "logps/rejected": -488.3534851074219, + "loss": 0.6809, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.023077696561813354, + "rewards/margins": 0.04250966012477875, + "rewards/rejected": -0.019431961700320244, + "step": 46 + }, + { + "epoch": 0.09381237524950099, + "grad_norm": 7.992010994151138, + "learning_rate": 4.6078431372549013e-07, + "logits/chosen": -15.482625961303711, + "logits/rejected": -14.013190269470215, + "logps/chosen": -367.063232421875, + "logps/rejected": -342.14471435546875, + "loss": 0.6774, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0394100584089756, + "rewards/margins": 0.02275776118040085, + "rewards/rejected": 0.016652297228574753, + "step": 47 + }, + { + "epoch": 0.09580838323353294, + "grad_norm": 7.756878381863422, + "learning_rate": 4.705882352941176e-07, + "logits/chosen": -15.568811416625977, + "logits/rejected": -15.423860549926758, + "logps/chosen": -433.0845031738281, + "logps/rejected": -439.37396240234375, + "loss": 0.6776, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0388474203646183, + "rewards/margins": 0.03113992139697075, + "rewards/rejected": 0.007707500830292702, + "step": 48 + }, + { + "epoch": 0.09780439121756487, + "grad_norm": 8.493689270125792, + "learning_rate": 4.803921568627451e-07, + "logits/chosen": -14.434328079223633, + "logits/rejected": -14.237007141113281, + "logps/chosen": -375.27874755859375, + "logps/rejected": -351.34783935546875, + "loss": 0.6807, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.031233904883265495, + "rewards/margins": 0.011667889542877674, + "rewards/rejected": 0.019566014409065247, + "step": 49 + }, + { + "epoch": 0.0998003992015968, + "grad_norm": 7.9896349499421735, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -15.668423652648926, + "logits/rejected": -15.05298900604248, + "logps/chosen": -307.7395935058594, + "logps/rejected": -320.8984375, + "loss": 0.6748, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06957457959651947, + "rewards/margins": 0.025473352521657944, + "rewards/rejected": 0.044101230800151825, + "step": 50 + }, + { + "epoch": 0.10179640718562874, + "grad_norm": 8.021742888341015, + "learning_rate": 5e-07, + "logits/chosen": -14.430184364318848, + "logits/rejected": -14.525110244750977, + "logps/chosen": -258.0986022949219, + "logps/rejected": -283.91607666015625, + "loss": 0.6743, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.04039158299565315, + "rewards/margins": 0.033271849155426025, + "rewards/rejected": 0.007119735702872276, + "step": 51 + }, + { + "epoch": 0.10379241516966067, + "grad_norm": 8.431429641964243, + "learning_rate": 4.999939076763486e-07, + "logits/chosen": -15.636438369750977, + "logits/rejected": -15.311294555664062, + "logps/chosen": -344.984619140625, + "logps/rejected": -323.0438232421875, + "loss": 0.6688, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.09328603744506836, + "rewards/margins": 0.04766825586557388, + "rewards/rejected": 0.04561777785420418, + "step": 52 + }, + { + "epoch": 0.10578842315369262, + "grad_norm": 8.486118751253972, + "learning_rate": 4.99975631002326e-07, + "logits/chosen": 
-15.692873001098633, + "logits/rejected": -15.621683120727539, + "logps/chosen": -294.3231201171875, + "logps/rejected": -324.92559814453125, + "loss": 0.6684, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.08665186166763306, + "rewards/margins": 0.0698593482375145, + "rewards/rejected": 0.016792509704828262, + "step": 53 + }, + { + "epoch": 0.10778443113772455, + "grad_norm": 8.584995513046005, + "learning_rate": 4.999451708687113e-07, + "logits/chosen": -13.536327362060547, + "logits/rejected": -14.043939590454102, + "logps/chosen": -316.51129150390625, + "logps/rejected": -340.39251708984375, + "loss": 0.6677, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04377901926636696, + "rewards/margins": 0.07115273177623749, + "rewards/rejected": -0.027373716235160828, + "step": 54 + }, + { + "epoch": 0.10978043912175649, + "grad_norm": 8.410801379788628, + "learning_rate": 4.999025287600885e-07, + "logits/chosen": -15.176254272460938, + "logits/rejected": -14.865735054016113, + "logps/chosen": -347.4294128417969, + "logps/rejected": -362.15496826171875, + "loss": 0.6672, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0652116909623146, + "rewards/margins": 0.05398234352469444, + "rewards/rejected": 0.01122935302555561, + "step": 55 + }, + { + "epoch": 0.11177644710578842, + "grad_norm": 9.191123619800617, + "learning_rate": 4.998477067547739e-07, + "logits/chosen": -14.366271018981934, + "logits/rejected": -13.491727828979492, + "logps/chosen": -294.74420166015625, + "logps/rejected": -313.5377502441406, + "loss": 0.6695, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08104420453310013, + "rewards/margins": 0.06518866121768951, + "rewards/rejected": 0.015855543315410614, + "step": 56 + }, + { + "epoch": 0.11377245508982035, + "grad_norm": 8.36989537276007, + "learning_rate": 4.997807075247145e-07, + "logits/chosen": -15.414962768554688, + "logits/rejected": -15.15987777709961, + "logps/chosen": -329.4682312011719, + "logps/rejected": -355.2568054199219, + "loss": 0.6699, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05671892687678337, + "rewards/margins": 0.05990144982933998, + "rewards/rejected": -0.0031825248152017593, + "step": 57 + }, + { + "epoch": 0.1157684630738523, + "grad_norm": 7.963033814703697, + "learning_rate": 4.997015343353585e-07, + "logits/chosen": -15.05243968963623, + "logits/rejected": -15.26134967803955, + "logps/chosen": -407.4922790527344, + "logps/rejected": -400.91033935546875, + "loss": 0.6667, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.044123757630586624, + "rewards/margins": 0.07534614205360413, + "rewards/rejected": -0.03122239001095295, + "step": 58 + }, + { + "epoch": 0.11776447105788423, + "grad_norm": 7.537729687084787, + "learning_rate": 4.996101910454953e-07, + "logits/chosen": -14.969869613647461, + "logits/rejected": -14.01164436340332, + "logps/chosen": -338.7287292480469, + "logps/rejected": -342.40234375, + "loss": 0.6676, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02124447375535965, + "rewards/margins": 0.0819094106554985, + "rewards/rejected": -0.060664933174848557, + "step": 59 + }, + { + "epoch": 0.11976047904191617, + "grad_norm": 9.279313627982006, + "learning_rate": 4.995066821070679e-07, + "logits/chosen": -13.303523063659668, + "logits/rejected": -14.182500839233398, + "logps/chosen": -362.7261047363281, + "logps/rejected": -333.9490051269531, + "loss": 0.6709, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.003293365240097046, + "rewards/margins": 0.09314459562301636, + 
"rewards/rejected": -0.08985123038291931, + "step": 60 + }, + { + "epoch": 0.1217564870259481, + "grad_norm": 8.252068285364153, + "learning_rate": 4.99391012564956e-07, + "logits/chosen": -17.23371124267578, + "logits/rejected": -16.242280960083008, + "logps/chosen": -367.806396484375, + "logps/rejected": -337.6153259277344, + "loss": 0.6547, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03489462658762932, + "rewards/margins": 0.1051090657711029, + "rewards/rejected": -0.07021445780992508, + "step": 61 + }, + { + "epoch": 0.12375249500998003, + "grad_norm": 8.15636533439587, + "learning_rate": 4.9926318805673e-07, + "logits/chosen": -15.824554443359375, + "logits/rejected": -15.663161277770996, + "logps/chosen": -282.3473815917969, + "logps/rejected": -311.18994140625, + "loss": 0.6597, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09248015284538269, + "rewards/margins": 0.09078236669301987, + "rewards/rejected": 0.0016977828927338123, + "step": 62 + }, + { + "epoch": 0.12574850299401197, + "grad_norm": 8.910924596109384, + "learning_rate": 4.991232148123761e-07, + "logits/chosen": -16.677001953125, + "logits/rejected": -16.39942741394043, + "logps/chosen": -460.5334167480469, + "logps/rejected": -422.064453125, + "loss": 0.6681, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06818778812885284, + "rewards/margins": 0.01459517702460289, + "rewards/rejected": -0.08278295397758484, + "step": 63 + }, + { + "epoch": 0.1277445109780439, + "grad_norm": 8.580401141003072, + "learning_rate": 4.989710996539925e-07, + "logits/chosen": -15.317991256713867, + "logits/rejected": -15.26541519165039, + "logps/chosen": -424.9542541503906, + "logps/rejected": -399.70989990234375, + "loss": 0.6564, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08943880349397659, + "rewards/margins": 0.0476187989115715, + "rewards/rejected": -0.1370576024055481, + "step": 64 + }, + { + "epoch": 0.12974051896207583, + "grad_norm": 8.429082566678625, + "learning_rate": 4.988068499954577e-07, + "logits/chosen": -16.077041625976562, + "logits/rejected": -15.868209838867188, + "logps/chosen": -316.7906494140625, + "logps/rejected": -335.0377197265625, + "loss": 0.6581, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.016306515783071518, + "rewards/margins": 0.07164555788040161, + "rewards/rejected": -0.08795207738876343, + "step": 65 + }, + { + "epoch": 0.1317365269461078, + "grad_norm": 8.133889266001447, + "learning_rate": 4.986304738420683e-07, + "logits/chosen": -15.085855484008789, + "logits/rejected": -14.784677505493164, + "logps/chosen": -300.3466796875, + "logps/rejected": -314.3912353515625, + "loss": 0.6578, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04595138877630234, + "rewards/margins": 0.06701233983039856, + "rewards/rejected": -0.021060939878225327, + "step": 66 + }, + { + "epoch": 0.13373253493013973, + "grad_norm": 8.721591720129195, + "learning_rate": 4.984419797901491e-07, + "logits/chosen": -15.171606063842773, + "logits/rejected": -15.278976440429688, + "logps/chosen": -465.72027587890625, + "logps/rejected": -483.4525146484375, + "loss": 0.6438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07418551295995712, + "rewards/margins": 0.11256247758865356, + "rewards/rejected": -0.18674799799919128, + "step": 67 + }, + { + "epoch": 0.13572854291417166, + "grad_norm": 8.689476143706786, + "learning_rate": 4.982413770266342e-07, + "logits/chosen": -16.235498428344727, + "logits/rejected": -15.38123893737793, + "logps/chosen": -363.73419189453125, + 
"logps/rejected": -350.5956726074219, + "loss": 0.6554, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09112317860126495, + "rewards/margins": 0.10634519904851913, + "rewards/rejected": -0.19746838510036469, + "step": 68 + }, + { + "epoch": 0.1377245508982036, + "grad_norm": 9.320906157232027, + "learning_rate": 4.980286753286194e-07, + "logits/chosen": -14.806767463684082, + "logits/rejected": -15.064220428466797, + "logps/chosen": -229.83612060546875, + "logps/rejected": -257.00848388671875, + "loss": 0.649, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08728949725627899, + "rewards/margins": 0.05588501691818237, + "rewards/rejected": -0.14317449927330017, + "step": 69 + }, + { + "epoch": 0.13972055888223553, + "grad_norm": 8.45599582084097, + "learning_rate": 4.978038850628853e-07, + "logits/chosen": -15.640983581542969, + "logits/rejected": -15.810898780822754, + "logps/chosen": -403.06884765625, + "logps/rejected": -411.408203125, + "loss": 0.6461, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07357379794120789, + "rewards/margins": 0.10018520057201385, + "rewards/rejected": -0.17375899851322174, + "step": 70 + }, + { + "epoch": 0.14171656686626746, + "grad_norm": 8.397438393746963, + "learning_rate": 4.975670171853925e-07, + "logits/chosen": -15.794336318969727, + "logits/rejected": -15.847979545593262, + "logps/chosen": -379.279296875, + "logps/rejected": -363.3298645019531, + "loss": 0.6501, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12070687115192413, + "rewards/margins": 0.0922447144985199, + "rewards/rejected": -0.21295160055160522, + "step": 71 + }, + { + "epoch": 0.1437125748502994, + "grad_norm": 8.604521034039575, + "learning_rate": 4.973180832407471e-07, + "logits/chosen": -14.604567527770996, + "logits/rejected": -14.695852279663086, + "logps/chosen": -345.739990234375, + "logps/rejected": -436.0314636230469, + "loss": 0.6454, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07075143605470657, + "rewards/margins": 0.16379314661026, + "rewards/rejected": -0.23454459011554718, + "step": 72 + }, + { + "epoch": 0.14570858283433133, + "grad_norm": 8.431953705553173, + "learning_rate": 4.970570953616382e-07, + "logits/chosen": -14.503868103027344, + "logits/rejected": -15.155458450317383, + "logps/chosen": -326.4110107421875, + "logps/rejected": -384.87322998046875, + "loss": 0.648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15915606915950775, + "rewards/margins": 0.1893479824066162, + "rewards/rejected": -0.34850406646728516, + "step": 73 + }, + { + "epoch": 0.14770459081836326, + "grad_norm": 8.881932037242414, + "learning_rate": 4.96784066268247e-07, + "logits/chosen": -13.779004096984863, + "logits/rejected": -13.429590225219727, + "logps/chosen": -291.79296875, + "logps/rejected": -296.4447937011719, + "loss": 0.643, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.17777323722839355, + "rewards/margins": 0.02108706906437874, + "rewards/rejected": -0.1988603174686432, + "step": 74 + }, + { + "epoch": 0.1497005988023952, + "grad_norm": 8.314122527140741, + "learning_rate": 4.964990092676262e-07, + "logits/chosen": -17.725143432617188, + "logits/rejected": -17.505762100219727, + "logps/chosen": -341.1331787109375, + "logps/rejected": -350.88116455078125, + "loss": 0.6369, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1543130725622177, + "rewards/margins": 0.12310568988323212, + "rewards/rejected": -0.27741876244544983, + "step": 75 + }, + { + "epoch": 0.15169660678642716, + "grad_norm": 8.48077117653257, + 
"learning_rate": 4.96201938253052e-07, + "logits/chosen": -16.73549461364746, + "logits/rejected": -16.362672805786133, + "logps/chosen": -395.6476135253906, + "logps/rejected": -469.1103515625, + "loss": 0.6252, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19351297616958618, + "rewards/margins": 0.18316538631916046, + "rewards/rejected": -0.37667837738990784, + "step": 76 + }, + { + "epoch": 0.1536926147704591, + "grad_norm": 8.393604155470431, + "learning_rate": 4.958928677033465e-07, + "logits/chosen": -15.707889556884766, + "logits/rejected": -15.537942886352539, + "logps/chosen": -281.0392761230469, + "logps/rejected": -319.1485900878906, + "loss": 0.629, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06948637962341309, + "rewards/margins": 0.21688398718833923, + "rewards/rejected": -0.28637033700942993, + "step": 77 + }, + { + "epoch": 0.15568862275449102, + "grad_norm": 9.067089889668104, + "learning_rate": 4.955718126821722e-07, + "logits/chosen": -16.561952590942383, + "logits/rejected": -15.844078063964844, + "logps/chosen": -364.40185546875, + "logps/rejected": -347.0330505371094, + "loss": 0.628, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.16794858872890472, + "rewards/margins": 0.05846191197633743, + "rewards/rejected": -0.22641049325466156, + "step": 78 + }, + { + "epoch": 0.15768463073852296, + "grad_norm": 8.940066268140832, + "learning_rate": 4.952387888372978e-07, + "logits/chosen": -15.177964210510254, + "logits/rejected": -15.126307487487793, + "logps/chosen": -411.2934265136719, + "logps/rejected": -388.7569274902344, + "loss": 0.6447, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2629951238632202, + "rewards/margins": 0.11192844063043594, + "rewards/rejected": -0.37492355704307556, + "step": 79 + }, + { + "epoch": 0.1596806387225549, + "grad_norm": 8.869063942219517, + "learning_rate": 4.94893812399836e-07, + "logits/chosen": -15.601551055908203, + "logits/rejected": -16.042139053344727, + "logps/chosen": -344.1158752441406, + "logps/rejected": -439.0846862792969, + "loss": 0.6223, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.15164640545845032, + "rewards/margins": 0.31529542803764343, + "rewards/rejected": -0.46694183349609375, + "step": 80 + }, + { + "epoch": 0.16167664670658682, + "grad_norm": 9.779541683117705, + "learning_rate": 4.945369001834514e-07, + "logits/chosen": -16.39380645751953, + "logits/rejected": -15.366002082824707, + "logps/chosen": -427.91278076171875, + "logps/rejected": -421.6066589355469, + "loss": 0.6294, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22397363185882568, + "rewards/margins": 0.07985258847475052, + "rewards/rejected": -0.3038262128829956, + "step": 81 + }, + { + "epoch": 0.16367265469061876, + "grad_norm": 9.681655379739604, + "learning_rate": 4.941680695835419e-07, + "logits/chosen": -16.988998413085938, + "logits/rejected": -16.312742233276367, + "logps/chosen": -392.65618896484375, + "logps/rejected": -409.4969787597656, + "loss": 0.6472, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3562784492969513, + "rewards/margins": 0.05219919979572296, + "rewards/rejected": -0.40847766399383545, + "step": 82 + }, + { + "epoch": 0.1656686626746507, + "grad_norm": 9.018644132426164, + "learning_rate": 4.937873385763907e-07, + "logits/chosen": -18.049556732177734, + "logits/rejected": -16.927623748779297, + "logps/chosen": -329.4657287597656, + "logps/rejected": -301.9615783691406, + "loss": 0.617, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3110809922218323, + 
"rewards/margins": -0.030765339732170105, + "rewards/rejected": -0.280315637588501, + "step": 83 + }, + { + "epoch": 0.16766467065868262, + "grad_norm": 9.744502049109531, + "learning_rate": 4.9339472571829e-07, + "logits/chosen": -16.83614730834961, + "logits/rejected": -15.979333877563477, + "logps/chosen": -302.4591979980469, + "logps/rejected": -308.2034606933594, + "loss": 0.6372, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029211895540356636, + "rewards/margins": 0.14211447536945343, + "rewards/rejected": -0.17132636904716492, + "step": 84 + }, + { + "epoch": 0.16966067864271456, + "grad_norm": 10.062959151859104, + "learning_rate": 4.929902501446366e-07, + "logits/chosen": -16.791458129882812, + "logits/rejected": -16.60369300842285, + "logps/chosen": -301.0384521484375, + "logps/rejected": -344.17340087890625, + "loss": 0.6229, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.21959218382835388, + "rewards/margins": 0.10680700838565826, + "rewards/rejected": -0.32639920711517334, + "step": 85 + }, + { + "epoch": 0.17165668662674652, + "grad_norm": 8.899211999721798, + "learning_rate": 4.925739315689991e-07, + "logits/chosen": -17.578712463378906, + "logits/rejected": -17.100988388061523, + "logps/chosen": -392.40399169921875, + "logps/rejected": -408.375244140625, + "loss": 0.5926, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3149687647819519, + "rewards/margins": 0.22621186077594757, + "rewards/rejected": -0.5411806106567383, + "step": 86 + }, + { + "epoch": 0.17365269461077845, + "grad_norm": 9.058158084220052, + "learning_rate": 4.921457902821578e-07, + "logits/chosen": -18.370519638061523, + "logits/rejected": -18.149328231811523, + "logps/chosen": -506.34832763671875, + "logps/rejected": -465.40228271484375, + "loss": 0.6228, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5572055578231812, + "rewards/margins": 0.24167928099632263, + "rewards/rejected": -0.7988848686218262, + "step": 87 + }, + { + "epoch": 0.17564870259481039, + "grad_norm": 9.924143893065036, + "learning_rate": 4.917058471511148e-07, + "logits/chosen": -17.789403915405273, + "logits/rejected": -17.73256492614746, + "logps/chosen": -473.0888977050781, + "logps/rejected": -505.4393310546875, + "loss": 0.622, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.562219500541687, + "rewards/margins": 0.14176039397716522, + "rewards/rejected": -0.7039799690246582, + "step": 88 + }, + { + "epoch": 0.17764471057884232, + "grad_norm": 9.455855889835503, + "learning_rate": 4.912541236180778e-07, + "logits/chosen": -15.555730819702148, + "logits/rejected": -16.45915985107422, + "logps/chosen": -359.06304931640625, + "logps/rejected": -448.340087890625, + "loss": 0.6132, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1561770737171173, + "rewards/margins": 0.43770644068717957, + "rewards/rejected": -0.5938835144042969, + "step": 89 + }, + { + "epoch": 0.17964071856287425, + "grad_norm": 9.198885333259362, + "learning_rate": 4.907906416994145e-07, + "logits/chosen": -16.712560653686523, + "logits/rejected": -15.981279373168945, + "logps/chosen": -379.13067626953125, + "logps/rejected": -461.3275451660156, + "loss": 0.621, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15404972434043884, + "rewards/margins": 0.31564387679100037, + "rewards/rejected": -0.4696936309337616, + "step": 90 + }, + { + "epoch": 0.18163672654690619, + "grad_norm": 9.490922115703798, + "learning_rate": 4.903154239845797e-07, + "logits/chosen": -16.743335723876953, + "logits/rejected": -16.130474090576172, 
+ "logps/chosen": -375.4258728027344, + "logps/rejected": -382.15667724609375, + "loss": 0.6204, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.46173471212387085, + "rewards/margins": 0.16935935616493225, + "rewards/rejected": -0.6310940980911255, + "step": 91 + }, + { + "epoch": 0.18363273453093812, + "grad_norm": 9.888840866130433, + "learning_rate": 4.898284936350143e-07, + "logits/chosen": -15.539291381835938, + "logits/rejected": -15.749015808105469, + "logps/chosen": -375.1903991699219, + "logps/rejected": -397.3068542480469, + "loss": 0.6055, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3286465108394623, + "rewards/margins": 0.2511574327945709, + "rewards/rejected": -0.5798039436340332, + "step": 92 + }, + { + "epoch": 0.18562874251497005, + "grad_norm": 9.534547072466589, + "learning_rate": 4.893298743830167e-07, + "logits/chosen": -17.950407028198242, + "logits/rejected": -17.723539352416992, + "logps/chosen": -570.85693359375, + "logps/rejected": -546.0053100585938, + "loss": 0.6082, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6157274842262268, + "rewards/margins": 0.40511518716812134, + "rewards/rejected": -1.0208425521850586, + "step": 93 + }, + { + "epoch": 0.18762475049900199, + "grad_norm": 9.722881859804074, + "learning_rate": 4.888195905305859e-07, + "logits/chosen": -17.128742218017578, + "logits/rejected": -16.86199951171875, + "logps/chosen": -364.7801513671875, + "logps/rejected": -412.03985595703125, + "loss": 0.6212, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35856327414512634, + "rewards/margins": 0.058156758546829224, + "rewards/rejected": -0.4167200028896332, + "step": 94 + }, + { + "epoch": 0.18962075848303392, + "grad_norm": 9.687485216578887, + "learning_rate": 4.882976669482367e-07, + "logits/chosen": -16.509841918945312, + "logits/rejected": -17.33835220336914, + "logps/chosen": -401.19586181640625, + "logps/rejected": -437.3283386230469, + "loss": 0.6115, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.44053059816360474, + "rewards/margins": 0.5870952606201172, + "rewards/rejected": -1.0276257991790771, + "step": 95 + }, + { + "epoch": 0.19161676646706588, + "grad_norm": 9.298831520670507, + "learning_rate": 4.877641290737883e-07, + "logits/chosen": -16.204208374023438, + "logits/rejected": -16.041362762451172, + "logps/chosen": -377.23699951171875, + "logps/rejected": -422.1312561035156, + "loss": 0.594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2721441388130188, + "rewards/margins": 0.31349363923072815, + "rewards/rejected": -0.5856378078460693, + "step": 96 + }, + { + "epoch": 0.1936127744510978, + "grad_norm": 11.267444543343759, + "learning_rate": 4.872190029111241e-07, + "logits/chosen": -17.558420181274414, + "logits/rejected": -17.034278869628906, + "logps/chosen": -502.9958190917969, + "logps/rejected": -536.4631958007812, + "loss": 0.593, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6253632307052612, + "rewards/margins": 0.29862356185913086, + "rewards/rejected": -0.9239866733551025, + "step": 97 + }, + { + "epoch": 0.19560878243512975, + "grad_norm": 9.47144132073207, + "learning_rate": 4.866623150289241e-07, + "logits/chosen": -17.21906852722168, + "logits/rejected": -16.40659523010254, + "logps/chosen": -308.1177062988281, + "logps/rejected": -365.7485656738281, + "loss": 0.5896, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20060190558433533, + "rewards/margins": 0.40862858295440674, + "rewards/rejected": -0.6092304587364197, + "step": 98 + }, + { + "epoch": 
0.19760479041916168, + "grad_norm": 11.742965565854453, + "learning_rate": 4.860940925593702e-07, + "logits/chosen": -18.529356002807617, + "logits/rejected": -18.07916259765625, + "logps/chosen": -302.5202331542969, + "logps/rejected": -296.69964599609375, + "loss": 0.566, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18639332056045532, + "rewards/margins": 0.23985722661018372, + "rewards/rejected": -0.42625054717063904, + "step": 99 + }, + { + "epoch": 0.1996007984031936, + "grad_norm": 9.838131303202186, + "learning_rate": 4.855143631968242e-07, + "logits/chosen": -16.533798217773438, + "logits/rejected": -17.409114837646484, + "logps/chosen": -452.28173828125, + "logps/rejected": -523.7857055664062, + "loss": 0.576, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.42187410593032837, + "rewards/margins": 0.3787775933742523, + "rewards/rejected": -0.8006517887115479, + "step": 100 + }, + { + "epoch": 0.20159680638722555, + "grad_norm": 9.784365965262033, + "learning_rate": 4.849231551964771e-07, + "logits/chosen": -17.89838218688965, + "logits/rejected": -17.24924087524414, + "logps/chosen": -389.7870178222656, + "logps/rejected": -441.84857177734375, + "loss": 0.6049, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5550696849822998, + "rewards/margins": 0.31255972385406494, + "rewards/rejected": -0.86762934923172, + "step": 101 + }, + { + "epoch": 0.20359281437125748, + "grad_norm": 10.254231166703345, + "learning_rate": 4.843204973729728e-07, + "logits/chosen": -17.67897605895996, + "logits/rejected": -16.87425994873047, + "logps/chosen": -350.6794128417969, + "logps/rejected": -377.69073486328125, + "loss": 0.5967, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4413577914237976, + "rewards/margins": 0.24204562604427338, + "rewards/rejected": -0.6834034323692322, + "step": 102 + }, + { + "epoch": 0.2055888223552894, + "grad_norm": 10.177731859208695, + "learning_rate": 4.837064190990036e-07, + "logits/chosen": -17.95716094970703, + "logits/rejected": -18.56848907470703, + "logps/chosen": -363.04754638671875, + "logps/rejected": -426.90582275390625, + "loss": 0.5939, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4591674506664276, + "rewards/margins": 0.3569161593914032, + "rewards/rejected": -0.8160836100578308, + "step": 103 + }, + { + "epoch": 0.20758483033932135, + "grad_norm": 11.665978228628967, + "learning_rate": 4.830809503038781e-07, + "logits/chosen": -18.042129516601562, + "logits/rejected": -18.134384155273438, + "logps/chosen": -371.7164611816406, + "logps/rejected": -375.3335876464844, + "loss": 0.6234, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6777195334434509, + "rewards/margins": 0.03881584107875824, + "rewards/rejected": -0.7165352702140808, + "step": 104 + }, + { + "epoch": 0.20958083832335328, + "grad_norm": 10.307421231401023, + "learning_rate": 4.824441214720628e-07, + "logits/chosen": -17.154827117919922, + "logits/rejected": -16.737272262573242, + "logps/chosen": -415.67706298828125, + "logps/rejected": -465.8774719238281, + "loss": 0.5871, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7462588548660278, + "rewards/margins": 0.246952623128891, + "rewards/rejected": -0.9932115077972412, + "step": 105 + }, + { + "epoch": 0.21157684630738524, + "grad_norm": 10.785145631272416, + "learning_rate": 4.817959636416969e-07, + "logits/chosen": -16.967071533203125, + "logits/rejected": -17.13130760192871, + "logps/chosen": -345.673095703125, + "logps/rejected": -378.43780517578125, + "loss": 0.5532, + 
"rewards/accuracies": 0.75, + "rewards/chosen": -0.4800865054130554, + "rewards/margins": 0.3331327438354492, + "rewards/rejected": -0.8132193684577942, + "step": 106 + }, + { + "epoch": 0.21357285429141717, + "grad_norm": 10.599087532304829, + "learning_rate": 4.811365084030783e-07, + "logits/chosen": -15.638040542602539, + "logits/rejected": -16.557126998901367, + "logps/chosen": -444.3518371582031, + "logps/rejected": -541.2994995117188, + "loss": 0.5506, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5941789150238037, + "rewards/margins": 0.6498495936393738, + "rewards/rejected": -1.2440284490585327, + "step": 107 + }, + { + "epoch": 0.2155688622754491, + "grad_norm": 11.345714476429055, + "learning_rate": 4.804657878971251e-07, + "logits/chosen": -17.96299934387207, + "logits/rejected": -18.472097396850586, + "logps/chosen": -431.6180114746094, + "logps/rejected": -497.6167297363281, + "loss": 0.5714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7720993757247925, + "rewards/margins": 0.20353072881698608, + "rewards/rejected": -0.9756300449371338, + "step": 108 + }, + { + "epoch": 0.21756487025948104, + "grad_norm": 10.98517018534807, + "learning_rate": 4.797838348138086e-07, + "logits/chosen": -16.523500442504883, + "logits/rejected": -16.56661605834961, + "logps/chosen": -433.0921325683594, + "logps/rejected": -492.59112548828125, + "loss": 0.5571, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8152545690536499, + "rewards/margins": 0.39452555775642395, + "rewards/rejected": -1.2097800970077515, + "step": 109 + }, + { + "epoch": 0.21956087824351297, + "grad_norm": 10.837354651005112, + "learning_rate": 4.790906823905599e-07, + "logits/chosen": -17.36256980895996, + "logits/rejected": -16.92915153503418, + "logps/chosen": -410.4229431152344, + "logps/rejected": -412.46435546875, + "loss": 0.5787, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5949603319168091, + "rewards/margins": 0.22644329071044922, + "rewards/rejected": -0.8214036226272583, + "step": 110 + }, + { + "epoch": 0.2215568862275449, + "grad_norm": 10.320787241206938, + "learning_rate": 4.783863644106502e-07, + "logits/chosen": -16.421478271484375, + "logits/rejected": -16.64776039123535, + "logps/chosen": -483.7657165527344, + "logps/rejected": -481.812744140625, + "loss": 0.5564, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6031550765037537, + "rewards/margins": 0.19912101328372955, + "rewards/rejected": -0.8022760152816772, + "step": 111 + }, + { + "epoch": 0.22355289421157684, + "grad_norm": 11.107038600485746, + "learning_rate": 4.776709152015442e-07, + "logits/chosen": -18.22406005859375, + "logits/rejected": -18.18329429626465, + "logps/chosen": -334.51348876953125, + "logps/rejected": -383.4963684082031, + "loss": 0.598, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5061721205711365, + "rewards/margins": 0.23278102278709412, + "rewards/rejected": -0.7389531135559082, + "step": 112 + }, + { + "epoch": 0.22554890219560877, + "grad_norm": 10.862434019726457, + "learning_rate": 4.769443696332272e-07, + "logits/chosen": -16.520822525024414, + "logits/rejected": -17.432579040527344, + "logps/chosen": -359.88568115234375, + "logps/rejected": -433.949462890625, + "loss": 0.5686, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4676639437675476, + "rewards/margins": 0.6773457527160645, + "rewards/rejected": -1.1450097560882568, + "step": 113 + }, + { + "epoch": 0.2275449101796407, + "grad_norm": 10.633885886185016, + "learning_rate": 4.762067631165049e-07, + 
"logits/chosen": -16.13068962097168, + "logits/rejected": -16.44469451904297, + "logps/chosen": -433.1568298339844, + "logps/rejected": -488.2666931152344, + "loss": 0.5095, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5838625431060791, + "rewards/margins": 0.51032954454422, + "rewards/rejected": -1.0941921472549438, + "step": 114 + }, + { + "epoch": 0.22954091816367264, + "grad_norm": 11.311189673193253, + "learning_rate": 4.7545813160127845e-07, + "logits/chosen": -16.38141632080078, + "logits/rejected": -17.378694534301758, + "logps/chosen": -542.2471313476562, + "logps/rejected": -644.93115234375, + "loss": 0.5264, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7417050004005432, + "rewards/margins": 1.1188050508499146, + "rewards/rejected": -1.860509991645813, + "step": 115 + }, + { + "epoch": 0.2315369261477046, + "grad_norm": 11.760352337564447, + "learning_rate": 4.746985115747917e-07, + "logits/chosen": -17.631946563720703, + "logits/rejected": -18.053573608398438, + "logps/chosen": -447.10943603515625, + "logps/rejected": -478.54083251953125, + "loss": 0.5718, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8603832125663757, + "rewards/margins": 0.30809980630874634, + "rewards/rejected": -1.168483018875122, + "step": 116 + }, + { + "epoch": 0.23353293413173654, + "grad_norm": 13.092392132476672, + "learning_rate": 4.739279400598532e-07, + "logits/chosen": -17.92958641052246, + "logits/rejected": -17.64777946472168, + "logps/chosen": -566.686279296875, + "logps/rejected": -626.3263549804688, + "loss": 0.5499, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8065224885940552, + "rewards/margins": 0.33996978402137756, + "rewards/rejected": -1.1464921236038208, + "step": 117 + }, + { + "epoch": 0.23552894211576847, + "grad_norm": 11.695233651191389, + "learning_rate": 4.731464546130314e-07, + "logits/chosen": -18.524166107177734, + "logits/rejected": -17.927783966064453, + "logps/chosen": -480.1812744140625, + "logps/rejected": -493.7731628417969, + "loss": 0.5755, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9772423505783081, + "rewards/margins": 0.1880900263786316, + "rewards/rejected": -1.1653324365615845, + "step": 118 + }, + { + "epoch": 0.2375249500998004, + "grad_norm": 14.981531373880904, + "learning_rate": 4.7235409332282436e-07, + "logits/chosen": -18.190269470214844, + "logits/rejected": -18.184982299804688, + "logps/chosen": -418.0079345703125, + "logps/rejected": -410.58599853515625, + "loss": 0.5681, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8476977944374084, + "rewards/margins": 0.24571648240089417, + "rewards/rejected": -1.093414306640625, + "step": 119 + }, + { + "epoch": 0.23952095808383234, + "grad_norm": 12.8327990641794, + "learning_rate": 4.7155089480780365e-07, + "logits/chosen": -17.30999183654785, + "logits/rejected": -17.51451301574707, + "logps/chosen": -473.12420654296875, + "logps/rejected": -552.434814453125, + "loss": 0.5624, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9688943028450012, + "rewards/margins": 0.5823672413825989, + "rewards/rejected": -1.5512614250183105, + "step": 120 + }, + { + "epoch": 0.24151696606786427, + "grad_norm": 12.280307782693178, + "learning_rate": 4.707368982147317e-07, + "logits/chosen": -17.097469329833984, + "logits/rejected": -16.843915939331055, + "logps/chosen": -452.419189453125, + "logps/rejected": -520.343505859375, + "loss": 0.505, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6861757040023804, + "rewards/margins": 0.5018269419670105, + 
"rewards/rejected": -1.188002586364746, + "step": 121 + }, + { + "epoch": 0.2435129740518962, + "grad_norm": 12.97976538848426, + "learning_rate": 4.6991214321665414e-07, + "logits/chosen": -17.63509750366211, + "logits/rejected": -17.495311737060547, + "logps/chosen": -431.20465087890625, + "logps/rejected": -465.6344299316406, + "loss": 0.5491, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7622246742248535, + "rewards/margins": 0.2962070107460022, + "rewards/rejected": -1.0584317445755005, + "step": 122 + }, + { + "epoch": 0.24550898203592814, + "grad_norm": 11.496195773328525, + "learning_rate": 4.6907667001096585e-07, + "logits/chosen": -17.77838897705078, + "logits/rejected": -18.234045028686523, + "logps/chosen": -474.1757507324219, + "logps/rejected": -698.40478515625, + "loss": 0.5397, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8501319289207458, + "rewards/margins": 1.1107265949249268, + "rewards/rejected": -1.9608584642410278, + "step": 123 + }, + { + "epoch": 0.24750499001996007, + "grad_norm": 12.44934209996877, + "learning_rate": 4.6823051931745237e-07, + "logits/chosen": -18.704727172851562, + "logits/rejected": -18.47835350036621, + "logps/chosen": -354.03076171875, + "logps/rejected": -454.5083312988281, + "loss": 0.5607, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5832939147949219, + "rewards/margins": 0.8197604417800903, + "rewards/rejected": -1.4030543565750122, + "step": 124 + }, + { + "epoch": 0.249500998003992, + "grad_norm": 11.46676444609218, + "learning_rate": 4.6737373237630473e-07, + "logits/chosen": -17.123502731323242, + "logits/rejected": -17.59217071533203, + "logps/chosen": -421.08380126953125, + "logps/rejected": -530.366455078125, + "loss": 0.5211, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0707979202270508, + "rewards/margins": 0.6613157391548157, + "rewards/rejected": -1.7321135997772217, + "step": 125 + }, + { + "epoch": 0.25149700598802394, + "grad_norm": 12.183293436350857, + "learning_rate": 4.6650635094610966e-07, + "logits/chosen": -18.38361358642578, + "logits/rejected": -18.456607818603516, + "logps/chosen": -464.5298156738281, + "logps/rejected": -517.3018798828125, + "loss": 0.5149, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0340361595153809, + "rewards/margins": 0.34383463859558105, + "rewards/rejected": -1.3778706789016724, + "step": 126 + }, + { + "epoch": 0.25349301397205587, + "grad_norm": 11.999072355533364, + "learning_rate": 4.6562841730181435e-07, + "logits/chosen": -19.123811721801758, + "logits/rejected": -18.296850204467773, + "logps/chosen": -476.5393371582031, + "logps/rejected": -521.8916015625, + "loss": 0.5389, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2099838256835938, + "rewards/margins": 0.3222024738788605, + "rewards/rejected": -1.532186508178711, + "step": 127 + }, + { + "epoch": 0.2554890219560878, + "grad_norm": 12.045125886996352, + "learning_rate": 4.647399742326661e-07, + "logits/chosen": -17.34141731262207, + "logits/rejected": -17.46158218383789, + "logps/chosen": -414.46832275390625, + "logps/rejected": -451.8553466796875, + "loss": 0.5338, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9311251640319824, + "rewards/margins": 0.434777170419693, + "rewards/rejected": -1.365902304649353, + "step": 128 + }, + { + "epoch": 0.25748502994011974, + "grad_norm": 11.977860936951346, + "learning_rate": 4.6384106504012665e-07, + "logits/chosen": -17.99986457824707, + "logits/rejected": -17.715444564819336, + "logps/chosen": -358.08837890625, + 
"logps/rejected": -396.7147521972656, + "loss": 0.4943, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7580198049545288, + "rewards/margins": 0.34733161330223083, + "rewards/rejected": -1.105351448059082, + "step": 129 + }, + { + "epoch": 0.25948103792415167, + "grad_norm": 11.874157690404008, + "learning_rate": 4.6293173353576186e-07, + "logits/chosen": -19.319414138793945, + "logits/rejected": -19.246450424194336, + "logps/chosen": -460.84991455078125, + "logps/rejected": -561.8507690429688, + "loss": 0.5235, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8761596083641052, + "rewards/margins": 0.9677722454071045, + "rewards/rejected": -1.843931794166565, + "step": 130 + }, + { + "epoch": 0.26147704590818366, + "grad_norm": 11.642857128737496, + "learning_rate": 4.6201202403910643e-07, + "logits/chosen": -18.393766403198242, + "logits/rejected": -18.55428695678711, + "logps/chosen": -434.67999267578125, + "logps/rejected": -488.51824951171875, + "loss": 0.5162, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6767091751098633, + "rewards/margins": 0.5004065632820129, + "rewards/rejected": -1.1771156787872314, + "step": 131 + }, + { + "epoch": 0.2634730538922156, + "grad_norm": 11.491974226815117, + "learning_rate": 4.6108198137550377e-07, + "logits/chosen": -19.24776268005371, + "logits/rejected": -18.658472061157227, + "logps/chosen": -440.7890625, + "logps/rejected": -503.10595703125, + "loss": 0.4921, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9644001722335815, + "rewards/margins": 0.6125253438949585, + "rewards/rejected": -1.57692551612854, + "step": 132 + }, + { + "epoch": 0.2654690618762475, + "grad_norm": 12.899140291726132, + "learning_rate": 4.6014165087392105e-07, + "logits/chosen": -19.86013412475586, + "logits/rejected": -19.67055320739746, + "logps/chosen": -401.9028625488281, + "logps/rejected": -435.5128479003906, + "loss": 0.5094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9691441655158997, + "rewards/margins": 0.3254424035549164, + "rewards/rejected": -1.294586420059204, + "step": 133 + }, + { + "epoch": 0.26746506986027946, + "grad_norm": 16.439092593676218, + "learning_rate": 4.591910783647404e-07, + "logits/chosen": -18.951297760009766, + "logits/rejected": -19.075408935546875, + "logps/chosen": -498.13165283203125, + "logps/rejected": -586.1142578125, + "loss": 0.5117, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8596202731132507, + "rewards/margins": 0.8309850096702576, + "rewards/rejected": -1.6906054019927979, + "step": 134 + }, + { + "epoch": 0.2694610778443114, + "grad_norm": 16.957511057498678, + "learning_rate": 4.582303101775248e-07, + "logits/chosen": -18.295085906982422, + "logits/rejected": -17.897132873535156, + "logps/chosen": -457.41119384765625, + "logps/rejected": -517.1851196289062, + "loss": 0.5318, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8927091956138611, + "rewards/margins": 0.5656099319458008, + "rewards/rejected": -1.458319067955017, + "step": 135 + }, + { + "epoch": 0.2714570858283433, + "grad_norm": 12.506389333337408, + "learning_rate": 4.572593931387604e-07, + "logits/chosen": -18.31269073486328, + "logits/rejected": -18.61883544921875, + "logps/chosen": -440.43206787109375, + "logps/rejected": -573.1978759765625, + "loss": 0.4875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9546418190002441, + "rewards/margins": 0.8689519762992859, + "rewards/rejected": -1.8235938549041748, + "step": 136 + }, + { + "epoch": 0.27345309381237526, + "grad_norm": 14.756504766621427, + 
"learning_rate": 4.5627837456957374e-07, + "logits/chosen": -17.93488121032715, + "logits/rejected": -18.0272274017334, + "logps/chosen": -484.2279052734375, + "logps/rejected": -520.9596557617188, + "loss": 0.5817, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9371182322502136, + "rewards/margins": 0.3456663191318512, + "rewards/rejected": -1.2827844619750977, + "step": 137 + }, + { + "epoch": 0.2754491017964072, + "grad_norm": 13.948355377700631, + "learning_rate": 4.55287302283426e-07, + "logits/chosen": -18.253450393676758, + "logits/rejected": -17.966075897216797, + "logps/chosen": -420.6814880371094, + "logps/rejected": -508.517333984375, + "loss": 0.5273, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.033136010169983, + "rewards/margins": 0.7523062825202942, + "rewards/rejected": -1.7854422330856323, + "step": 138 + }, + { + "epoch": 0.2774451097804391, + "grad_norm": 12.827093040680634, + "learning_rate": 4.542862245837821e-07, + "logits/chosen": -18.383729934692383, + "logits/rejected": -18.069950103759766, + "logps/chosen": -398.27105712890625, + "logps/rejected": -538.0073852539062, + "loss": 0.4845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9848905205726624, + "rewards/margins": 0.9015528559684753, + "rewards/rejected": -1.8864431381225586, + "step": 139 + }, + { + "epoch": 0.27944111776447106, + "grad_norm": 16.782793626489322, + "learning_rate": 4.5327519026175686e-07, + "logits/chosen": -19.082073211669922, + "logits/rejected": -18.443805694580078, + "logps/chosen": -403.085693359375, + "logps/rejected": -444.4017639160156, + "loss": 0.5388, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9073647260665894, + "rewards/margins": 0.35679662227630615, + "rewards/rejected": -1.264161467552185, + "step": 140 + }, + { + "epoch": 0.281437125748503, + "grad_norm": 11.997391966133737, + "learning_rate": 4.5225424859373684e-07, + "logits/chosen": -18.931427001953125, + "logits/rejected": -18.6968936920166, + "logps/chosen": -479.6912841796875, + "logps/rejected": -543.6511840820312, + "loss": 0.5032, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1790649890899658, + "rewards/margins": 0.5665948987007141, + "rewards/rejected": -1.7456599473953247, + "step": 141 + }, + { + "epoch": 0.2834331337325349, + "grad_norm": 16.73284107773462, + "learning_rate": 4.512234493389785e-07, + "logits/chosen": -18.50774383544922, + "logits/rejected": -17.77086067199707, + "logps/chosen": -400.50604248046875, + "logps/rejected": -467.09259033203125, + "loss": 0.5364, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9760237336158752, + "rewards/margins": 0.4449756145477295, + "rewards/rejected": -1.42099928855896, + "step": 142 + }, + { + "epoch": 0.28542914171656686, + "grad_norm": 14.572499779841467, + "learning_rate": 4.501828427371833e-07, + "logits/chosen": -17.65532684326172, + "logits/rejected": -18.490184783935547, + "logps/chosen": -442.3606872558594, + "logps/rejected": -481.2702941894531, + "loss": 0.5458, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.434018850326538, + "rewards/margins": 0.32890772819519043, + "rewards/rejected": -1.7629268169403076, + "step": 143 + }, + { + "epoch": 0.2874251497005988, + "grad_norm": 12.077196392810203, + "learning_rate": 4.4913247950604903e-07, + "logits/chosen": -17.562110900878906, + "logits/rejected": -18.58519744873047, + "logps/chosen": -514.4218139648438, + "logps/rejected": -602.9424438476562, + "loss": 0.5105, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.461451768875122, + 
"rewards/margins": 0.6788410544395447, + "rewards/rejected": -2.1402928829193115, + "step": 144 + }, + { + "epoch": 0.2894211576846307, + "grad_norm": 12.508567044518502, + "learning_rate": 4.4807241083879764e-07, + "logits/chosen": -17.865825653076172, + "logits/rejected": -17.985769271850586, + "logps/chosen": -434.7288513183594, + "logps/rejected": -482.5139465332031, + "loss": 0.5639, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4301695823669434, + "rewards/margins": 0.34268102049827576, + "rewards/rejected": -1.772850751876831, + "step": 145 + }, + { + "epoch": 0.29141716566866266, + "grad_norm": 14.199498491697744, + "learning_rate": 4.470026884016804e-07, + "logits/chosen": -19.102584838867188, + "logits/rejected": -19.021583557128906, + "logps/chosen": -458.40802001953125, + "logps/rejected": -512.4900512695312, + "loss": 0.524, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1264287233352661, + "rewards/margins": 0.5926605463027954, + "rewards/rejected": -1.7190892696380615, + "step": 146 + }, + { + "epoch": 0.2934131736526946, + "grad_norm": 11.094319848631054, + "learning_rate": 4.459233643314599e-07, + "logits/chosen": -17.470544815063477, + "logits/rejected": -17.389942169189453, + "logps/chosen": -371.8180236816406, + "logps/rejected": -449.5108947753906, + "loss": 0.4906, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0187515020370483, + "rewards/margins": 0.49795636534690857, + "rewards/rejected": -1.5167080163955688, + "step": 147 + }, + { + "epoch": 0.2954091816367265, + "grad_norm": 11.94286172302262, + "learning_rate": 4.4483449123286855e-07, + "logits/chosen": -17.679344177246094, + "logits/rejected": -18.112321853637695, + "logps/chosen": -568.5883178710938, + "logps/rejected": -588.3480834960938, + "loss": 0.5598, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4919590950012207, + "rewards/margins": 0.22017580270767212, + "rewards/rejected": -1.7121349573135376, + "step": 148 + }, + { + "epoch": 0.29740518962075846, + "grad_norm": 13.921819284966194, + "learning_rate": 4.437361221760449e-07, + "logits/chosen": -19.270662307739258, + "logits/rejected": -19.816879272460938, + "logps/chosen": -478.0981140136719, + "logps/rejected": -570.726318359375, + "loss": 0.4884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8832843899726868, + "rewards/margins": 0.8125098347663879, + "rewards/rejected": -1.6957943439483643, + "step": 149 + }, + { + "epoch": 0.2994011976047904, + "grad_norm": 13.963951037655912, + "learning_rate": 4.426283106939473e-07, + "logits/chosen": -18.801692962646484, + "logits/rejected": -18.507511138916016, + "logps/chosen": -404.40924072265625, + "logps/rejected": -461.0819091796875, + "loss": 0.5169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9502855539321899, + "rewards/margins": 0.5388966798782349, + "rewards/rejected": -1.4891822338104248, + "step": 150 + }, + { + "epoch": 0.3013972055888224, + "grad_norm": 11.751403000253367, + "learning_rate": 4.415111107797445e-07, + "logits/chosen": -17.666833877563477, + "logits/rejected": -18.496967315673828, + "logps/chosen": -437.1878967285156, + "logps/rejected": -508.55517578125, + "loss": 0.4824, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1933588981628418, + "rewards/margins": 0.5103777050971985, + "rewards/rejected": -1.703736662864685, + "step": 151 + }, + { + "epoch": 0.3033932135728543, + "grad_norm": 15.664706001428366, + "learning_rate": 4.403845768841842e-07, + "logits/chosen": -19.15441131591797, + "logits/rejected": -19.0245418548584, + 
"logps/chosen": -517.4494018554688, + "logps/rejected": -583.9824829101562, + "loss": 0.5042, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1839749813079834, + "rewards/margins": 0.7368112206459045, + "rewards/rejected": -1.9207862615585327, + "step": 152 + }, + { + "epoch": 0.30538922155688625, + "grad_norm": 12.964369532636645, + "learning_rate": 4.392487639129391e-07, + "logits/chosen": -17.844913482666016, + "logits/rejected": -18.007299423217773, + "logps/chosen": -430.0899353027344, + "logps/rejected": -520.009033203125, + "loss": 0.4851, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9588636755943298, + "rewards/margins": 0.9035049080848694, + "rewards/rejected": -1.8623687028884888, + "step": 153 + }, + { + "epoch": 0.3073852295409182, + "grad_norm": 12.352390828674043, + "learning_rate": 4.3810372722393106e-07, + "logits/chosen": -18.35409927368164, + "logits/rejected": -18.485759735107422, + "logps/chosen": -433.47747802734375, + "logps/rejected": -456.7944030761719, + "loss": 0.508, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0077626705169678, + "rewards/margins": 0.3574880361557007, + "rewards/rejected": -1.3652507066726685, + "step": 154 + }, + { + "epoch": 0.3093812375249501, + "grad_norm": 12.60101053011718, + "learning_rate": 4.36949522624633e-07, + "logits/chosen": -18.486366271972656, + "logits/rejected": -18.509090423583984, + "logps/chosen": -511.802734375, + "logps/rejected": -631.6337890625, + "loss": 0.483, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3301135301589966, + "rewards/margins": 0.8965414762496948, + "rewards/rejected": -2.2266550064086914, + "step": 155 + }, + { + "epoch": 0.31137724550898205, + "grad_norm": 12.076316766290963, + "learning_rate": 4.357862063693485e-07, + "logits/chosen": -18.479555130004883, + "logits/rejected": -18.734378814697266, + "logps/chosen": -412.6598815917969, + "logps/rejected": -505.7449951171875, + "loss": 0.4914, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.813165545463562, + "rewards/margins": 0.7614642381668091, + "rewards/rejected": -1.574629783630371, + "step": 156 + }, + { + "epoch": 0.313373253493014, + "grad_norm": 14.618551602047281, + "learning_rate": 4.34613835156471e-07, + "logits/chosen": -18.515804290771484, + "logits/rejected": -18.076066970825195, + "logps/chosen": -518.2728271484375, + "logps/rejected": -609.0603637695312, + "loss": 0.475, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4048892259597778, + "rewards/margins": 0.7887292504310608, + "rewards/rejected": -2.1936185359954834, + "step": 157 + }, + { + "epoch": 0.3153692614770459, + "grad_norm": 14.08049532394846, + "learning_rate": 4.3343246612571905e-07, + "logits/chosen": -18.047651290893555, + "logits/rejected": -18.692232131958008, + "logps/chosen": -412.463623046875, + "logps/rejected": -513.392822265625, + "loss": 0.4961, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0158097743988037, + "rewards/margins": 0.7373302578926086, + "rewards/rejected": -1.753139853477478, + "step": 158 + }, + { + "epoch": 0.31736526946107785, + "grad_norm": 12.549764922498149, + "learning_rate": 4.3224215685535287e-07, + "logits/chosen": -18.217510223388672, + "logits/rejected": -18.150440216064453, + "logps/chosen": -432.1754455566406, + "logps/rejected": -474.58477783203125, + "loss": 0.4799, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9655523896217346, + "rewards/margins": 0.6841700673103333, + "rewards/rejected": -1.6497223377227783, + "step": 159 + }, + { + "epoch": 0.3193612774451098, + 
"grad_norm": 19.934375633780295, + "learning_rate": 4.310429653593669e-07, + "logits/chosen": -19.979171752929688, + "logits/rejected": -20.37800407409668, + "logps/chosen": -465.45416259765625, + "logps/rejected": -549.4246826171875, + "loss": 0.5618, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0206208229064941, + "rewards/margins": 0.7802413702011108, + "rewards/rejected": -1.800862193107605, + "step": 160 + }, + { + "epoch": 0.3213572854291417, + "grad_norm": 14.01791810960553, + "learning_rate": 4.2983495008466273e-07, + "logits/chosen": -19.8514404296875, + "logits/rejected": -19.740142822265625, + "logps/chosen": -557.4789428710938, + "logps/rejected": -551.2369995117188, + "loss": 0.4997, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4763171672821045, + "rewards/margins": 0.34967875480651855, + "rewards/rejected": -1.825995922088623, + "step": 161 + }, + { + "epoch": 0.32335329341317365, + "grad_norm": 12.51564581989701, + "learning_rate": 4.286181699082008e-07, + "logits/chosen": -19.269309997558594, + "logits/rejected": -19.268407821655273, + "logps/chosen": -471.971923828125, + "logps/rejected": -514.5819702148438, + "loss": 0.4595, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1184983253479004, + "rewards/margins": 0.3826027512550354, + "rewards/rejected": -1.5011011362075806, + "step": 162 + }, + { + "epoch": 0.3253493013972056, + "grad_norm": 15.542357689555216, + "learning_rate": 4.273926841341302e-07, + "logits/chosen": -20.759143829345703, + "logits/rejected": -20.3962345123291, + "logps/chosen": -425.91998291015625, + "logps/rejected": -551.6376953125, + "loss": 0.4915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8230355978012085, + "rewards/margins": 1.2822701930999756, + "rewards/rejected": -2.1053059101104736, + "step": 163 + }, + { + "epoch": 0.3273453093812375, + "grad_norm": 11.931760390211124, + "learning_rate": 4.2615855249089867e-07, + "logits/chosen": -19.185287475585938, + "logits/rejected": -19.267969131469727, + "logps/chosen": -475.9891662597656, + "logps/rejected": -579.8348388671875, + "loss": 0.5064, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2812844514846802, + "rewards/margins": 0.9683108329772949, + "rewards/rejected": -2.2495951652526855, + "step": 164 + }, + { + "epoch": 0.32934131736526945, + "grad_norm": 14.325662785483841, + "learning_rate": 4.249158351283413e-07, + "logits/chosen": -18.518449783325195, + "logits/rejected": -18.800861358642578, + "logps/chosen": -423.35260009765625, + "logps/rejected": -591.2802734375, + "loss": 0.4822, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.156442642211914, + "rewards/margins": 1.339384913444519, + "rewards/rejected": -2.4958276748657227, + "step": 165 + }, + { + "epoch": 0.3313373253493014, + "grad_norm": 13.209121169799278, + "learning_rate": 4.236645926147493e-07, + "logits/chosen": -18.23113250732422, + "logits/rejected": -17.86396598815918, + "logps/chosen": -509.241943359375, + "logps/rejected": -584.48193359375, + "loss": 0.5113, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3406342267990112, + "rewards/margins": 0.4660688638687134, + "rewards/rejected": -1.8067032098770142, + "step": 166 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 12.679678337512888, + "learning_rate": 4.224048859339174e-07, + "logits/chosen": -20.044212341308594, + "logits/rejected": -19.84770965576172, + "logps/chosen": -474.1540222167969, + "logps/rejected": -576.6797485351562, + "loss": 0.4782, + "rewards/accuracies": 0.75, + "rewards/chosen": 
-1.5084277391433716, + "rewards/margins": 0.8898066878318787, + "rewards/rejected": -2.3982343673706055, + "step": 167 + }, + { + "epoch": 0.33532934131736525, + "grad_norm": 12.803928092289635, + "learning_rate": 4.2113677648217216e-07, + "logits/chosen": -19.095563888549805, + "logits/rejected": -18.801067352294922, + "logps/chosen": -410.49737548828125, + "logps/rejected": -484.6048278808594, + "loss": 0.4629, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1591637134552002, + "rewards/margins": 0.666135847568512, + "rewards/rejected": -1.8252995014190674, + "step": 168 + }, + { + "epoch": 0.3373253493013972, + "grad_norm": 14.174173721204978, + "learning_rate": 4.1986032606537916e-07, + "logits/chosen": -16.58905792236328, + "logits/rejected": -16.84238624572754, + "logps/chosen": -569.0613403320312, + "logps/rejected": -614.7280883789062, + "loss": 0.5008, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4528391361236572, + "rewards/margins": 0.6195281744003296, + "rewards/rejected": -2.0723674297332764, + "step": 169 + }, + { + "epoch": 0.3393213572854291, + "grad_norm": 13.080053722751183, + "learning_rate": 4.1857559689593083e-07, + "logits/chosen": -18.27602767944336, + "logits/rejected": -17.57232666015625, + "logps/chosen": -453.475830078125, + "logps/rejected": -515.5191040039062, + "loss": 0.4821, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3552886247634888, + "rewards/margins": 0.832838773727417, + "rewards/rejected": -2.1881275177001953, + "step": 170 + }, + { + "epoch": 0.3413173652694611, + "grad_norm": 12.364971785532722, + "learning_rate": 4.172826515897145e-07, + "logits/chosen": -18.919944763183594, + "logits/rejected": -19.389076232910156, + "logps/chosen": -388.2543029785156, + "logps/rejected": -470.0887451171875, + "loss": 0.4563, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9409610033035278, + "rewards/margins": 0.7529804706573486, + "rewards/rejected": -1.6939414739608765, + "step": 171 + }, + { + "epoch": 0.34331337325349304, + "grad_norm": 13.246390983471949, + "learning_rate": 4.1598155316306037e-07, + "logits/chosen": -18.05150032043457, + "logits/rejected": -18.103609085083008, + "logps/chosen": -473.530029296875, + "logps/rejected": -540.929443359375, + "loss": 0.4981, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.117943525314331, + "rewards/margins": 0.5308763384819031, + "rewards/rejected": -1.6488198041915894, + "step": 172 + }, + { + "epoch": 0.34530938123752497, + "grad_norm": 13.04681058875418, + "learning_rate": 4.146723650296701e-07, + "logits/chosen": -20.524463653564453, + "logits/rejected": -20.365554809570312, + "logps/chosen": -426.28448486328125, + "logps/rejected": -516.8076782226562, + "loss": 0.5333, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.146423101425171, + "rewards/margins": 0.9261961579322815, + "rewards/rejected": -2.0726191997528076, + "step": 173 + }, + { + "epoch": 0.3473053892215569, + "grad_norm": 13.558902213534159, + "learning_rate": 4.133551509975264e-07, + "logits/chosen": -18.120737075805664, + "logits/rejected": -17.530546188354492, + "logps/chosen": -374.5181884765625, + "logps/rejected": -488.8734130859375, + "loss": 0.478, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8048744201660156, + "rewards/margins": 1.0980815887451172, + "rewards/rejected": -1.9029561281204224, + "step": 174 + }, + { + "epoch": 0.34930139720558884, + "grad_norm": 12.399640912045163, + "learning_rate": 4.120299752657827e-07, + "logits/chosen": -20.241302490234375, + "logits/rejected": 
-19.89463233947754, + "logps/chosen": -435.4852294921875, + "logps/rejected": -600.3646240234375, + "loss": 0.4833, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0055052042007446, + "rewards/margins": 1.095568299293518, + "rewards/rejected": -2.1010735034942627, + "step": 175 + }, + { + "epoch": 0.35129740518962077, + "grad_norm": 18.67991630316, + "learning_rate": 4.106969024216348e-07, + "logits/chosen": -20.45185661315918, + "logits/rejected": -20.16585922241211, + "logps/chosen": -495.82159423828125, + "logps/rejected": -615.032958984375, + "loss": 0.5435, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4768478870391846, + "rewards/margins": 0.9564065933227539, + "rewards/rejected": -2.4332542419433594, + "step": 176 + }, + { + "epoch": 0.3532934131736527, + "grad_norm": 14.433114513707654, + "learning_rate": 4.0935599743717244e-07, + "logits/chosen": -19.229232788085938, + "logits/rejected": -19.95961570739746, + "logps/chosen": -481.91278076171875, + "logps/rejected": -587.878662109375, + "loss": 0.478, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3701449632644653, + "rewards/margins": 1.1796942949295044, + "rewards/rejected": -2.549839496612549, + "step": 177 + }, + { + "epoch": 0.35528942115768464, + "grad_norm": 13.188342082066123, + "learning_rate": 4.080073256662127e-07, + "logits/chosen": -18.213958740234375, + "logits/rejected": -19.102828979492188, + "logps/chosen": -580.2965087890625, + "logps/rejected": -703.9386596679688, + "loss": 0.4952, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4226490259170532, + "rewards/margins": 1.1172631978988647, + "rewards/rejected": -2.539912223815918, + "step": 178 + }, + { + "epoch": 0.35728542914171657, + "grad_norm": 13.727220401466234, + "learning_rate": 4.066509528411151e-07, + "logits/chosen": -19.163558959960938, + "logits/rejected": -20.03252601623535, + "logps/chosen": -416.621826171875, + "logps/rejected": -556.1578979492188, + "loss": 0.4664, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9182752370834351, + "rewards/margins": 1.0827980041503906, + "rewards/rejected": -2.001073122024536, + "step": 179 + }, + { + "epoch": 0.3592814371257485, + "grad_norm": 13.429269056852185, + "learning_rate": 4.0528694506957754e-07, + "logits/chosen": -19.609607696533203, + "logits/rejected": -19.0802059173584, + "logps/chosen": -463.4999694824219, + "logps/rejected": -571.7774047851562, + "loss": 0.4641, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1410191059112549, + "rewards/margins": 0.9015007019042969, + "rewards/rejected": -2.0425198078155518, + "step": 180 + }, + { + "epoch": 0.36127744510978044, + "grad_norm": 13.035911390029865, + "learning_rate": 4.039153688314145e-07, + "logits/chosen": -19.555572509765625, + "logits/rejected": -19.886554718017578, + "logps/chosen": -429.10003662109375, + "logps/rejected": -526.60888671875, + "loss": 0.4332, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4029077291488647, + "rewards/margins": 0.8520881533622742, + "rewards/rejected": -2.254995822906494, + "step": 181 + }, + { + "epoch": 0.36327345309381237, + "grad_norm": 15.508960229044732, + "learning_rate": 4.025362909753169e-07, + "logits/chosen": -18.01523208618164, + "logits/rejected": -18.14191246032715, + "logps/chosen": -425.21063232421875, + "logps/rejected": -510.28546142578125, + "loss": 0.4844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3480522632598877, + "rewards/margins": 0.6659371852874756, + "rewards/rejected": -2.0139894485473633, + "step": 182 + }, + { + "epoch": 
0.3652694610778443, + "grad_norm": 16.333189353880638, + "learning_rate": 4.0114977871559377e-07, + "logits/chosen": -19.889019012451172, + "logits/rejected": -20.158649444580078, + "logps/chosen": -388.4324645996094, + "logps/rejected": -496.0431823730469, + "loss": 0.502, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3997801542282104, + "rewards/margins": 0.8247696161270142, + "rewards/rejected": -2.2245497703552246, + "step": 183 + }, + { + "epoch": 0.36726546906187624, + "grad_norm": 12.698608735181692, + "learning_rate": 3.997558996288964e-07, + "logits/chosen": -19.303394317626953, + "logits/rejected": -18.998262405395508, + "logps/chosen": -573.5017700195312, + "logps/rejected": -647.8325805664062, + "loss": 0.4686, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7628663778305054, + "rewards/margins": 0.7538273334503174, + "rewards/rejected": -2.5166938304901123, + "step": 184 + }, + { + "epoch": 0.36926147704590817, + "grad_norm": 14.891574400285474, + "learning_rate": 3.983547216509254e-07, + "logits/chosen": -19.312734603881836, + "logits/rejected": -19.04522132873535, + "logps/chosen": -529.5467529296875, + "logps/rejected": -636.4886474609375, + "loss": 0.4513, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.68673574924469, + "rewards/margins": 0.7229608297348022, + "rewards/rejected": -2.409696578979492, + "step": 185 + }, + { + "epoch": 0.3712574850299401, + "grad_norm": 14.363890009536421, + "learning_rate": 3.9694631307311825e-07, + "logits/chosen": -20.22901153564453, + "logits/rejected": -19.24595832824707, + "logps/chosen": -568.1968994140625, + "logps/rejected": -576.79052734375, + "loss": 0.4763, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8447073698043823, + "rewards/margins": 0.5339704751968384, + "rewards/rejected": -2.3786778450012207, + "step": 186 + }, + { + "epoch": 0.37325349301397204, + "grad_norm": 26.393758306411566, + "learning_rate": 3.9553074253932233e-07, + "logits/chosen": -19.26047134399414, + "logits/rejected": -19.362327575683594, + "logps/chosen": -527.1236572265625, + "logps/rejected": -585.4468994140625, + "loss": 0.5022, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4415079355239868, + "rewards/margins": 0.6697221994400024, + "rewards/rejected": -2.1112301349639893, + "step": 187 + }, + { + "epoch": 0.37524950099800397, + "grad_norm": 13.852299099165053, + "learning_rate": 3.941080790424483e-07, + "logits/chosen": -19.18405532836914, + "logits/rejected": -19.502866744995117, + "logps/chosen": -496.4898376464844, + "logps/rejected": -598.0606079101562, + "loss": 0.5085, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5746562480926514, + "rewards/margins": 0.7243920564651489, + "rewards/rejected": -2.2990481853485107, + "step": 188 + }, + { + "epoch": 0.3772455089820359, + "grad_norm": 12.900703996378226, + "learning_rate": 3.9267839192110797e-07, + "logits/chosen": -18.6906795501709, + "logits/rejected": -19.030370712280273, + "logps/chosen": -507.127197265625, + "logps/rejected": -523.0045776367188, + "loss": 0.4568, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4412367343902588, + "rewards/margins": 0.3045232892036438, + "rewards/rejected": -1.7457599639892578, + "step": 189 + }, + { + "epoch": 0.37924151696606784, + "grad_norm": 14.68383696725854, + "learning_rate": 3.912417508562345e-07, + "logits/chosen": -19.418498992919922, + "logits/rejected": -19.330608367919922, + "logps/chosen": -421.99456787109375, + "logps/rejected": -503.4772033691406, + "loss": 0.4865, + "rewards/accuracies": 
0.75, + "rewards/chosen": -1.183841347694397, + "rewards/margins": 0.7099670171737671, + "rewards/rejected": -1.8938082456588745, + "step": 190 + }, + { + "epoch": 0.3812375249500998, + "grad_norm": 11.75575189656368, + "learning_rate": 3.8979822586768666e-07, + "logits/chosen": -21.16440200805664, + "logits/rejected": -20.485124588012695, + "logps/chosen": -514.5684204101562, + "logps/rejected": -571.970703125, + "loss": 0.4889, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4731186628341675, + "rewards/margins": 0.6870366930961609, + "rewards/rejected": -2.1601555347442627, + "step": 191 + }, + { + "epoch": 0.38323353293413176, + "grad_norm": 14.448756790216004, + "learning_rate": 3.88347887310836e-07, + "logits/chosen": -20.147769927978516, + "logits/rejected": -19.916521072387695, + "logps/chosen": -447.382080078125, + "logps/rejected": -597.453125, + "loss": 0.457, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4994301795959473, + "rewards/margins": 1.272138237953186, + "rewards/rejected": -2.771568536758423, + "step": 192 + }, + { + "epoch": 0.3852295409181637, + "grad_norm": 12.23374123752803, + "learning_rate": 3.8689080587313755e-07, + "logits/chosen": -19.599998474121094, + "logits/rejected": -19.84237289428711, + "logps/chosen": -470.7962646484375, + "logps/rejected": -576.8146362304688, + "loss": 0.4514, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.35133695602417, + "rewards/margins": 0.9762927293777466, + "rewards/rejected": -2.327629804611206, + "step": 193 + }, + { + "epoch": 0.3872255489021956, + "grad_norm": 14.698946134402291, + "learning_rate": 3.85427052570685e-07, + "logits/chosen": -19.12531280517578, + "logits/rejected": -18.746444702148438, + "logps/chosen": -468.0195007324219, + "logps/rejected": -533.82177734375, + "loss": 0.483, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.414006233215332, + "rewards/margins": 0.6618885397911072, + "rewards/rejected": -2.075894832611084, + "step": 194 + }, + { + "epoch": 0.38922155688622756, + "grad_norm": 15.811852032735173, + "learning_rate": 3.839566987447491e-07, + "logits/chosen": -19.964893341064453, + "logits/rejected": -19.358760833740234, + "logps/chosen": -411.2624816894531, + "logps/rejected": -456.06390380859375, + "loss": 0.4831, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0391451120376587, + "rewards/margins": 0.6880395412445068, + "rewards/rejected": -1.7271846532821655, + "step": 195 + }, + { + "epoch": 0.3912175648702595, + "grad_norm": 12.305213877004382, + "learning_rate": 3.824798160583012e-07, + "logits/chosen": -19.77585220336914, + "logits/rejected": -19.821577072143555, + "logps/chosen": -391.9801025390625, + "logps/rejected": -514.4705810546875, + "loss": 0.4478, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8564898371696472, + "rewards/margins": 1.0837904214859009, + "rewards/rejected": -1.9402803182601929, + "step": 196 + }, + { + "epoch": 0.3932135728542914, + "grad_norm": 14.833698621408654, + "learning_rate": 3.809964764925198e-07, + "logits/chosen": -19.650461196899414, + "logits/rejected": -18.813337326049805, + "logps/chosen": -556.4791259765625, + "logps/rejected": -641.11083984375, + "loss": 0.4637, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5146292448043823, + "rewards/margins": 1.2214813232421875, + "rewards/rejected": -2.7361104488372803, + "step": 197 + }, + { + "epoch": 0.39520958083832336, + "grad_norm": 13.357898707821814, + "learning_rate": 3.7950675234328256e-07, + "logits/chosen": -20.979297637939453, + "logits/rejected": 
-20.838703155517578, + "logps/chosen": -502.97607421875, + "logps/rejected": -646.2918090820312, + "loss": 0.4448, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1799731254577637, + "rewards/margins": 1.185584545135498, + "rewards/rejected": -3.3655576705932617, + "step": 198 + }, + { + "epoch": 0.3972055888223553, + "grad_norm": 15.671442333902643, + "learning_rate": 3.780107162176429e-07, + "logits/chosen": -19.549230575561523, + "logits/rejected": -20.148639678955078, + "logps/chosen": -540.7632446289062, + "logps/rejected": -629.897705078125, + "loss": 0.4734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1680128574371338, + "rewards/margins": 0.9218576550483704, + "rewards/rejected": -2.0898704528808594, + "step": 199 + }, + { + "epoch": 0.3992015968063872, + "grad_norm": 14.718831099618205, + "learning_rate": 3.765084410302909e-07, + "logits/chosen": -21.243850708007812, + "logits/rejected": -21.027610778808594, + "logps/chosen": -570.447998046875, + "logps/rejected": -706.76220703125, + "loss": 0.4543, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8626515865325928, + "rewards/margins": 1.4691358804702759, + "rewards/rejected": -3.331787347793579, + "step": 200 + }, + { + "epoch": 0.40119760479041916, + "grad_norm": 17.61224276039994, + "learning_rate": 3.75e-07, + "logits/chosen": -19.843793869018555, + "logits/rejected": -19.291797637939453, + "logps/chosen": -513.7886962890625, + "logps/rejected": -493.4097900390625, + "loss": 0.4852, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8454079627990723, + "rewards/margins": 0.3778110444545746, + "rewards/rejected": -2.223219156265259, + "step": 201 + }, + { + "epoch": 0.4031936127744511, + "grad_norm": 14.06450720707383, + "learning_rate": 3.734854666460577e-07, + "logits/chosen": -19.530370712280273, + "logits/rejected": -19.210784912109375, + "logps/chosen": -490.2061767578125, + "logps/rejected": -568.5958862304688, + "loss": 0.4496, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6006358861923218, + "rewards/margins": 0.7596541047096252, + "rewards/rejected": -2.360290050506592, + "step": 202 + }, + { + "epoch": 0.405189620758483, + "grad_norm": 14.859823057968415, + "learning_rate": 3.7196491478468316e-07, + "logits/chosen": -18.494264602661133, + "logits/rejected": -18.339521408081055, + "logps/chosen": -529.4029541015625, + "logps/rejected": -648.5390625, + "loss": 0.4919, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5195198059082031, + "rewards/margins": 0.6871219873428345, + "rewards/rejected": -2.206641674041748, + "step": 203 + }, + { + "epoch": 0.40718562874251496, + "grad_norm": 13.639472563583217, + "learning_rate": 3.704384185254288e-07, + "logits/chosen": -19.484989166259766, + "logits/rejected": -19.255311965942383, + "logps/chosen": -525.3623046875, + "logps/rejected": -636.157958984375, + "loss": 0.4336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5300272703170776, + "rewards/margins": 0.6417598724365234, + "rewards/rejected": -2.1717870235443115, + "step": 204 + }, + { + "epoch": 0.4091816367265469, + "grad_norm": 12.901826312150071, + "learning_rate": 3.689060522675688e-07, + "logits/chosen": -20.066303253173828, + "logits/rejected": -20.39885139465332, + "logps/chosen": -607.7792358398438, + "logps/rejected": -796.8943481445312, + "loss": 0.4076, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6671760082244873, + "rewards/margins": 1.1472505331039429, + "rewards/rejected": -2.8144266605377197, + "step": 205 + }, + { + "epoch": 0.4111776447105788, + 
"grad_norm": 16.04548062088645, + "learning_rate": 3.673678906964727e-07, + "logits/chosen": -20.172260284423828, + "logits/rejected": -19.925201416015625, + "logps/chosen": -427.82568359375, + "logps/rejected": -501.93438720703125, + "loss": 0.4985, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4109199047088623, + "rewards/margins": 0.7991017699241638, + "rewards/rejected": -2.210021734237671, + "step": 206 + }, + { + "epoch": 0.41317365269461076, + "grad_norm": 13.933597686095297, + "learning_rate": 3.658240087799654e-07, + "logits/chosen": -21.581363677978516, + "logits/rejected": -21.107837677001953, + "logps/chosen": -417.2480163574219, + "logps/rejected": -577.9209594726562, + "loss": 0.4376, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2277367115020752, + "rewards/margins": 1.1889203786849976, + "rewards/rejected": -2.4166574478149414, + "step": 207 + }, + { + "epoch": 0.4151696606786427, + "grad_norm": 17.56870232794954, + "learning_rate": 3.6427448176467357e-07, + "logits/chosen": -18.793655395507812, + "logits/rejected": -19.212615966796875, + "logps/chosen": -690.8632202148438, + "logps/rejected": -731.3242797851562, + "loss": 0.4648, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.026750087738037, + "rewards/margins": 0.9612019062042236, + "rewards/rejected": -2.9879517555236816, + "step": 208 + }, + { + "epoch": 0.4171656686626746, + "grad_norm": 12.43534426487344, + "learning_rate": 3.6271938517235765e-07, + "logits/chosen": -19.1177921295166, + "logits/rejected": -18.949975967407227, + "logps/chosen": -414.0655517578125, + "logps/rejected": -504.7939147949219, + "loss": 0.4392, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1112933158874512, + "rewards/margins": 0.9417324066162109, + "rewards/rejected": -2.053025484085083, + "step": 209 + }, + { + "epoch": 0.41916167664670656, + "grad_norm": 14.047345814245144, + "learning_rate": 3.6115879479623183e-07, + "logits/chosen": -20.57330322265625, + "logits/rejected": -20.117856979370117, + "logps/chosen": -516.2587890625, + "logps/rejected": -601.1445922851562, + "loss": 0.446, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6481542587280273, + "rewards/margins": 1.0199745893478394, + "rewards/rejected": -2.6681289672851562, + "step": 210 + }, + { + "epoch": 0.42115768463073855, + "grad_norm": 12.856885130231753, + "learning_rate": 3.595927866972693e-07, + "logits/chosen": -20.086902618408203, + "logits/rejected": -20.204526901245117, + "logps/chosen": -525.0537109375, + "logps/rejected": -619.6348266601562, + "loss": 0.4773, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8990871906280518, + "rewards/margins": 0.745017409324646, + "rewards/rejected": -2.644104480743408, + "step": 211 + }, + { + "epoch": 0.4231536926147705, + "grad_norm": 12.98210694751887, + "learning_rate": 3.580214372004956e-07, + "logits/chosen": -19.950450897216797, + "logits/rejected": -20.33487319946289, + "logps/chosen": -394.83404541015625, + "logps/rejected": -497.79345703125, + "loss": 0.4549, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2214103937149048, + "rewards/margins": 0.9347801804542542, + "rewards/rejected": -2.156190872192383, + "step": 212 + }, + { + "epoch": 0.4251497005988024, + "grad_norm": 19.31248906417738, + "learning_rate": 3.5644482289126813e-07, + "logits/chosen": -18.889362335205078, + "logits/rejected": -19.420923233032227, + "logps/chosen": -548.869873046875, + "logps/rejected": -656.6920166015625, + "loss": 0.515, + "rewards/accuracies": 0.875, + "rewards/chosen": 
-1.7871437072753906, + "rewards/margins": 0.8735736608505249, + "rewards/rejected": -2.660717487335205, + "step": 213 + }, + { + "epoch": 0.42714570858283435, + "grad_norm": 15.762431059083589, + "learning_rate": 3.548630206115443e-07, + "logits/chosen": -20.15378189086914, + "logits/rejected": -20.292001724243164, + "logps/chosen": -404.6817626953125, + "logps/rejected": -477.21563720703125, + "loss": 0.4503, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3599151372909546, + "rewards/margins": 0.5870508551597595, + "rewards/rejected": -1.946966290473938, + "step": 214 + }, + { + "epoch": 0.4291417165668663, + "grad_norm": 11.197156920062987, + "learning_rate": 3.5327610745613546e-07, + "logits/chosen": -19.350055694580078, + "logits/rejected": -19.167869567871094, + "logps/chosen": -528.2733154296875, + "logps/rejected": -597.0432739257812, + "loss": 0.4185, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5914320945739746, + "rewards/margins": 0.8656637668609619, + "rewards/rejected": -2.4570956230163574, + "step": 215 + }, + { + "epoch": 0.4311377245508982, + "grad_norm": 15.40471464271596, + "learning_rate": 3.516841607689501e-07, + "logits/chosen": -19.665361404418945, + "logits/rejected": -19.29046058654785, + "logps/chosen": -510.60125732421875, + "logps/rejected": -581.7286376953125, + "loss": 0.4988, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5553991794586182, + "rewards/margins": 1.0072236061096191, + "rewards/rejected": -2.5626227855682373, + "step": 216 + }, + { + "epoch": 0.43313373253493015, + "grad_norm": 11.966288977941108, + "learning_rate": 3.500872581392238e-07, + "logits/chosen": -19.7708740234375, + "logits/rejected": -20.453022003173828, + "logps/chosen": -386.0968017578125, + "logps/rejected": -516.030029296875, + "loss": 0.4256, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.455343246459961, + "rewards/margins": 0.9787195920944214, + "rewards/rejected": -2.434062957763672, + "step": 217 + }, + { + "epoch": 0.4351297405189621, + "grad_norm": 12.989461741224307, + "learning_rate": 3.4848547739773774e-07, + "logits/chosen": -20.3544864654541, + "logits/rejected": -20.054367065429688, + "logps/chosen": -562.2991943359375, + "logps/rejected": -694.3490600585938, + "loss": 0.4304, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8581453561782837, + "rewards/margins": 1.252414345741272, + "rewards/rejected": -3.1105597019195557, + "step": 218 + }, + { + "epoch": 0.437125748502994, + "grad_norm": 13.490425487755859, + "learning_rate": 3.468788966130257e-07, + "logits/chosen": -19.16351318359375, + "logits/rejected": -18.119314193725586, + "logps/chosen": -550.3583984375, + "logps/rejected": -596.575927734375, + "loss": 0.4787, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.73578679561615, + "rewards/margins": 0.7520574927330017, + "rewards/rejected": -2.487844467163086, + "step": 219 + }, + { + "epoch": 0.43912175648702595, + "grad_norm": 19.70339532784469, + "learning_rate": 3.4526759408756857e-07, + "logits/chosen": -18.91830825805664, + "logits/rejected": -19.164518356323242, + "logps/chosen": -669.055419921875, + "logps/rejected": -708.6876220703125, + "loss": 0.469, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.031832695007324, + "rewards/margins": 0.7691054940223694, + "rewards/rejected": -2.8009378910064697, + "step": 220 + }, + { + "epoch": 0.4411177644710579, + "grad_norm": 13.350681461714572, + "learning_rate": 3.43651648353978e-07, + "logits/chosen": -18.520729064941406, + "logits/rejected": -18.25364875793457, 
+ "logps/chosen": -519.2975463867188, + "logps/rejected": -589.70263671875, + "loss": 0.4508, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2545933723449707, + "rewards/margins": 0.9282989501953125, + "rewards/rejected": -2.182892322540283, + "step": 221 + }, + { + "epoch": 0.4431137724550898, + "grad_norm": 12.985951445968679, + "learning_rate": 3.4203113817116953e-07, + "logits/chosen": -18.722192764282227, + "logits/rejected": -18.830791473388672, + "logps/chosen": -471.1470947265625, + "logps/rejected": -526.8262329101562, + "loss": 0.4195, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1359221935272217, + "rewards/margins": 0.6822535991668701, + "rewards/rejected": -1.8181757926940918, + "step": 222 + }, + { + "epoch": 0.44510978043912175, + "grad_norm": 25.299702854421888, + "learning_rate": 3.40406142520523e-07, + "logits/chosen": -19.63787078857422, + "logits/rejected": -19.73871421813965, + "logps/chosen": -336.0172424316406, + "logps/rejected": -433.79217529296875, + "loss": 0.4428, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9037049412727356, + "rewards/margins": 0.909833550453186, + "rewards/rejected": -1.8135385513305664, + "step": 223 + }, + { + "epoch": 0.4471057884231537, + "grad_norm": 13.981874950399485, + "learning_rate": 3.387767406020343e-07, + "logits/chosen": -20.34281349182129, + "logits/rejected": -20.043916702270508, + "logps/chosen": -561.2789916992188, + "logps/rejected": -721.3209838867188, + "loss": 0.4499, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8252278566360474, + "rewards/margins": 1.3606287240982056, + "rewards/rejected": -3.185856819152832, + "step": 224 + }, + { + "epoch": 0.4491017964071856, + "grad_norm": 13.226395290628279, + "learning_rate": 3.371430118304538e-07, + "logits/chosen": -19.773094177246094, + "logits/rejected": -19.672040939331055, + "logps/chosen": -587.82177734375, + "logps/rejected": -659.1246337890625, + "loss": 0.4739, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.080528736114502, + "rewards/margins": 0.47968238592147827, + "rewards/rejected": -2.560211181640625, + "step": 225 + }, + { + "epoch": 0.45109780439121755, + "grad_norm": 21.18297840943059, + "learning_rate": 3.355050358314172e-07, + "logits/chosen": -18.7939510345459, + "logits/rejected": -19.188337326049805, + "logps/chosen": -548.7808837890625, + "logps/rejected": -600.9039916992188, + "loss": 0.4465, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9692116975784302, + "rewards/margins": 0.954479992389679, + "rewards/rejected": -2.923691749572754, + "step": 226 + }, + { + "epoch": 0.4530938123752495, + "grad_norm": 16.166215514551812, + "learning_rate": 3.338628924375638e-07, + "logits/chosen": -20.488510131835938, + "logits/rejected": -20.845476150512695, + "logps/chosen": -427.76348876953125, + "logps/rejected": -595.1253051757812, + "loss": 0.4742, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2822277545928955, + "rewards/margins": 1.3352444171905518, + "rewards/rejected": -2.6174721717834473, + "step": 227 + }, + { + "epoch": 0.4550898203592814, + "grad_norm": 14.590880103789674, + "learning_rate": 3.322166616846458e-07, + "logits/chosen": -20.881481170654297, + "logits/rejected": -20.86432647705078, + "logps/chosen": -444.5021057128906, + "logps/rejected": -491.39251708984375, + "loss": 0.4437, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5504581928253174, + "rewards/margins": 0.4571852684020996, + "rewards/rejected": -2.007643699645996, + "step": 228 + }, + { + "epoch": 0.45708582834331335, + 
"grad_norm": 12.92186538925852, + "learning_rate": 3.305664238076278e-07, + "logits/chosen": -20.525470733642578, + "logits/rejected": -19.75753402709961, + "logps/chosen": -395.13714599609375, + "logps/rejected": -545.7861938476562, + "loss": 0.4846, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4842098951339722, + "rewards/margins": 1.059401512145996, + "rewards/rejected": -2.543611526489258, + "step": 229 + }, + { + "epoch": 0.4590818363273453, + "grad_norm": 14.842723434798721, + "learning_rate": 3.289122592367756e-07, + "logits/chosen": -19.929006576538086, + "logits/rejected": -19.978878021240234, + "logps/chosen": -541.9116821289062, + "logps/rejected": -634.8336181640625, + "loss": 0.4491, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7504769563674927, + "rewards/margins": 1.1156678199768066, + "rewards/rejected": -2.866144895553589, + "step": 230 + }, + { + "epoch": 0.46107784431137727, + "grad_norm": 14.4942978574615, + "learning_rate": 3.272542485937368e-07, + "logits/chosen": -19.461790084838867, + "logits/rejected": -19.085315704345703, + "logps/chosen": -499.6000061035156, + "logps/rejected": -576.95849609375, + "loss": 0.4646, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5696638822555542, + "rewards/margins": 0.9026892185211182, + "rewards/rejected": -2.472352981567383, + "step": 231 + }, + { + "epoch": 0.4630738522954092, + "grad_norm": 15.223995717750787, + "learning_rate": 3.2559247268761114e-07, + "logits/chosen": -18.554140090942383, + "logits/rejected": -18.188430786132812, + "logps/chosen": -400.5260009765625, + "logps/rejected": -457.10906982421875, + "loss": 0.4356, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.025871992111206, + "rewards/margins": 0.5790168642997742, + "rewards/rejected": -1.604888916015625, + "step": 232 + }, + { + "epoch": 0.46506986027944114, + "grad_norm": 13.813707701215902, + "learning_rate": 3.2392701251101167e-07, + "logits/chosen": -18.753156661987305, + "logits/rejected": -19.0374755859375, + "logps/chosen": -587.1029663085938, + "logps/rejected": -672.4597778320312, + "loss": 0.466, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9925709962844849, + "rewards/margins": 0.7200486660003662, + "rewards/rejected": -2.7126200199127197, + "step": 233 + }, + { + "epoch": 0.46706586826347307, + "grad_norm": 16.032336387205383, + "learning_rate": 3.222579492361179e-07, + "logits/chosen": -18.763916015625, + "logits/rejected": -19.015911102294922, + "logps/chosen": -566.2586059570312, + "logps/rejected": -674.83642578125, + "loss": 0.4563, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4943814277648926, + "rewards/margins": 1.242883563041687, + "rewards/rejected": -2.73726487159729, + "step": 234 + }, + { + "epoch": 0.469061876247505, + "grad_norm": 18.953609466395125, + "learning_rate": 3.2058536421071914e-07, + "logits/chosen": -18.818294525146484, + "logits/rejected": -18.358341217041016, + "logps/chosen": -554.9633178710938, + "logps/rejected": -607.111328125, + "loss": 0.5233, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9789049625396729, + "rewards/margins": 0.6192256212234497, + "rewards/rejected": -2.598130464553833, + "step": 235 + }, + { + "epoch": 0.47105788423153694, + "grad_norm": 16.02285470658808, + "learning_rate": 3.1890933895424976e-07, + "logits/chosen": -18.656795501708984, + "logits/rejected": -18.9710636138916, + "logps/chosen": -535.3847045898438, + "logps/rejected": -593.01708984375, + "loss": 0.5067, + "rewards/accuracies": 0.8125, + "rewards/chosen": 
-1.8721699714660645, + "rewards/margins": 0.48781105875968933, + "rewards/rejected": -2.359980821609497, + "step": 236 + }, + { + "epoch": 0.47305389221556887, + "grad_norm": 14.053925538557477, + "learning_rate": 3.172299551538164e-07, + "logits/chosen": -18.687612533569336, + "logits/rejected": -18.866008758544922, + "logps/chosen": -703.1234130859375, + "logps/rejected": -821.7389526367188, + "loss": 0.4206, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.976850986480713, + "rewards/margins": 0.9709327220916748, + "rewards/rejected": -2.9477834701538086, + "step": 237 + }, + { + "epoch": 0.4750499001996008, + "grad_norm": 13.888076003211115, + "learning_rate": 3.155472946602162e-07, + "logits/chosen": -20.23816680908203, + "logits/rejected": -19.793060302734375, + "logps/chosen": -548.01025390625, + "logps/rejected": -643.1630249023438, + "loss": 0.4279, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6879640817642212, + "rewards/margins": 0.6913233399391174, + "rewards/rejected": -2.3792872428894043, + "step": 238 + }, + { + "epoch": 0.47704590818363274, + "grad_norm": 13.837763164300675, + "learning_rate": 3.1386143948394763e-07, + "logits/chosen": -20.161375045776367, + "logits/rejected": -20.032562255859375, + "logps/chosen": -454.22332763671875, + "logps/rejected": -519.0992431640625, + "loss": 0.4894, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2950642108917236, + "rewards/margins": 0.6701027750968933, + "rewards/rejected": -1.9651669263839722, + "step": 239 + }, + { + "epoch": 0.47904191616766467, + "grad_norm": 17.528357068059776, + "learning_rate": 3.121724717912138e-07, + "logits/chosen": -20.390954971313477, + "logits/rejected": -19.70508575439453, + "logps/chosen": -586.51708984375, + "logps/rejected": -618.0750732421875, + "loss": 0.4483, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9998843669891357, + "rewards/margins": 0.6734704375267029, + "rewards/rejected": -2.6733546257019043, + "step": 240 + }, + { + "epoch": 0.4810379241516966, + "grad_norm": 14.196704035292845, + "learning_rate": 3.104804738999169e-07, + "logits/chosen": -18.2707576751709, + "logits/rejected": -18.1250057220459, + "logps/chosen": -601.682861328125, + "logps/rejected": -642.5757446289062, + "loss": 0.4509, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0958642959594727, + "rewards/margins": 0.6518892049789429, + "rewards/rejected": -2.747753381729126, + "step": 241 + }, + { + "epoch": 0.48303393213572854, + "grad_norm": 16.89149711735207, + "learning_rate": 3.087855282756475e-07, + "logits/chosen": -19.373920440673828, + "logits/rejected": -19.25455093383789, + "logps/chosen": -558.441650390625, + "logps/rejected": -685.5426635742188, + "loss": 0.4812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7334048748016357, + "rewards/margins": 1.1145367622375488, + "rewards/rejected": -2.8479413986206055, + "step": 242 + }, + { + "epoch": 0.48502994011976047, + "grad_norm": 17.843513648960016, + "learning_rate": 3.0708771752766395e-07, + "logits/chosen": -20.650711059570312, + "logits/rejected": -19.873567581176758, + "logps/chosen": -551.3946533203125, + "logps/rejected": -695.5105590820312, + "loss": 0.4222, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.690913200378418, + "rewards/margins": 1.3804028034210205, + "rewards/rejected": -3.0713162422180176, + "step": 243 + }, + { + "epoch": 0.4870259481037924, + "grad_norm": 14.705398106965339, + "learning_rate": 3.053871244048669e-07, + "logits/chosen": -19.685924530029297, + "logits/rejected": 
-20.568681716918945, + "logps/chosen": -460.550537109375, + "logps/rejected": -524.4147338867188, + "loss": 0.4803, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4355939626693726, + "rewards/margins": 0.5603222846984863, + "rewards/rejected": -1.9959162473678589, + "step": 244 + }, + { + "epoch": 0.48902195608782434, + "grad_norm": 15.46176286439623, + "learning_rate": 3.036838317917658e-07, + "logits/chosen": -18.501953125, + "logits/rejected": -19.013107299804688, + "logps/chosen": -574.2833251953125, + "logps/rejected": -710.7667236328125, + "loss": 0.4301, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5793968439102173, + "rewards/margins": 1.549849033355713, + "rewards/rejected": -3.1292459964752197, + "step": 245 + }, + { + "epoch": 0.49101796407185627, + "grad_norm": 15.399821902790524, + "learning_rate": 3.0197792270443976e-07, + "logits/chosen": -20.143672943115234, + "logits/rejected": -19.312414169311523, + "logps/chosen": -465.90557861328125, + "logps/rejected": -530.7208251953125, + "loss": 0.4609, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.567827820777893, + "rewards/margins": 0.7017680406570435, + "rewards/rejected": -2.2695958614349365, + "step": 246 + }, + { + "epoch": 0.4930139720558882, + "grad_norm": 13.365354324692614, + "learning_rate": 3.002694802864912e-07, + "logits/chosen": -19.2493896484375, + "logits/rejected": -19.1527099609375, + "logps/chosen": -371.802490234375, + "logps/rejected": -477.4065246582031, + "loss": 0.414, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1133661270141602, + "rewards/margins": 0.8846907019615173, + "rewards/rejected": -1.9980566501617432, + "step": 247 + }, + { + "epoch": 0.49500998003992014, + "grad_norm": 13.171743969163947, + "learning_rate": 2.98558587804993e-07, + "logits/chosen": -20.076560974121094, + "logits/rejected": -20.700763702392578, + "logps/chosen": -525.328125, + "logps/rejected": -625.5068359375, + "loss": 0.3772, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.616844654083252, + "rewards/margins": 1.0423572063446045, + "rewards/rejected": -2.6592018604278564, + "step": 248 + }, + { + "epoch": 0.49700598802395207, + "grad_norm": 13.46509740936646, + "learning_rate": 2.968453286464312e-07, + "logits/chosen": -20.32817840576172, + "logits/rejected": -19.739133834838867, + "logps/chosen": -538.3781127929688, + "logps/rejected": -731.992431640625, + "loss": 0.3985, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7721741199493408, + "rewards/margins": 1.9263598918914795, + "rewards/rejected": -3.6985340118408203, + "step": 249 + }, + { + "epoch": 0.499001996007984, + "grad_norm": 13.647026896565563, + "learning_rate": 2.9512978631264e-07, + "logits/chosen": -19.286752700805664, + "logits/rejected": -19.01239013671875, + "logps/chosen": -533.054931640625, + "logps/rejected": -629.1688232421875, + "loss": 0.4277, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2967782020568848, + "rewards/margins": 1.074138879776001, + "rewards/rejected": -2.3709170818328857, + "step": 250 + }, + { + "epoch": 0.500998003992016, + "grad_norm": 12.433462674366762, + "learning_rate": 2.934120444167326e-07, + "logits/chosen": -19.06900978088379, + "logits/rejected": -19.819610595703125, + "logps/chosen": -459.6295166015625, + "logps/rejected": -622.0855102539062, + "loss": 0.4119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2915412187576294, + "rewards/margins": 1.6004011631011963, + "rewards/rejected": -2.8919425010681152, + "step": 251 + }, + { + "epoch": 0.5029940119760479, + 
"grad_norm": 13.330997971956275, + "learning_rate": 2.916921866790256e-07, + "logits/chosen": -19.483095169067383, + "logits/rejected": -19.165523529052734, + "logps/chosen": -636.4345703125, + "logps/rejected": -667.2681884765625, + "loss": 0.4478, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.135575294494629, + "rewards/margins": 0.516217827796936, + "rewards/rejected": -2.6517930030822754, + "step": 252 + }, + { + "epoch": 0.5049900199600799, + "grad_norm": 12.284544480667769, + "learning_rate": 2.899702969229587e-07, + "logits/chosen": -19.874605178833008, + "logits/rejected": -20.1215763092041, + "logps/chosen": -472.7792663574219, + "logps/rejected": -599.262939453125, + "loss": 0.3955, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4192883968353271, + "rewards/margins": 1.275083065032959, + "rewards/rejected": -2.694371461868286, + "step": 253 + }, + { + "epoch": 0.5069860279441117, + "grad_norm": 15.336351403927143, + "learning_rate": 2.8824645907100955e-07, + "logits/chosen": -20.512561798095703, + "logits/rejected": -19.61855697631836, + "logps/chosen": -571.308349609375, + "logps/rejected": -582.1721801757812, + "loss": 0.4667, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6224905252456665, + "rewards/margins": 0.5911039710044861, + "rewards/rejected": -2.213594436645508, + "step": 254 + }, + { + "epoch": 0.5089820359281437, + "grad_norm": 14.878520589776237, + "learning_rate": 2.865207571406029e-07, + "logits/chosen": -19.37316131591797, + "logits/rejected": -19.89968490600586, + "logps/chosen": -524.258056640625, + "logps/rejected": -798.5974731445312, + "loss": 0.4355, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7529133558273315, + "rewards/margins": 2.135753870010376, + "rewards/rejected": -3.888667345046997, + "step": 255 + }, + { + "epoch": 0.5109780439121756, + "grad_norm": 16.354875819925365, + "learning_rate": 2.8479327524001633e-07, + "logits/chosen": -20.016483306884766, + "logits/rejected": -20.36672019958496, + "logps/chosen": -499.89056396484375, + "logps/rejected": -636.91943359375, + "loss": 0.4246, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5554924011230469, + "rewards/margins": 1.422688364982605, + "rewards/rejected": -2.9781811237335205, + "step": 256 + }, + { + "epoch": 0.5129740518962076, + "grad_norm": 20.065788599542866, + "learning_rate": 2.830640975642806e-07, + "logits/chosen": -21.746936798095703, + "logits/rejected": -20.40896987915039, + "logps/chosen": -654.2992553710938, + "logps/rejected": -704.3209838867188, + "loss": 0.4569, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4051055908203125, + "rewards/margins": 0.7097451090812683, + "rewards/rejected": -3.1148507595062256, + "step": 257 + }, + { + "epoch": 0.5149700598802395, + "grad_norm": 15.50568263353288, + "learning_rate": 2.8133330839107604e-07, + "logits/chosen": -19.940874099731445, + "logits/rejected": -19.60299301147461, + "logps/chosen": -545.5746459960938, + "logps/rejected": -667.3504638671875, + "loss": 0.4183, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9626977443695068, + "rewards/margins": 1.1021788120269775, + "rewards/rejected": -3.0648765563964844, + "step": 258 + }, + { + "epoch": 0.5169660678642715, + "grad_norm": 12.835594737631492, + "learning_rate": 2.796009920766253e-07, + "logits/chosen": -19.640548706054688, + "logits/rejected": -19.253395080566406, + "logps/chosen": -555.4740600585938, + "logps/rejected": -731.1700439453125, + "loss": 0.4308, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6881296634674072, 
+ "rewards/margins": 1.8761582374572754, + "rewards/rejected": -3.5642879009246826, + "step": 259 + }, + { + "epoch": 0.5189620758483033, + "grad_norm": 13.23490536958255, + "learning_rate": 2.7786723305158135e-07, + "logits/chosen": -19.513334274291992, + "logits/rejected": -19.781173706054688, + "logps/chosen": -438.3570556640625, + "logps/rejected": -544.1898193359375, + "loss": 0.3835, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.510940432548523, + "rewards/margins": 0.972527027130127, + "rewards/rejected": -2.4834673404693604, + "step": 260 + }, + { + "epoch": 0.5209580838323353, + "grad_norm": 14.163744381342129, + "learning_rate": 2.761321158169134e-07, + "logits/chosen": -19.43235206604004, + "logits/rejected": -19.761728286743164, + "logps/chosen": -659.931396484375, + "logps/rejected": -784.9754028320312, + "loss": 0.4556, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1872830390930176, + "rewards/margins": 1.1087684631347656, + "rewards/rejected": -3.296051502227783, + "step": 261 + }, + { + "epoch": 0.5229540918163673, + "grad_norm": 15.369097445080758, + "learning_rate": 2.7439572493978737e-07, + "logits/chosen": -20.19454002380371, + "logits/rejected": -20.246570587158203, + "logps/chosen": -498.49896240234375, + "logps/rejected": -600.5203247070312, + "loss": 0.4218, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4001764059066772, + "rewards/margins": 1.2271099090576172, + "rewards/rejected": -2.627286434173584, + "step": 262 + }, + { + "epoch": 0.5249500998003992, + "grad_norm": 13.540074391202866, + "learning_rate": 2.726581450494451e-07, + "logits/chosen": -19.287321090698242, + "logits/rejected": -19.06431770324707, + "logps/chosen": -585.2114868164062, + "logps/rejected": -710.250732421875, + "loss": 0.4398, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.719393014907837, + "rewards/margins": 1.5113741159439087, + "rewards/rejected": -3.230767250061035, + "step": 263 + }, + { + "epoch": 0.5269461077844312, + "grad_norm": 16.14836459040014, + "learning_rate": 2.709194608330789e-07, + "logits/chosen": -19.105375289916992, + "logits/rejected": -19.293088912963867, + "logps/chosen": -679.0133056640625, + "logps/rejected": -880.2947998046875, + "loss": 0.4793, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0702013969421387, + "rewards/margins": 1.9545456171035767, + "rewards/rejected": -4.024746894836426, + "step": 264 + }, + { + "epoch": 0.5289421157684631, + "grad_norm": 13.93014612652964, + "learning_rate": 2.6917975703170465e-07, + "logits/chosen": -20.631940841674805, + "logits/rejected": -20.239978790283203, + "logps/chosen": -551.091064453125, + "logps/rejected": -643.3578491210938, + "loss": 0.4321, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8941808938980103, + "rewards/margins": 1.0745964050292969, + "rewards/rejected": -2.9687774181365967, + "step": 265 + }, + { + "epoch": 0.530938123752495, + "grad_norm": 13.940773373889629, + "learning_rate": 2.674391184360313e-07, + "logits/chosen": -20.835819244384766, + "logits/rejected": -20.55775260925293, + "logps/chosen": -579.5352783203125, + "logps/rejected": -668.495361328125, + "loss": 0.4576, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4945411682128906, + "rewards/margins": 0.9422330856323242, + "rewards/rejected": -3.436774730682373, + "step": 266 + }, + { + "epoch": 0.5329341317365269, + "grad_norm": 15.347566297152046, + "learning_rate": 2.6569762988232837e-07, + "logits/chosen": -20.884387969970703, + "logits/rejected": -20.510915756225586, + 
"logps/chosen": -554.3668823242188, + "logps/rejected": -760.4378051757812, + "loss": 0.4118, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9943656921386719, + "rewards/margins": 1.6108964681625366, + "rewards/rejected": -3.605262041091919, + "step": 267 + }, + { + "epoch": 0.5349301397205589, + "grad_norm": 15.083964912398525, + "learning_rate": 2.63955376248291e-07, + "logits/chosen": -20.903093338012695, + "logits/rejected": -20.814924240112305, + "logps/chosen": -468.0397644042969, + "logps/rejected": -557.53173828125, + "loss": 0.4452, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.621910572052002, + "rewards/margins": 0.8261189460754395, + "rewards/rejected": -2.4480295181274414, + "step": 268 + }, + { + "epoch": 0.5369261477045908, + "grad_norm": 21.070254941679767, + "learning_rate": 2.6221244244890336e-07, + "logits/chosen": -19.996227264404297, + "logits/rejected": -19.785118103027344, + "logps/chosen": -508.9876708984375, + "logps/rejected": -666.5833740234375, + "loss": 0.5038, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7276027202606201, + "rewards/margins": 1.1843074560165405, + "rewards/rejected": -2.91191029548645, + "step": 269 + }, + { + "epoch": 0.5389221556886228, + "grad_norm": 13.987780368181504, + "learning_rate": 2.6046891343229986e-07, + "logits/chosen": -19.805574417114258, + "logits/rejected": -20.00739288330078, + "logps/chosen": -516.5226440429688, + "logps/rejected": -636.6571044921875, + "loss": 0.4289, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.874422311782837, + "rewards/margins": 1.1742898225784302, + "rewards/rejected": -3.0487117767333984, + "step": 270 + }, + { + "epoch": 0.5409181636726547, + "grad_norm": 16.160524902073316, + "learning_rate": 2.5872487417562527e-07, + "logits/chosen": -18.61543083190918, + "logits/rejected": -19.55666732788086, + "logps/chosen": -656.6258544921875, + "logps/rejected": -729.110595703125, + "loss": 0.4508, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.253310203552246, + "rewards/margins": 1.0833779573440552, + "rewards/rejected": -3.33668851852417, + "step": 271 + }, + { + "epoch": 0.5429141716566867, + "grad_norm": 16.13794066326181, + "learning_rate": 2.569804096808922e-07, + "logits/chosen": -19.252918243408203, + "logits/rejected": -19.608856201171875, + "logps/chosen": -544.688232421875, + "logps/rejected": -677.6744384765625, + "loss": 0.4402, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7599573135375977, + "rewards/margins": 1.116705060005188, + "rewards/rejected": -2.876662492752075, + "step": 272 + }, + { + "epoch": 0.5449101796407185, + "grad_norm": 15.251714021976639, + "learning_rate": 2.5523560497083924e-07, + "logits/chosen": -19.35736083984375, + "logits/rejected": -19.229040145874023, + "logps/chosen": -520.7670288085938, + "logps/rejected": -589.5751953125, + "loss": 0.4691, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5088708400726318, + "rewards/margins": 0.7264382839202881, + "rewards/rejected": -2.23530912399292, + "step": 273 + }, + { + "epoch": 0.5469061876247505, + "grad_norm": 14.735102764186708, + "learning_rate": 2.5349054508478635e-07, + "logits/chosen": -20.558042526245117, + "logits/rejected": -20.376361846923828, + "logps/chosen": -581.2306518554688, + "logps/rejected": -683.8033447265625, + "loss": 0.4404, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7012970447540283, + "rewards/margins": 1.082260012626648, + "rewards/rejected": -2.783557176589966, + "step": 274 + }, + { + "epoch": 0.5489021956087824, + "grad_norm": 
15.97902029283524, + "learning_rate": 2.5174531507449037e-07, + "logits/chosen": -19.68293571472168, + "logits/rejected": -19.427963256835938, + "logps/chosen": -575.7290649414062, + "logps/rejected": -604.75634765625, + "loss": 0.4362, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.912692666053772, + "rewards/margins": 0.48665523529052734, + "rewards/rejected": -2.3993477821350098, + "step": 275 + }, + { + "epoch": 0.5508982035928144, + "grad_norm": 13.384588758075404, + "learning_rate": 2.5e-07, + "logits/chosen": -19.830730438232422, + "logits/rejected": -20.07389259338379, + "logps/chosen": -524.1917114257812, + "logps/rejected": -626.604736328125, + "loss": 0.4024, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6113872528076172, + "rewards/margins": 1.069969654083252, + "rewards/rejected": -2.681356906890869, + "step": 276 + }, + { + "epoch": 0.5528942115768463, + "grad_norm": 14.356201234303862, + "learning_rate": 2.482546849255096e-07, + "logits/chosen": -20.20763397216797, + "logits/rejected": -19.939579010009766, + "logps/chosen": -454.4245300292969, + "logps/rejected": -588.796630859375, + "loss": 0.4057, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4646457433700562, + "rewards/margins": 1.3196232318878174, + "rewards/rejected": -2.784268617630005, + "step": 277 + }, + { + "epoch": 0.5548902195608783, + "grad_norm": 14.969828094442494, + "learning_rate": 2.465094549152137e-07, + "logits/chosen": -19.216341018676758, + "logits/rejected": -19.786149978637695, + "logps/chosen": -622.8614501953125, + "logps/rejected": -837.9845581054688, + "loss": 0.3914, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7940454483032227, + "rewards/margins": 1.845615267753601, + "rewards/rejected": -3.6396608352661133, + "step": 278 + }, + { + "epoch": 0.5568862275449101, + "grad_norm": 14.687976893363183, + "learning_rate": 2.447643950291608e-07, + "logits/chosen": -18.467819213867188, + "logits/rejected": -18.3898983001709, + "logps/chosen": -511.738037109375, + "logps/rejected": -543.1464233398438, + "loss": 0.4176, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5452011823654175, + "rewards/margins": 0.6293449997901917, + "rewards/rejected": -2.174546241760254, + "step": 279 + }, + { + "epoch": 0.5588822355289421, + "grad_norm": 13.84652065752045, + "learning_rate": 2.430195903191078e-07, + "logits/chosen": -19.583114624023438, + "logits/rejected": -19.34761619567871, + "logps/chosen": -517.0689697265625, + "logps/rejected": -610.4008178710938, + "loss": 0.4747, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4040230512619019, + "rewards/margins": 1.110020399093628, + "rewards/rejected": -2.5140433311462402, + "step": 280 + }, + { + "epoch": 0.5608782435129741, + "grad_norm": 16.303319683710235, + "learning_rate": 2.412751258243748e-07, + "logits/chosen": -19.688095092773438, + "logits/rejected": -19.690338134765625, + "logps/chosen": -671.603271484375, + "logps/rejected": -790.10205078125, + "loss": 0.4407, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3799023628234863, + "rewards/margins": 0.8681031465530396, + "rewards/rejected": -3.2480056285858154, + "step": 281 + }, + { + "epoch": 0.562874251497006, + "grad_norm": 14.15798233649853, + "learning_rate": 2.395310865677001e-07, + "logits/chosen": -19.338281631469727, + "logits/rejected": -18.604310989379883, + "logps/chosen": -572.2766723632812, + "logps/rejected": -635.2128295898438, + "loss": 0.4257, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1379201412200928, + 
"rewards/margins": 0.7467140555381775, + "rewards/rejected": -2.884634017944336, + "step": 282 + }, + { + "epoch": 0.564870259481038, + "grad_norm": 40.962955478565384, + "learning_rate": 2.3778755755109667e-07, + "logits/chosen": -19.42969512939453, + "logits/rejected": -19.07590675354004, + "logps/chosen": -734.955810546875, + "logps/rejected": -692.2105102539062, + "loss": 0.5513, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.987337350845337, + "rewards/margins": -0.13384895026683807, + "rewards/rejected": -2.8534886837005615, + "step": 283 + }, + { + "epoch": 0.5668662674650699, + "grad_norm": 13.413480487115413, + "learning_rate": 2.3604462375170903e-07, + "logits/chosen": -20.070852279663086, + "logits/rejected": -20.444026947021484, + "logps/chosen": -625.17236328125, + "logps/rejected": -736.5753173828125, + "loss": 0.3945, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.200939655303955, + "rewards/margins": 1.2015022039413452, + "rewards/rejected": -3.4024417400360107, + "step": 284 + }, + { + "epoch": 0.5688622754491018, + "grad_norm": 15.606238042142097, + "learning_rate": 2.3430237011767164e-07, + "logits/chosen": -20.74646759033203, + "logits/rejected": -20.663923263549805, + "logps/chosen": -483.0484313964844, + "logps/rejected": -695.172119140625, + "loss": 0.4272, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.707973599433899, + "rewards/margins": 1.6547538042068481, + "rewards/rejected": -3.362727165222168, + "step": 285 + }, + { + "epoch": 0.5708582834331337, + "grad_norm": 14.723616978428565, + "learning_rate": 2.3256088156396868e-07, + "logits/chosen": -20.511024475097656, + "logits/rejected": -20.550968170166016, + "logps/chosen": -386.5881042480469, + "logps/rejected": -672.30224609375, + "loss": 0.4103, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3118516206741333, + "rewards/margins": 2.176816940307617, + "rewards/rejected": -3.488668918609619, + "step": 286 + }, + { + "epoch": 0.5728542914171657, + "grad_norm": 17.104594605209122, + "learning_rate": 2.3082024296829532e-07, + "logits/chosen": -20.359363555908203, + "logits/rejected": -19.974714279174805, + "logps/chosen": -381.4059143066406, + "logps/rejected": -490.9525146484375, + "loss": 0.4721, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4457169771194458, + "rewards/margins": 0.9019349813461304, + "rewards/rejected": -2.347651958465576, + "step": 287 + }, + { + "epoch": 0.5748502994011976, + "grad_norm": 12.348244088976356, + "learning_rate": 2.2908053916692116e-07, + "logits/chosen": -19.375926971435547, + "logits/rejected": -19.011024475097656, + "logps/chosen": -460.38519287109375, + "logps/rejected": -664.692626953125, + "loss": 0.3529, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5625884532928467, + "rewards/margins": 1.8966760635375977, + "rewards/rejected": -3.4592647552490234, + "step": 288 + }, + { + "epoch": 0.5768463073852296, + "grad_norm": 18.43492666732753, + "learning_rate": 2.2734185495055498e-07, + "logits/chosen": -19.75726318359375, + "logits/rejected": -20.084196090698242, + "logps/chosen": -486.993408203125, + "logps/rejected": -599.3756103515625, + "loss": 0.4596, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9165252447128296, + "rewards/margins": 0.9002697467803955, + "rewards/rejected": -2.8167953491210938, + "step": 289 + }, + { + "epoch": 0.5788423153692615, + "grad_norm": 21.054407486526834, + "learning_rate": 2.2560427506021264e-07, + "logits/chosen": -21.07491111755371, + "logits/rejected": -20.739715576171875, + 
"logps/chosen": -507.4473571777344, + "logps/rejected": -605.4733276367188, + "loss": 0.4921, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7558261156082153, + "rewards/margins": 0.8664282560348511, + "rewards/rejected": -2.6222543716430664, + "step": 290 + }, + { + "epoch": 0.5808383233532934, + "grad_norm": 14.716476253084066, + "learning_rate": 2.2386788418308665e-07, + "logits/chosen": -19.136695861816406, + "logits/rejected": -19.36062240600586, + "logps/chosen": -466.1701354980469, + "logps/rejected": -597.2568359375, + "loss": 0.4432, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9274959564208984, + "rewards/margins": 0.9277093410491943, + "rewards/rejected": -2.8552052974700928, + "step": 291 + }, + { + "epoch": 0.5828343313373253, + "grad_norm": 14.446885493365698, + "learning_rate": 2.2213276694841865e-07, + "logits/chosen": -19.835556030273438, + "logits/rejected": -19.720577239990234, + "logps/chosen": -451.08441162109375, + "logps/rejected": -530.003662109375, + "loss": 0.4187, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7320913076400757, + "rewards/margins": 0.7514263987541199, + "rewards/rejected": -2.483517646789551, + "step": 292 + }, + { + "epoch": 0.5848303393213573, + "grad_norm": 17.330269076521912, + "learning_rate": 2.2039900792337474e-07, + "logits/chosen": -21.39358901977539, + "logits/rejected": -21.413705825805664, + "logps/chosen": -463.1463623046875, + "logps/rejected": -583.657470703125, + "loss": 0.4415, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7131471633911133, + "rewards/margins": 1.0683386325836182, + "rewards/rejected": -2.7814857959747314, + "step": 293 + }, + { + "epoch": 0.5868263473053892, + "grad_norm": 14.007129468785923, + "learning_rate": 2.1866669160892389e-07, + "logits/chosen": -18.738121032714844, + "logits/rejected": -19.75026512145996, + "logps/chosen": -575.805419921875, + "logps/rejected": -824.0968017578125, + "loss": 0.4254, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4536679983139038, + "rewards/margins": 2.2131245136260986, + "rewards/rejected": -3.666792392730713, + "step": 294 + }, + { + "epoch": 0.5888223552894212, + "grad_norm": 16.60108694572921, + "learning_rate": 2.1693590243571935e-07, + "logits/chosen": -19.9157657623291, + "logits/rejected": -20.169784545898438, + "logps/chosen": -573.23046875, + "logps/rejected": -665.9788818359375, + "loss": 0.3684, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9368046522140503, + "rewards/margins": 0.7889972925186157, + "rewards/rejected": -2.725801706314087, + "step": 295 + }, + { + "epoch": 0.590818363273453, + "grad_norm": 14.64139947604633, + "learning_rate": 2.152067247599837e-07, + "logits/chosen": -20.16104507446289, + "logits/rejected": -20.16781997680664, + "logps/chosen": -652.9393920898438, + "logps/rejected": -674.0302734375, + "loss": 0.4555, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5257301330566406, + "rewards/margins": 0.5331373810768127, + "rewards/rejected": -3.0588674545288086, + "step": 296 + }, + { + "epoch": 0.592814371257485, + "grad_norm": 15.25905759535438, + "learning_rate": 2.1347924285939712e-07, + "logits/chosen": -20.704172134399414, + "logits/rejected": -19.96450424194336, + "logps/chosen": -507.278564453125, + "logps/rejected": -505.9412841796875, + "loss": 0.4724, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6988550424575806, + "rewards/margins": 0.4168252944946289, + "rewards/rejected": -2.115680456161499, + "step": 297 + }, + { + "epoch": 0.5948103792415169, + "grad_norm": 
19.931215357752848, + "learning_rate": 2.117535409289905e-07, + "logits/chosen": -21.177928924560547, + "logits/rejected": -21.209348678588867, + "logps/chosen": -413.62567138671875, + "logps/rejected": -724.2362060546875, + "loss": 0.4562, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4525749683380127, + "rewards/margins": 2.526947259902954, + "rewards/rejected": -3.979522228240967, + "step": 298 + }, + { + "epoch": 0.5968063872255489, + "grad_norm": 15.466070557361297, + "learning_rate": 2.100297030770413e-07, + "logits/chosen": -20.977407455444336, + "logits/rejected": -20.545534133911133, + "logps/chosen": -453.90838623046875, + "logps/rejected": -543.0181884765625, + "loss": 0.4821, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7299668788909912, + "rewards/margins": 0.792263388633728, + "rewards/rejected": -2.5222301483154297, + "step": 299 + }, + { + "epoch": 0.5988023952095808, + "grad_norm": 15.538644854464293, + "learning_rate": 2.0830781332097445e-07, + "logits/chosen": -20.28766441345215, + "logits/rejected": -19.859888076782227, + "logps/chosen": -495.1536560058594, + "logps/rejected": -636.5488891601562, + "loss": 0.4248, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4328924417495728, + "rewards/margins": 1.6545376777648926, + "rewards/rejected": -3.087430000305176, + "step": 300 + }, + { + "epoch": 0.6007984031936128, + "grad_norm": 15.125688011588869, + "learning_rate": 2.065879555832674e-07, + "logits/chosen": -19.8198299407959, + "logits/rejected": -19.781383514404297, + "logps/chosen": -614.9022216796875, + "logps/rejected": -763.259033203125, + "loss": 0.3997, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6993727684020996, + "rewards/margins": 1.1797199249267578, + "rewards/rejected": -3.8790926933288574, + "step": 301 + }, + { + "epoch": 0.6027944111776448, + "grad_norm": 16.044735764949653, + "learning_rate": 2.0487021368736002e-07, + "logits/chosen": -19.610458374023438, + "logits/rejected": -19.683618545532227, + "logps/chosen": -607.406005859375, + "logps/rejected": -767.1768798828125, + "loss": 0.421, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3819403648376465, + "rewards/margins": 1.3107446432113647, + "rewards/rejected": -3.692685127258301, + "step": 302 + }, + { + "epoch": 0.6047904191616766, + "grad_norm": 23.962926461003274, + "learning_rate": 2.0315467135356878e-07, + "logits/chosen": -20.334712982177734, + "logits/rejected": -20.212602615356445, + "logps/chosen": -501.9886779785156, + "logps/rejected": -551.6432495117188, + "loss": 0.4439, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7299950122833252, + "rewards/margins": 0.6993128061294556, + "rewards/rejected": -2.4293079376220703, + "step": 303 + }, + { + "epoch": 0.6067864271457086, + "grad_norm": 18.10881682443083, + "learning_rate": 2.0144141219500704e-07, + "logits/chosen": -18.936782836914062, + "logits/rejected": -19.05160140991211, + "logps/chosen": -524.8377075195312, + "logps/rejected": -604.0562133789062, + "loss": 0.4201, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5715314149856567, + "rewards/margins": 0.9257473945617676, + "rewards/rejected": -2.4972786903381348, + "step": 304 + }, + { + "epoch": 0.6087824351297405, + "grad_norm": 17.494266000385768, + "learning_rate": 1.9973051971350888e-07, + "logits/chosen": -19.74258804321289, + "logits/rejected": -19.830686569213867, + "logps/chosen": -574.1022338867188, + "logps/rejected": -671.13427734375, + "loss": 0.4209, + "rewards/accuracies": 0.75, + "rewards/chosen": 
-2.01725435256958, + "rewards/margins": 0.9205825328826904, + "rewards/rejected": -2.9378366470336914, + "step": 305 + }, + { + "epoch": 0.6107784431137725, + "grad_norm": 26.104189081412425, + "learning_rate": 1.980220772955602e-07, + "logits/chosen": -19.05501937866211, + "logits/rejected": -19.7154598236084, + "logps/chosen": -624.7476196289062, + "logps/rejected": -970.2384033203125, + "loss": 0.3823, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.797724962234497, + "rewards/margins": 2.543869972229004, + "rewards/rejected": -4.34159517288208, + "step": 306 + }, + { + "epoch": 0.6127744510978044, + "grad_norm": 16.989107574051708, + "learning_rate": 1.9631616820823418e-07, + "logits/chosen": -20.383739471435547, + "logits/rejected": -19.768543243408203, + "logps/chosen": -396.6142578125, + "logps/rejected": -431.7706298828125, + "loss": 0.5028, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1728373765945435, + "rewards/margins": 0.7236466407775879, + "rewards/rejected": -1.8964838981628418, + "step": 307 + }, + { + "epoch": 0.6147704590818364, + "grad_norm": 12.968487062626416, + "learning_rate": 1.9461287559513318e-07, + "logits/chosen": -20.2695369720459, + "logits/rejected": -20.062299728393555, + "logps/chosen": -569.882568359375, + "logps/rejected": -713.83984375, + "loss": 0.3679, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0745344161987305, + "rewards/margins": 1.5346819162368774, + "rewards/rejected": -3.6092166900634766, + "step": 308 + }, + { + "epoch": 0.6167664670658682, + "grad_norm": 16.448425075026044, + "learning_rate": 1.9291228247233603e-07, + "logits/chosen": -20.340492248535156, + "logits/rejected": -20.029369354248047, + "logps/chosen": -536.5447387695312, + "logps/rejected": -629.1495361328125, + "loss": 0.4095, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9390649795532227, + "rewards/margins": 0.7921390533447266, + "rewards/rejected": -2.73120379447937, + "step": 309 + }, + { + "epoch": 0.6187624750499002, + "grad_norm": 14.227557105375345, + "learning_rate": 1.9121447172435248e-07, + "logits/chosen": -19.732128143310547, + "logits/rejected": -18.906469345092773, + "logps/chosen": -546.9534301757812, + "logps/rejected": -616.367431640625, + "loss": 0.3997, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1328556537628174, + "rewards/margins": 0.8451984524726868, + "rewards/rejected": -2.9780540466308594, + "step": 310 + }, + { + "epoch": 0.6207584830339321, + "grad_norm": 18.990805165227112, + "learning_rate": 1.895195261000831e-07, + "logits/chosen": -19.93746566772461, + "logits/rejected": -18.98281478881836, + "logps/chosen": -620.0504150390625, + "logps/rejected": -729.6024169921875, + "loss": 0.4899, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4116480350494385, + "rewards/margins": 1.04122793674469, + "rewards/rejected": -3.452876091003418, + "step": 311 + }, + { + "epoch": 0.6227544910179641, + "grad_norm": 16.417731370277682, + "learning_rate": 1.8782752820878633e-07, + "logits/chosen": -20.762357711791992, + "logits/rejected": -20.430248260498047, + "logps/chosen": -538.3031616210938, + "logps/rejected": -689.8028564453125, + "loss": 0.3585, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6533321142196655, + "rewards/margins": 1.5152018070220947, + "rewards/rejected": -3.16853404045105, + "step": 312 + }, + { + "epoch": 0.624750499001996, + "grad_norm": 16.28789801821355, + "learning_rate": 1.861385605160524e-07, + "logits/chosen": -20.3260440826416, + "logits/rejected": -20.39736557006836, + 
"logps/chosen": -523.057861328125, + "logps/rejected": -609.2344360351562, + "loss": 0.4389, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9850291013717651, + "rewards/margins": 0.9323675036430359, + "rewards/rejected": -2.9173967838287354, + "step": 313 + }, + { + "epoch": 0.626746506986028, + "grad_norm": 15.824452080667786, + "learning_rate": 1.8445270533978386e-07, + "logits/chosen": -19.66412925720215, + "logits/rejected": -20.326953887939453, + "logps/chosen": -641.4557495117188, + "logps/rejected": -730.4495849609375, + "loss": 0.4671, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.851925849914551, + "rewards/margins": 0.7540739178657532, + "rewards/rejected": -3.6059999465942383, + "step": 314 + }, + { + "epoch": 0.6287425149700598, + "grad_norm": 13.834895066668928, + "learning_rate": 1.8277004484618357e-07, + "logits/chosen": -19.389610290527344, + "logits/rejected": -18.954692840576172, + "logps/chosen": -457.67523193359375, + "logps/rejected": -551.67822265625, + "loss": 0.4086, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5931004285812378, + "rewards/margins": 0.8220267295837402, + "rewards/rejected": -2.4151270389556885, + "step": 315 + }, + { + "epoch": 0.6307385229540918, + "grad_norm": 16.231623830784724, + "learning_rate": 1.810906610457502e-07, + "logits/chosen": -19.13648796081543, + "logits/rejected": -19.79732322692871, + "logps/chosen": -523.3819580078125, + "logps/rejected": -658.5681762695312, + "loss": 0.4065, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1683220863342285, + "rewards/margins": 1.2013542652130127, + "rewards/rejected": -3.369676351547241, + "step": 316 + }, + { + "epoch": 0.6327345309381237, + "grad_norm": 16.28726347208119, + "learning_rate": 1.7941463578928083e-07, + "logits/chosen": -21.37050437927246, + "logits/rejected": -20.991573333740234, + "logps/chosen": -530.572021484375, + "logps/rejected": -699.7788696289062, + "loss": 0.5084, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0319900512695312, + "rewards/margins": 1.6102505922317505, + "rewards/rejected": -3.6422407627105713, + "step": 317 + }, + { + "epoch": 0.6347305389221557, + "grad_norm": 14.532870476828629, + "learning_rate": 1.7774205076388205e-07, + "logits/chosen": -20.120237350463867, + "logits/rejected": -20.24286651611328, + "logps/chosen": -584.264404296875, + "logps/rejected": -733.0006103515625, + "loss": 0.411, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.146928548812866, + "rewards/margins": 1.5503309965133667, + "rewards/rejected": -3.6972596645355225, + "step": 318 + }, + { + "epoch": 0.6367265469061876, + "grad_norm": 13.681025865009596, + "learning_rate": 1.760729874889884e-07, + "logits/chosen": -18.877016067504883, + "logits/rejected": -19.01573944091797, + "logps/chosen": -565.078125, + "logps/rejected": -804.6910400390625, + "loss": 0.398, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9724178314208984, + "rewards/margins": 1.8352855443954468, + "rewards/rejected": -3.8077030181884766, + "step": 319 + }, + { + "epoch": 0.6387225548902196, + "grad_norm": 15.7704191898093, + "learning_rate": 1.744075273123889e-07, + "logits/chosen": -19.746397018432617, + "logits/rejected": -19.476728439331055, + "logps/chosen": -529.2221069335938, + "logps/rejected": -647.2295532226562, + "loss": 0.4649, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6059311628341675, + "rewards/margins": 1.3847919702529907, + "rewards/rejected": -2.990723133087158, + "step": 320 + }, + { + "epoch": 0.6407185628742516, + "grad_norm": 
14.126841947457866, + "learning_rate": 1.7274575140626315e-07, + "logits/chosen": -20.280803680419922, + "logits/rejected": -20.373706817626953, + "logps/chosen": -603.7615356445312, + "logps/rejected": -756.317626953125, + "loss": 0.3544, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.087045431137085, + "rewards/margins": 1.3426419496536255, + "rewards/rejected": -3.429687261581421, + "step": 321 + }, + { + "epoch": 0.6427145708582834, + "grad_norm": 15.438444258816508, + "learning_rate": 1.710877407632244e-07, + "logits/chosen": -19.929384231567383, + "logits/rejected": -19.751436233520508, + "logps/chosen": -588.960693359375, + "logps/rejected": -618.856689453125, + "loss": 0.426, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.159414291381836, + "rewards/margins": 0.37070193886756897, + "rewards/rejected": -2.530116558074951, + "step": 322 + }, + { + "epoch": 0.6447105788423154, + "grad_norm": 14.55784121973574, + "learning_rate": 1.6943357619237225e-07, + "logits/chosen": -19.727588653564453, + "logits/rejected": -19.87441062927246, + "logps/chosen": -596.117919921875, + "logps/rejected": -776.1685180664062, + "loss": 0.3911, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8160898685455322, + "rewards/margins": 1.6515451669692993, + "rewards/rejected": -3.467634916305542, + "step": 323 + }, + { + "epoch": 0.6467065868263473, + "grad_norm": 13.660945117659038, + "learning_rate": 1.6778333831535417e-07, + "logits/chosen": -18.712209701538086, + "logits/rejected": -18.91518783569336, + "logps/chosen": -471.828369140625, + "logps/rejected": -498.8101501464844, + "loss": 0.4356, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.51183021068573, + "rewards/margins": 0.41257262229919434, + "rewards/rejected": -1.9244028329849243, + "step": 324 + }, + { + "epoch": 0.6487025948103793, + "grad_norm": 14.268490761675793, + "learning_rate": 1.6613710756243627e-07, + "logits/chosen": -20.252544403076172, + "logits/rejected": -19.898006439208984, + "logps/chosen": -504.5931396484375, + "logps/rejected": -638.2498168945312, + "loss": 0.3753, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.921488881111145, + "rewards/margins": 1.3233587741851807, + "rewards/rejected": -3.2448477745056152, + "step": 325 + }, + { + "epoch": 0.6506986027944112, + "grad_norm": 15.278109072504288, + "learning_rate": 1.6449496416858282e-07, + "logits/chosen": -20.0165958404541, + "logits/rejected": -19.995262145996094, + "logps/chosen": -691.3213500976562, + "logps/rejected": -679.55859375, + "loss": 0.4287, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.590402603149414, + "rewards/margins": 0.43861329555511475, + "rewards/rejected": -3.0290160179138184, + "step": 326 + }, + { + "epoch": 0.6526946107784432, + "grad_norm": 15.482749752170122, + "learning_rate": 1.6285698816954624e-07, + "logits/chosen": -18.340465545654297, + "logits/rejected": -18.536087036132812, + "logps/chosen": -747.0191650390625, + "logps/rejected": -799.79931640625, + "loss": 0.4452, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6365926265716553, + "rewards/margins": 0.8290849924087524, + "rewards/rejected": -3.465677261352539, + "step": 327 + }, + { + "epoch": 0.654690618762475, + "grad_norm": 39.62551655522353, + "learning_rate": 1.6122325939796578e-07, + "logits/chosen": -20.012954711914062, + "logits/rejected": -19.96734046936035, + "logps/chosen": -476.8362731933594, + "logps/rejected": -573.0825805664062, + "loss": 0.4182, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8147090673446655, + 
"rewards/margins": 1.109825849533081, + "rewards/rejected": -2.924534797668457, + "step": 328 + }, + { + "epoch": 0.656686626746507, + "grad_norm": 13.896657489041019, + "learning_rate": 1.5959385747947695e-07, + "logits/chosen": -20.275821685791016, + "logits/rejected": -20.038957595825195, + "logps/chosen": -496.6339416503906, + "logps/rejected": -587.96630859375, + "loss": 0.4041, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7912815809249878, + "rewards/margins": 0.8887139558792114, + "rewards/rejected": -2.679995536804199, + "step": 329 + }, + { + "epoch": 0.6586826347305389, + "grad_norm": 14.121557644039507, + "learning_rate": 1.579688618288305e-07, + "logits/chosen": -19.625347137451172, + "logits/rejected": -19.073387145996094, + "logps/chosen": -505.35906982421875, + "logps/rejected": -594.4898681640625, + "loss": 0.4132, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7679235935211182, + "rewards/margins": 1.1094515323638916, + "rewards/rejected": -2.8773748874664307, + "step": 330 + }, + { + "epoch": 0.6606786427145709, + "grad_norm": 15.286311030110486, + "learning_rate": 1.5634835164602196e-07, + "logits/chosen": -20.13866424560547, + "logits/rejected": -20.248882293701172, + "logps/chosen": -440.9212646484375, + "logps/rejected": -538.68212890625, + "loss": 0.4262, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2733371257781982, + "rewards/margins": 1.1516056060791016, + "rewards/rejected": -2.4249427318573, + "step": 331 + }, + { + "epoch": 0.6626746506986028, + "grad_norm": 15.924081453100836, + "learning_rate": 1.5473240591243149e-07, + "logits/chosen": -20.336015701293945, + "logits/rejected": -19.631010055541992, + "logps/chosen": -538.7817993164062, + "logps/rejected": -691.6004028320312, + "loss": 0.4536, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.037027597427368, + "rewards/margins": 1.2660547494888306, + "rewards/rejected": -3.3030824661254883, + "step": 332 + }, + { + "epoch": 0.6646706586826348, + "grad_norm": 15.763551518922286, + "learning_rate": 1.5312110338697427e-07, + "logits/chosen": -20.395259857177734, + "logits/rejected": -20.275157928466797, + "logps/chosen": -472.280517578125, + "logps/rejected": -671.739501953125, + "loss": 0.4072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6096742153167725, + "rewards/margins": 1.9192960262298584, + "rewards/rejected": -3.528970241546631, + "step": 333 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 17.626302766622175, + "learning_rate": 1.5151452260226221e-07, + "logits/chosen": -20.2435302734375, + "logits/rejected": -20.33843994140625, + "logps/chosen": -616.4385375976562, + "logps/rejected": -653.4850463867188, + "loss": 0.429, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.501798629760742, + "rewards/margins": 0.9154990315437317, + "rewards/rejected": -3.417297840118408, + "step": 334 + }, + { + "epoch": 0.6686626746506986, + "grad_norm": 63.27015094274589, + "learning_rate": 1.4991274186077628e-07, + "logits/chosen": -19.318078994750977, + "logits/rejected": -18.871360778808594, + "logps/chosen": -683.593505859375, + "logps/rejected": -818.3577880859375, + "loss": 0.4233, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2589824199676514, + "rewards/margins": 1.4071542024612427, + "rewards/rejected": -3.6661362648010254, + "step": 335 + }, + { + "epoch": 0.6706586826347305, + "grad_norm": 15.79033358862706, + "learning_rate": 1.4831583923104998e-07, + "logits/chosen": -19.07425308227539, + "logits/rejected": -19.29555320739746, + "logps/chosen": 
-735.476318359375, + "logps/rejected": -827.9088134765625, + "loss": 0.4393, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4406747817993164, + "rewards/margins": 1.2893245220184326, + "rewards/rejected": -3.729999303817749, + "step": 336 + }, + { + "epoch": 0.6726546906187625, + "grad_norm": 17.61745253414518, + "learning_rate": 1.4672389254386457e-07, + "logits/chosen": -20.20433235168457, + "logits/rejected": -20.500490188598633, + "logps/chosen": -464.4842834472656, + "logps/rejected": -629.8096923828125, + "loss": 0.4282, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.407528042793274, + "rewards/margins": 1.6521190404891968, + "rewards/rejected": -3.059647560119629, + "step": 337 + }, + { + "epoch": 0.6746506986027944, + "grad_norm": 13.399050900409183, + "learning_rate": 1.451369793884557e-07, + "logits/chosen": -20.96167755126953, + "logits/rejected": -20.80148696899414, + "logps/chosen": -532.9072265625, + "logps/rejected": -853.7723388671875, + "loss": 0.4119, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.063987970352173, + "rewards/margins": 2.5306684970855713, + "rewards/rejected": -4.594656467437744, + "step": 338 + }, + { + "epoch": 0.6766467065868264, + "grad_norm": 14.221672502387845, + "learning_rate": 1.4355517710873182e-07, + "logits/chosen": -19.60066795349121, + "logits/rejected": -19.468841552734375, + "logps/chosen": -558.4423217773438, + "logps/rejected": -655.749267578125, + "loss": 0.3793, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9699652194976807, + "rewards/margins": 1.0505293607711792, + "rewards/rejected": -3.020494222640991, + "step": 339 + }, + { + "epoch": 0.6786427145708582, + "grad_norm": 14.369407606284954, + "learning_rate": 1.4197856279950437e-07, + "logits/chosen": -19.646114349365234, + "logits/rejected": -19.738296508789062, + "logps/chosen": -569.77392578125, + "logps/rejected": -649.2036743164062, + "loss": 0.4077, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.038227081298828, + "rewards/margins": 0.7805340886116028, + "rewards/rejected": -2.818761110305786, + "step": 340 + }, + { + "epoch": 0.6806387225548902, + "grad_norm": 14.850499236465488, + "learning_rate": 1.404072133027306e-07, + "logits/chosen": -21.245651245117188, + "logits/rejected": -20.8592529296875, + "logps/chosen": -528.7147216796875, + "logps/rejected": -652.779541015625, + "loss": 0.4052, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.072338342666626, + "rewards/margins": 1.2713418006896973, + "rewards/rejected": -3.343679904937744, + "step": 341 + }, + { + "epoch": 0.6826347305389222, + "grad_norm": 14.30640696385691, + "learning_rate": 1.388412052037682e-07, + "logits/chosen": -19.93825340270996, + "logits/rejected": -19.845121383666992, + "logps/chosen": -601.0985717773438, + "logps/rejected": -764.6405029296875, + "loss": 0.4366, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2762889862060547, + "rewards/margins": 1.2482759952545166, + "rewards/rejected": -3.524564743041992, + "step": 342 + }, + { + "epoch": 0.6846307385229541, + "grad_norm": 14.757223882174646, + "learning_rate": 1.3728061482764235e-07, + "logits/chosen": -20.238370895385742, + "logits/rejected": -19.90258026123047, + "logps/chosen": -501.98553466796875, + "logps/rejected": -651.3265380859375, + "loss": 0.4348, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6277523040771484, + "rewards/margins": 1.769914150238037, + "rewards/rejected": -3.3976664543151855, + "step": 343 + }, + { + "epoch": 0.6866267465069861, + "grad_norm": 13.524905585391163, + 
"learning_rate": 1.357255182353265e-07, + "logits/chosen": -19.395383834838867, + "logits/rejected": -19.172569274902344, + "logps/chosen": -535.5953369140625, + "logps/rejected": -589.7423706054688, + "loss": 0.3883, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7902706861495972, + "rewards/margins": 0.8703600764274597, + "rewards/rejected": -2.660630702972412, + "step": 344 + }, + { + "epoch": 0.688622754491018, + "grad_norm": 17.781027201144273, + "learning_rate": 1.341759912200346e-07, + "logits/chosen": -21.359405517578125, + "logits/rejected": -20.672643661499023, + "logps/chosen": -608.5810546875, + "logps/rejected": -678.4393310546875, + "loss": 0.495, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7323977947235107, + "rewards/margins": 0.7647709846496582, + "rewards/rejected": -3.497169017791748, + "step": 345 + }, + { + "epoch": 0.6906187624750499, + "grad_norm": 15.715650307532133, + "learning_rate": 1.3263210930352737e-07, + "logits/chosen": -19.987049102783203, + "logits/rejected": -19.922182083129883, + "logps/chosen": -554.219482421875, + "logps/rejected": -680.3238525390625, + "loss": 0.4275, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0270988941192627, + "rewards/margins": 1.0889631509780884, + "rewards/rejected": -3.1160621643066406, + "step": 346 + }, + { + "epoch": 0.6926147704590818, + "grad_norm": 15.43434324340175, + "learning_rate": 1.3109394773243115e-07, + "logits/chosen": -20.757238388061523, + "logits/rejected": -20.43233871459961, + "logps/chosen": -776.577880859375, + "logps/rejected": -881.4364624023438, + "loss": 0.4177, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7323343753814697, + "rewards/margins": 1.1349782943725586, + "rewards/rejected": -4.867312431335449, + "step": 347 + }, + { + "epoch": 0.6946107784431138, + "grad_norm": 15.464400218085176, + "learning_rate": 1.2956158147457114e-07, + "logits/chosen": -19.5211124420166, + "logits/rejected": -19.82351303100586, + "logps/chosen": -670.1201782226562, + "logps/rejected": -733.384765625, + "loss": 0.4515, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.685201644897461, + "rewards/margins": 0.9089731574058533, + "rewards/rejected": -3.594175100326538, + "step": 348 + }, + { + "epoch": 0.6966067864271457, + "grad_norm": 13.245301947491186, + "learning_rate": 1.2803508521531677e-07, + "logits/chosen": -20.308961868286133, + "logits/rejected": -20.309410095214844, + "logps/chosen": -480.03692626953125, + "logps/rejected": -580.734375, + "loss": 0.3761, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.045827865600586, + "rewards/margins": 1.016439437866211, + "rewards/rejected": -3.062267303466797, + "step": 349 + }, + { + "epoch": 0.6986027944111777, + "grad_norm": 15.01774170684481, + "learning_rate": 1.265145333539423e-07, + "logits/chosen": -19.670490264892578, + "logits/rejected": -20.109798431396484, + "logps/chosen": -478.83978271484375, + "logps/rejected": -550.7167358398438, + "loss": 0.4211, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8340487480163574, + "rewards/margins": 0.8229854106903076, + "rewards/rejected": -2.657033920288086, + "step": 350 + }, + { + "epoch": 0.7005988023952096, + "grad_norm": 15.019733347079256, + "learning_rate": 1.2500000000000005e-07, + "logits/chosen": -20.130739212036133, + "logits/rejected": -20.091732025146484, + "logps/chosen": -595.3517456054688, + "logps/rejected": -649.5227661132812, + "loss": 0.4201, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.877355933189392, + "rewards/margins": 
0.6590969562530518, + "rewards/rejected": -2.5364530086517334, + "step": 351 + }, + { + "epoch": 0.7025948103792415, + "grad_norm": 15.114776277100695, + "learning_rate": 1.234915589697091e-07, + "logits/chosen": -20.528398513793945, + "logits/rejected": -20.764583587646484, + "logps/chosen": -487.8934326171875, + "logps/rejected": -634.1742553710938, + "loss": 0.4114, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5121595859527588, + "rewards/margins": 1.2231650352478027, + "rewards/rejected": -2.7353246212005615, + "step": 352 + }, + { + "epoch": 0.7045908183632734, + "grad_norm": 13.6256765847296, + "learning_rate": 1.2198928378235715e-07, + "logits/chosen": -19.799575805664062, + "logits/rejected": -19.218843460083008, + "logps/chosen": -591.076904296875, + "logps/rejected": -690.492919921875, + "loss": 0.4153, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.067112684249878, + "rewards/margins": 0.8536612391471863, + "rewards/rejected": -2.920773983001709, + "step": 353 + }, + { + "epoch": 0.7065868263473054, + "grad_norm": 15.080531725450582, + "learning_rate": 1.2049324765671747e-07, + "logits/chosen": -19.41089630126953, + "logits/rejected": -19.16267967224121, + "logps/chosen": -424.4087829589844, + "logps/rejected": -470.61358642578125, + "loss": 0.4255, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4005568027496338, + "rewards/margins": 0.6558549404144287, + "rewards/rejected": -2.0564117431640625, + "step": 354 + }, + { + "epoch": 0.7085828343313373, + "grad_norm": 15.234160764908268, + "learning_rate": 1.1900352350748024e-07, + "logits/chosen": -20.29700469970703, + "logits/rejected": -19.255632400512695, + "logps/chosen": -559.0546264648438, + "logps/rejected": -733.3956298828125, + "loss": 0.4042, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0236408710479736, + "rewards/margins": 1.7938932180404663, + "rewards/rejected": -3.8175342082977295, + "step": 355 + }, + { + "epoch": 0.7105788423153693, + "grad_norm": 13.932645173640877, + "learning_rate": 1.175201839416988e-07, + "logits/chosen": -19.759885787963867, + "logits/rejected": -19.602773666381836, + "logps/chosen": -538.2057495117188, + "logps/rejected": -627.2590942382812, + "loss": 0.4119, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7387853860855103, + "rewards/margins": 1.22702956199646, + "rewards/rejected": -2.9658150672912598, + "step": 356 + }, + { + "epoch": 0.7125748502994012, + "grad_norm": 13.442801821898552, + "learning_rate": 1.1604330125525078e-07, + "logits/chosen": -21.212749481201172, + "logits/rejected": -20.86857795715332, + "logps/chosen": -528.8177490234375, + "logps/rejected": -667.633056640625, + "loss": 0.3941, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0190160274505615, + "rewards/margins": 1.2670211791992188, + "rewards/rejected": -3.286036968231201, + "step": 357 + }, + { + "epoch": 0.7145708582834331, + "grad_norm": 14.006526788598732, + "learning_rate": 1.1457294742931506e-07, + "logits/chosen": -18.911666870117188, + "logits/rejected": -18.453922271728516, + "logps/chosen": -758.29443359375, + "logps/rejected": -801.0388793945312, + "loss": 0.428, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5985617637634277, + "rewards/margins": 0.4230879545211792, + "rewards/rejected": -3.0216493606567383, + "step": 358 + }, + { + "epoch": 0.716566866267465, + "grad_norm": 12.614257733242685, + "learning_rate": 1.1310919412686245e-07, + "logits/chosen": -20.433425903320312, + "logits/rejected": -20.836620330810547, + "logps/chosen": 
-476.3095397949219, + "logps/rejected": -570.728515625, + "loss": 0.4237, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7495883703231812, + "rewards/margins": 1.0333952903747559, + "rewards/rejected": -2.7829835414886475, + "step": 359 + }, + { + "epoch": 0.718562874251497, + "grad_norm": 12.909204559523792, + "learning_rate": 1.11652112689164e-07, + "logits/chosen": -20.962121963500977, + "logits/rejected": -20.562122344970703, + "logps/chosen": -440.939453125, + "logps/rejected": -604.5936889648438, + "loss": 0.4066, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9059672355651855, + "rewards/margins": 1.3158305883407593, + "rewards/rejected": -3.2217981815338135, + "step": 360 + }, + { + "epoch": 0.720558882235529, + "grad_norm": 15.08942520449824, + "learning_rate": 1.1020177413231332e-07, + "logits/chosen": -18.539854049682617, + "logits/rejected": -18.9150447845459, + "logps/chosen": -570.9627685546875, + "logps/rejected": -645.0427856445312, + "loss": 0.4283, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8263628482818604, + "rewards/margins": 0.8770366907119751, + "rewards/rejected": -2.703399658203125, + "step": 361 + }, + { + "epoch": 0.7225548902195609, + "grad_norm": 14.430827963578226, + "learning_rate": 1.0875824914376553e-07, + "logits/chosen": -19.89458465576172, + "logits/rejected": -19.718782424926758, + "logps/chosen": -523.622802734375, + "logps/rejected": -695.9458618164062, + "loss": 0.3941, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9959505796432495, + "rewards/margins": 1.525893211364746, + "rewards/rejected": -3.521843910217285, + "step": 362 + }, + { + "epoch": 0.7245508982035929, + "grad_norm": 16.868494488083627, + "learning_rate": 1.073216080788921e-07, + "logits/chosen": -19.86290168762207, + "logits/rejected": -20.225648880004883, + "logps/chosen": -545.2635498046875, + "logps/rejected": -563.8896484375, + "loss": 0.4354, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3746070861816406, + "rewards/margins": 0.27832096815109253, + "rewards/rejected": -2.652928352355957, + "step": 363 + }, + { + "epoch": 0.7265469061876247, + "grad_norm": 13.034702828139887, + "learning_rate": 1.058919209575517e-07, + "logits/chosen": -19.361488342285156, + "logits/rejected": -19.3696346282959, + "logps/chosen": -430.3180847167969, + "logps/rejected": -602.2634887695312, + "loss": 0.3693, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5194342136383057, + "rewards/margins": 1.5140842199325562, + "rewards/rejected": -3.0335183143615723, + "step": 364 + }, + { + "epoch": 0.7285429141716567, + "grad_norm": 15.45308792548943, + "learning_rate": 1.0446925746067766e-07, + "logits/chosen": -19.043651580810547, + "logits/rejected": -17.862253189086914, + "logps/chosen": -552.1558227539062, + "logps/rejected": -660.024658203125, + "loss": 0.448, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.276698112487793, + "rewards/margins": 0.8926166892051697, + "rewards/rejected": -3.1693146228790283, + "step": 365 + }, + { + "epoch": 0.7305389221556886, + "grad_norm": 13.439945637326407, + "learning_rate": 1.0305368692688174e-07, + "logits/chosen": -19.70649528503418, + "logits/rejected": -20.203184127807617, + "logps/chosen": -549.0379638671875, + "logps/rejected": -643.2531127929688, + "loss": 0.4017, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.016526699066162, + "rewards/margins": 0.9828741550445557, + "rewards/rejected": -2.999401092529297, + "step": 366 + }, + { + "epoch": 0.7325349301397206, + "grad_norm": 16.270374860886132, + 
"learning_rate": 1.0164527834907466e-07, + "logits/chosen": -20.50018310546875, + "logits/rejected": -20.18191146850586, + "logps/chosen": -502.46173095703125, + "logps/rejected": -628.9412841796875, + "loss": 0.4483, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9385945796966553, + "rewards/margins": 1.0971810817718506, + "rewards/rejected": -3.0357754230499268, + "step": 367 + }, + { + "epoch": 0.7345309381237525, + "grad_norm": 17.691856671277147, + "learning_rate": 1.0024410037110356e-07, + "logits/chosen": -21.22451400756836, + "logits/rejected": -21.156221389770508, + "logps/chosen": -495.2363586425781, + "logps/rejected": -613.3316650390625, + "loss": 0.3957, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.790381669998169, + "rewards/margins": 1.1306113004684448, + "rewards/rejected": -2.920992851257324, + "step": 368 + }, + { + "epoch": 0.7365269461077845, + "grad_norm": 17.71870669147819, + "learning_rate": 9.885022128440629e-08, + "logits/chosen": -20.220726013183594, + "logits/rejected": -20.3509578704834, + "logps/chosen": -580.0787353515625, + "logps/rejected": -699.8051147460938, + "loss": 0.3823, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1798484325408936, + "rewards/margins": 1.233259677886963, + "rewards/rejected": -3.4131078720092773, + "step": 369 + }, + { + "epoch": 0.7385229540918163, + "grad_norm": 13.967093829846899, + "learning_rate": 9.746370902468309e-08, + "logits/chosen": -19.74071502685547, + "logits/rejected": -19.97284507751465, + "logps/chosen": -559.4572143554688, + "logps/rejected": -725.63037109375, + "loss": 0.3773, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8109045028686523, + "rewards/margins": 1.5691444873809814, + "rewards/rejected": -3.380049228668213, + "step": 370 + }, + { + "epoch": 0.7405189620758483, + "grad_norm": 15.427869059063354, + "learning_rate": 9.608463116858542e-08, + "logits/chosen": -20.12067413330078, + "logits/rejected": -20.155805587768555, + "logps/chosen": -467.1358642578125, + "logps/rejected": -623.1273193359375, + "loss": 0.4187, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.784905195236206, + "rewards/margins": 1.2622867822647095, + "rewards/rejected": -3.047191858291626, + "step": 371 + }, + { + "epoch": 0.7425149700598802, + "grad_norm": 14.788752344708447, + "learning_rate": 9.471305493042242e-08, + "logits/chosen": -19.2255859375, + "logits/rejected": -20.1333065032959, + "logps/chosen": -635.54736328125, + "logps/rejected": -853.2119140625, + "loss": 0.4114, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3935046195983887, + "rewards/margins": 1.8072738647460938, + "rewards/rejected": -4.200778961181641, + "step": 372 + }, + { + "epoch": 0.7445109780439122, + "grad_norm": 17.495642850120454, + "learning_rate": 9.334904715888494e-08, + "logits/chosen": -20.933143615722656, + "logits/rejected": -20.832096099853516, + "logps/chosen": -411.86676025390625, + "logps/rejected": -503.8995666503906, + "loss": 0.4113, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6132818460464478, + "rewards/margins": 1.026564121246338, + "rewards/rejected": -2.639845848083496, + "step": 373 + }, + { + "epoch": 0.7465069860279441, + "grad_norm": 16.02805046832174, + "learning_rate": 9.199267433378727e-08, + "logits/chosen": -20.639259338378906, + "logits/rejected": -20.546987533569336, + "logps/chosen": -523.702880859375, + "logps/rejected": -672.5869750976562, + "loss": 0.4, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8315907716751099, + "rewards/margins": 1.46940016746521, + 
"rewards/rejected": -3.3009910583496094, + "step": 374 + }, + { + "epoch": 0.7485029940119761, + "grad_norm": 15.457442027742408, + "learning_rate": 9.064400256282755e-08, + "logits/chosen": -20.409318923950195, + "logits/rejected": -19.4872989654541, + "logps/chosen": -578.9743041992188, + "logps/rejected": -649.5741577148438, + "loss": 0.389, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9740164279937744, + "rewards/margins": 0.8397660255432129, + "rewards/rejected": -2.8137826919555664, + "step": 375 + }, + { + "epoch": 0.7504990019960079, + "grad_norm": 13.600344719262973, + "learning_rate": 8.930309757836516e-08, + "logits/chosen": -21.005847930908203, + "logits/rejected": -20.700740814208984, + "logps/chosen": -519.6953735351562, + "logps/rejected": -564.7655639648438, + "loss": 0.4088, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.914379358291626, + "rewards/margins": 0.7501688599586487, + "rewards/rejected": -2.66454815864563, + "step": 376 + }, + { + "epoch": 0.7524950099800399, + "grad_norm": 16.875851886052402, + "learning_rate": 8.797002473421727e-08, + "logits/chosen": -20.600692749023438, + "logits/rejected": -20.884218215942383, + "logps/chosen": -666.4806518554688, + "logps/rejected": -765.6790771484375, + "loss": 0.4457, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1328279972076416, + "rewards/margins": 0.959656834602356, + "rewards/rejected": -4.092484951019287, + "step": 377 + }, + { + "epoch": 0.7544910179640718, + "grad_norm": 17.47565380119221, + "learning_rate": 8.664484900247363e-08, + "logits/chosen": -21.40182876586914, + "logits/rejected": -21.741256713867188, + "logps/chosen": -405.0975341796875, + "logps/rejected": -557.7005615234375, + "loss": 0.5039, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.402137041091919, + "rewards/margins": 1.5041407346725464, + "rewards/rejected": -2.906277894973755, + "step": 378 + }, + { + "epoch": 0.7564870259481038, + "grad_norm": 16.636183378171886, + "learning_rate": 8.532763497032986e-08, + "logits/chosen": -22.009544372558594, + "logits/rejected": -22.248573303222656, + "logps/chosen": -501.2933654785156, + "logps/rejected": -655.1139526367188, + "loss": 0.474, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6780036687850952, + "rewards/margins": 1.678252100944519, + "rewards/rejected": -3.3562557697296143, + "step": 379 + }, + { + "epoch": 0.7584830339321357, + "grad_norm": 14.487354680287742, + "learning_rate": 8.401844683693959e-08, + "logits/chosen": -20.488567352294922, + "logits/rejected": -20.180971145629883, + "logps/chosen": -434.21392822265625, + "logps/rejected": -620.0761108398438, + "loss": 0.4048, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5215814113616943, + "rewards/margins": 1.4483155012130737, + "rewards/rejected": -2.9698970317840576, + "step": 380 + }, + { + "epoch": 0.7604790419161677, + "grad_norm": 16.309393285411627, + "learning_rate": 8.271734841028552e-08, + "logits/chosen": -21.052242279052734, + "logits/rejected": -20.410202026367188, + "logps/chosen": -530.8050537109375, + "logps/rejected": -799.7926635742188, + "loss": 0.4111, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0542612075805664, + "rewards/margins": 2.227598190307617, + "rewards/rejected": -4.281859874725342, + "step": 381 + }, + { + "epoch": 0.7624750499001997, + "grad_norm": 15.18164269630782, + "learning_rate": 8.142440310406923e-08, + "logits/chosen": -20.607084274291992, + "logits/rejected": -20.46668815612793, + "logps/chosen": -568.8605346679688, + "logps/rejected": 
-732.0130615234375, + "loss": 0.3876, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2562711238861084, + "rewards/margins": 1.3708157539367676, + "rewards/rejected": -3.627086877822876, + "step": 382 + }, + { + "epoch": 0.7644710578842315, + "grad_norm": 15.843870118454515, + "learning_rate": 8.013967393462093e-08, + "logits/chosen": -19.776172637939453, + "logits/rejected": -20.000160217285156, + "logps/chosen": -602.1343383789062, + "logps/rejected": -681.888671875, + "loss": 0.4666, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8674306869506836, + "rewards/margins": 0.9732953906059265, + "rewards/rejected": -2.840725898742676, + "step": 383 + }, + { + "epoch": 0.7664670658682635, + "grad_norm": 16.409007623679, + "learning_rate": 7.886322351782782e-08, + "logits/chosen": -19.91303825378418, + "logits/rejected": -19.690052032470703, + "logps/chosen": -669.9967041015625, + "logps/rejected": -714.0025634765625, + "loss": 0.3973, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5477182865142822, + "rewards/margins": 0.8782477378845215, + "rewards/rejected": -3.4259660243988037, + "step": 384 + }, + { + "epoch": 0.7684630738522954, + "grad_norm": 16.487955137219046, + "learning_rate": 7.759511406608255e-08, + "logits/chosen": -20.416107177734375, + "logits/rejected": -20.148622512817383, + "logps/chosen": -500.3988952636719, + "logps/rejected": -588.0291748046875, + "loss": 0.4116, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.080512762069702, + "rewards/margins": 0.6933166980743408, + "rewards/rejected": -2.773829460144043, + "step": 385 + }, + { + "epoch": 0.7704590818363274, + "grad_norm": 12.101869383533744, + "learning_rate": 7.633540738525066e-08, + "logits/chosen": -20.018726348876953, + "logits/rejected": -20.265872955322266, + "logps/chosen": -630.1484375, + "logps/rejected": -860.3380126953125, + "loss": 0.3322, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.424983501434326, + "rewards/margins": 1.9725841283798218, + "rewards/rejected": -4.3975677490234375, + "step": 386 + }, + { + "epoch": 0.7724550898203593, + "grad_norm": 15.343874076897551, + "learning_rate": 7.508416487165862e-08, + "logits/chosen": -19.619956970214844, + "logits/rejected": -19.398181915283203, + "logps/chosen": -536.6625366210938, + "logps/rejected": -645.3226318359375, + "loss": 0.4061, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8896986246109009, + "rewards/margins": 1.0346204042434692, + "rewards/rejected": -2.92431902885437, + "step": 387 + }, + { + "epoch": 0.7744510978043913, + "grad_norm": 18.888211698764017, + "learning_rate": 7.384144750910132e-08, + "logits/chosen": -20.33465003967285, + "logits/rejected": -20.288190841674805, + "logps/chosen": -664.9688720703125, + "logps/rejected": -774.9567260742188, + "loss": 0.4651, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5558035373687744, + "rewards/margins": 1.0801535844802856, + "rewards/rejected": -3.6359572410583496, + "step": 388 + }, + { + "epoch": 0.7764471057884231, + "grad_norm": 15.472053370205279, + "learning_rate": 7.260731586586982e-08, + "logits/chosen": -21.066429138183594, + "logits/rejected": -21.032623291015625, + "logps/chosen": -608.3118286132812, + "logps/rejected": -721.432861328125, + "loss": 0.4025, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.139827251434326, + "rewards/margins": 1.1138910055160522, + "rewards/rejected": -3.2537178993225098, + "step": 389 + }, + { + "epoch": 0.7784431137724551, + "grad_norm": 16.192414128371517, + "learning_rate": 
7.138183009179921e-08, + "logits/chosen": -19.891376495361328, + "logits/rejected": -19.18803596496582, + "logps/chosen": -497.37030029296875, + "logps/rejected": -598.454345703125, + "loss": 0.4694, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.445165753364563, + "rewards/margins": 1.31147038936615, + "rewards/rejected": -2.756636142730713, + "step": 390 + }, + { + "epoch": 0.780439121756487, + "grad_norm": 12.312288434176681, + "learning_rate": 7.016504991533726e-08, + "logits/chosen": -20.122936248779297, + "logits/rejected": -19.829761505126953, + "logps/chosen": -536.530517578125, + "logps/rejected": -579.5504760742188, + "loss": 0.417, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.375357151031494, + "rewards/margins": 0.6135118007659912, + "rewards/rejected": -2.9888687133789062, + "step": 391 + }, + { + "epoch": 0.782435129740519, + "grad_norm": 14.868394634270913, + "learning_rate": 6.895703464063318e-08, + "logits/chosen": -20.536144256591797, + "logits/rejected": -20.883195877075195, + "logps/chosen": -415.7984619140625, + "logps/rejected": -509.4052734375, + "loss": 0.4212, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4413790702819824, + "rewards/margins": 0.9446898102760315, + "rewards/rejected": -2.386068820953369, + "step": 392 + }, + { + "epoch": 0.7844311377245509, + "grad_norm": 12.507727828796943, + "learning_rate": 6.775784314464716e-08, + "logits/chosen": -19.6154727935791, + "logits/rejected": -20.296825408935547, + "logps/chosen": -599.3359375, + "logps/rejected": -793.7149658203125, + "loss": 0.3785, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3708200454711914, + "rewards/margins": 1.5517452955245972, + "rewards/rejected": -3.922565460205078, + "step": 393 + }, + { + "epoch": 0.7864271457085829, + "grad_norm": 16.10552971520174, + "learning_rate": 6.656753387428088e-08, + "logits/chosen": -20.372236251831055, + "logits/rejected": -20.05748748779297, + "logps/chosen": -546.3995361328125, + "logps/rejected": -587.6973266601562, + "loss": 0.4116, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1204771995544434, + "rewards/margins": 0.6429656744003296, + "rewards/rejected": -2.7634429931640625, + "step": 394 + }, + { + "epoch": 0.7884231536926147, + "grad_norm": 13.213009124377368, + "learning_rate": 6.538616484352902e-08, + "logits/chosen": -20.40473747253418, + "logits/rejected": -20.52154541015625, + "logps/chosen": -517.912841796875, + "logps/rejected": -728.2640991210938, + "loss": 0.3337, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.709905743598938, + "rewards/margins": 1.5683302879333496, + "rewards/rejected": -3.278235912322998, + "step": 395 + }, + { + "epoch": 0.7904191616766467, + "grad_norm": 15.982324852281675, + "learning_rate": 6.42137936306514e-08, + "logits/chosen": -20.089094161987305, + "logits/rejected": -19.609643936157227, + "logps/chosen": -516.9925537109375, + "logps/rejected": -605.076171875, + "loss": 0.3902, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7272955179214478, + "rewards/margins": 0.9996792078018188, + "rewards/rejected": -2.7269749641418457, + "step": 396 + }, + { + "epoch": 0.7924151696606786, + "grad_norm": 14.080014810725027, + "learning_rate": 6.305047737536707e-08, + "logits/chosen": -19.353548049926758, + "logits/rejected": -19.089752197265625, + "logps/chosen": -640.895751953125, + "logps/rejected": -749.78466796875, + "loss": 0.3926, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3223495483398438, + "rewards/margins": 1.1758687496185303, + 
"rewards/rejected": -3.498218536376953, + "step": 397 + }, + { + "epoch": 0.7944111776447106, + "grad_norm": 15.67403057877649, + "learning_rate": 6.189627277606893e-08, + "logits/chosen": -19.294418334960938, + "logits/rejected": -20.157922744750977, + "logps/chosen": -514.27490234375, + "logps/rejected": -710.0952758789062, + "loss": 0.4427, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.871453046798706, + "rewards/margins": 1.5966830253601074, + "rewards/rejected": -3.4681363105773926, + "step": 398 + }, + { + "epoch": 0.7964071856287425, + "grad_norm": 17.52367685410827, + "learning_rate": 6.075123608706093e-08, + "logits/chosen": -20.73387908935547, + "logits/rejected": -20.402624130249023, + "logps/chosen": -612.9823608398438, + "logps/rejected": -689.6102294921875, + "loss": 0.4862, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1650824546813965, + "rewards/margins": 0.8385013937950134, + "rewards/rejected": -3.0035839080810547, + "step": 399 + }, + { + "epoch": 0.7984031936127745, + "grad_norm": 14.898718189895368, + "learning_rate": 5.961542311581585e-08, + "logits/chosen": -20.519458770751953, + "logits/rejected": -20.2457332611084, + "logps/chosen": -501.14654541015625, + "logps/rejected": -638.9808959960938, + "loss": 0.3868, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.750710129737854, + "rewards/margins": 1.4607923030853271, + "rewards/rejected": -3.2115025520324707, + "step": 400 + }, + { + "epoch": 0.8003992015968064, + "grad_norm": 14.068518132884332, + "learning_rate": 5.848888922025552e-08, + "logits/chosen": -19.87328338623047, + "logits/rejected": -19.895517349243164, + "logps/chosen": -479.4627685546875, + "logps/rejected": -598.6085815429688, + "loss": 0.3979, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.564753770828247, + "rewards/margins": 1.1341055631637573, + "rewards/rejected": -2.698859453201294, + "step": 401 + }, + { + "epoch": 0.8023952095808383, + "grad_norm": 15.767966201578563, + "learning_rate": 5.737168930605271e-08, + "logits/chosen": -21.595972061157227, + "logits/rejected": -21.527589797973633, + "logps/chosen": -449.24468994140625, + "logps/rejected": -512.97998046875, + "loss": 0.452, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6318031549453735, + "rewards/margins": 0.8296475410461426, + "rewards/rejected": -2.4614505767822266, + "step": 402 + }, + { + "epoch": 0.8043912175648703, + "grad_norm": 30.339527885915878, + "learning_rate": 5.6263877823955115e-08, + "logits/chosen": -21.442642211914062, + "logits/rejected": -20.76935386657715, + "logps/chosen": -395.5703125, + "logps/rejected": -497.1997375488281, + "loss": 0.4302, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4801020622253418, + "rewards/margins": 0.9958056211471558, + "rewards/rejected": -2.475907802581787, + "step": 403 + }, + { + "epoch": 0.8063872255489022, + "grad_norm": 16.001479137202306, + "learning_rate": 5.516550876713141e-08, + "logits/chosen": -19.882049560546875, + "logits/rejected": -19.06197738647461, + "logps/chosen": -391.6086120605469, + "logps/rejected": -492.8154296875, + "loss": 0.4042, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2724910974502563, + "rewards/margins": 1.1560478210449219, + "rewards/rejected": -2.4285390377044678, + "step": 404 + } + ], + "logging_steps": 1, + "max_steps": 501, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 101, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, 
+ "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}