{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982003599280144, "eval_steps": 500, "global_step": 416, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.1904761904761906e-07, "logits/chosen": -0.2390434443950653, "logits/rejected": -0.2346378117799759, "logps/chosen": -439.6516418457031, "logps/rejected": -369.9039611816406, "loss": 0.3771, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.1904761904761906e-06, "logits/chosen": -0.2123832106590271, "logits/rejected": -0.09928727895021439, "logps/chosen": -406.4107971191406, "logps/rejected": -410.1495361328125, "loss": 0.3772, "rewards/accuracies": 0.375, "rewards/chosen": 0.000164651675731875, "rewards/margins": -4.650545815820806e-05, "rewards/rejected": 0.0002111571084242314, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.380952380952381e-06, "logits/chosen": -0.1769140362739563, "logits/rejected": -0.06540749222040176, "logps/chosen": -407.98834228515625, "logps/rejected": -404.8681335449219, "loss": 0.3724, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0006718490039929748, "rewards/margins": 0.0006276529747992754, "rewards/rejected": -0.0012995019787922502, "step": 20 }, { "epoch": 0.07, "learning_rate": 3.5714285714285718e-06, "logits/chosen": -0.18587307631969452, "logits/rejected": -0.07609504461288452, "logps/chosen": -422.6565856933594, "logps/rejected": -426.0921325683594, "loss": 0.371, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005426889285445213, "rewards/margins": 0.0015309697482734919, "rewards/rejected": -0.0069578588008880615, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.761904761904762e-06, "logits/chosen": -0.2079353779554367, "logits/rejected": -0.11336110532283783, "logps/chosen": -440.435791015625, "logps/rejected": -448.8382263183594, "loss": 0.3568, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.01811929978430271, "rewards/margins": 0.014013724401593208, "rewards/rejected": -0.03213302046060562, "step": 40 }, { "epoch": 0.12, "learning_rate": 4.994357350311441e-06, "logits/chosen": -0.16615493595600128, "logits/rejected": -0.17620348930358887, "logps/chosen": -484.8639221191406, "logps/rejected": -528.6168823242188, "loss": 0.3408, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06840778887271881, "rewards/margins": 0.045049719512462616, "rewards/rejected": -0.11345750093460083, "step": 50 }, { "epoch": 0.14, "learning_rate": 4.97147773390341e-06, "logits/chosen": -0.1688239425420761, "logits/rejected": -0.1010187491774559, "logps/chosen": -615.0237426757812, "logps/rejected": -678.5064697265625, "loss": 0.3256, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.161706805229187, "rewards/margins": 0.09166954457759857, "rewards/rejected": -0.2533763647079468, "step": 60 }, { "epoch": 0.17, "learning_rate": 4.931169703639282e-06, "logits/chosen": -0.12329418957233429, "logits/rejected": -0.12814375758171082, "logps/chosen": -588.9280395507812, "logps/rejected": -631.6210327148438, "loss": 0.338, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.18931646645069122, "rewards/margins": 0.07589760422706604, "rewards/rejected": -0.26521408557891846, "step": 70 }, { "epoch": 0.19, "learning_rate": 4.873717504456219e-06, "logits/chosen": -0.1520806849002838, "logits/rejected": -0.09646686166524887, "logps/chosen": -656.6085205078125, "logps/rejected": -690.5023193359375, "loss": 0.3243, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.19273978471755981, "rewards/margins": 0.08054682612419128, "rewards/rejected": -0.2732866108417511, "step": 80 }, { "epoch": 0.22, "learning_rate": 4.7995262788689865e-06, "logits/chosen": -0.1520826667547226, "logits/rejected": -0.10331418365240097, "logps/chosen": -614.34814453125, "logps/rejected": -715.5487060546875, "loss": 0.3209, "rewards/accuracies": 0.625, "rewards/chosen": -0.19910724461078644, "rewards/margins": 0.09421838819980621, "rewards/rejected": -0.29332563281059265, "step": 90 }, { "epoch": 0.24, "learning_rate": 4.709119209978242e-06, "logits/chosen": -0.17085638642311096, "logits/rejected": -0.10925278812646866, "logps/chosen": -590.4300537109375, "logps/rejected": -633.34814453125, "loss": 0.3405, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2056376188993454, "rewards/margins": 0.06741281598806381, "rewards/rejected": -0.2730504274368286, "step": 100 }, { "epoch": 0.26, "learning_rate": 4.603133832077953e-06, "logits/chosen": -0.08658218383789062, "logits/rejected": -0.05598406121134758, "logps/chosen": -594.2792358398438, "logps/rejected": -634.0743408203125, "loss": 0.3125, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.17107120156288147, "rewards/margins": 0.07311462610960007, "rewards/rejected": -0.24418580532073975, "step": 110 }, { "epoch": 0.29, "learning_rate": 4.482317534878901e-06, "logits/chosen": -0.1406688690185547, "logits/rejected": -0.03664925694465637, "logps/chosen": -586.7513427734375, "logps/rejected": -677.1576538085938, "loss": 0.3116, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.16716982424259186, "rewards/margins": 0.09580481797456741, "rewards/rejected": -0.26297464966773987, "step": 120 }, { "epoch": 0.31, "learning_rate": 4.3475222930516484e-06, "logits/chosen": -0.15043699741363525, "logits/rejected": -0.040714383125305176, "logps/chosen": -669.5628051757812, "logps/rejected": -731.3157348632812, "loss": 0.3246, "rewards/accuracies": 0.625, "rewards/chosen": -0.21227452158927917, "rewards/margins": 0.08657745271921158, "rewards/rejected": -0.29885196685791016, "step": 130 }, { "epoch": 0.34, "learning_rate": 4.199698658255298e-06, "logits/chosen": -0.12090220302343369, "logits/rejected": -0.06969845294952393, "logps/chosen": -652.12939453125, "logps/rejected": -624.6866455078125, "loss": 0.313, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.20643600821495056, "rewards/margins": 0.043679412454366684, "rewards/rejected": -0.25011545419692993, "step": 140 }, { "epoch": 0.36, "learning_rate": 4.039889056019159e-06, "logits/chosen": -0.12041902542114258, "logits/rejected": -0.11129270493984222, "logps/chosen": -620.213623046875, "logps/rejected": -721.9036865234375, "loss": 0.3013, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.22644488513469696, "rewards/margins": 0.11097334325313568, "rewards/rejected": -0.33741819858551025, "step": 150 }, { "epoch": 0.38, "learning_rate": 3.869220434746509e-06, "logits/chosen": -0.14005795121192932, "logits/rejected": -0.01383652538061142, "logps/chosen": -662.5046997070312, "logps/rejected": -747.22216796875, "loss": 0.3004, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.25904232263565063, "rewards/margins": 0.11477211862802505, "rewards/rejected": -0.37381449341773987, "step": 160 }, { "epoch": 0.41, "learning_rate": 3.688896318678322e-06, "logits/chosen": -0.07857229560613632, "logits/rejected": -0.03764970228075981, "logps/chosen": -703.3973388671875, "logps/rejected": -799.6316528320312, "loss": 0.3087, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2659061551094055, "rewards/margins": 0.12018003314733505, "rewards/rejected": -0.3860861659049988, "step": 170 }, { "epoch": 0.43, "learning_rate": 3.5001883208580668e-06, "logits/chosen": -0.05302947014570236, "logits/rejected": 0.007934654131531715, "logps/chosen": -582.2310791015625, "logps/rejected": -629.2650146484375, "loss": 0.3013, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.20808251202106476, "rewards/margins": 0.07596954703330994, "rewards/rejected": -0.2840520739555359, "step": 180 }, { "epoch": 0.46, "learning_rate": 3.30442717594657e-06, "logits/chosen": -0.05860992521047592, "logits/rejected": -0.06699595600366592, "logps/chosen": -618.3209228515625, "logps/rejected": -625.3072509765625, "loss": 0.3132, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.22216761112213135, "rewards/margins": 0.06572258472442627, "rewards/rejected": -0.2878901958465576, "step": 190 }, { "epoch": 0.48, "learning_rate": 3.102993356121938e-06, "logits/chosen": -0.09036664664745331, "logits/rejected": -0.01472189836204052, "logps/chosen": -657.5107421875, "logps/rejected": -765.4071044921875, "loss": 0.3038, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.23338063061237335, "rewards/margins": 0.11495205014944077, "rewards/rejected": -0.3483327031135559, "step": 200 }, { "epoch": 0.5, "learning_rate": 2.8973073362395e-06, "logits/chosen": -0.09646060317754745, "logits/rejected": -0.004971938673406839, "logps/chosen": -603.1878662109375, "logps/rejected": -680.6282348632812, "loss": 0.2907, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.19503220915794373, "rewards/margins": 0.09983725845813751, "rewards/rejected": -0.29486945271492004, "step": 210 }, { "epoch": 0.53, "learning_rate": 2.6888195769001147e-06, "logits/chosen": -0.01057637669146061, "logits/rejected": 0.06593258678913116, "logps/chosen": -650.5782470703125, "logps/rejected": -674.4195556640625, "loss": 0.2947, "rewards/accuracies": 0.625, "rewards/chosen": -0.2231414020061493, "rewards/margins": 0.0735437422990799, "rewards/rejected": -0.2966851592063904, "step": 220 }, { "epoch": 0.55, "learning_rate": 2.479000296064417e-06, "logits/chosen": -0.028896484524011612, "logits/rejected": -0.036067038774490356, "logps/chosen": -658.7711791992188, "logps/rejected": -789.0046997070312, "loss": 0.283, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.23964250087738037, "rewards/margins": 0.11741713434457779, "rewards/rejected": -0.35705965757369995, "step": 230 }, { "epoch": 0.58, "learning_rate": 2.269329101341745e-06, "logits/chosen": -0.06356461346149445, "logits/rejected": -0.026502350345253944, "logps/chosen": -637.4093017578125, "logps/rejected": -754.2940673828125, "loss": 0.2816, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.22695472836494446, "rewards/margins": 0.11971505731344223, "rewards/rejected": -0.3466697931289673, "step": 240 }, { "epoch": 0.6, "learning_rate": 2.06128455606496e-06, "logits/chosen": -0.08697017282247543, "logits/rejected": -0.03490353375673294, "logps/chosen": -656.5325317382812, "logps/rejected": -807.2811279296875, "loss": 0.2639, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2592005729675293, "rewards/margins": 0.15677297115325928, "rewards/rejected": -0.4159735143184662, "step": 250 }, { "epoch": 0.62, "learning_rate": 1.856333752729311e-06, "logits/chosen": -0.03594154864549637, "logits/rejected": -0.01673516258597374, "logps/chosen": -696.881591796875, "logps/rejected": -812.76611328125, "loss": 0.2847, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2547699213027954, "rewards/margins": 0.1416255235671997, "rewards/rejected": -0.3963954448699951, "step": 260 }, { "epoch": 0.65, "learning_rate": 1.6559219673215784e-06, "logits/chosen": -0.04890661686658859, "logits/rejected": 0.02824893593788147, "logps/chosen": -692.2225341796875, "logps/rejected": -814.9953002929688, "loss": 0.2867, "rewards/accuracies": 0.71875, "rewards/chosen": -0.24843928217887878, "rewards/margins": 0.14480216801166534, "rewards/rejected": -0.39324143528938293, "step": 270 }, { "epoch": 0.67, "learning_rate": 1.4614624674952843e-06, "logits/chosen": -0.05741821601986885, "logits/rejected": 0.005704033188521862, "logps/chosen": -662.9461669921875, "logps/rejected": -776.4640502929688, "loss": 0.2917, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2589377760887146, "rewards/margins": 0.12795338034629822, "rewards/rejected": -0.38689112663269043, "step": 280 }, { "epoch": 0.7, "learning_rate": 1.2743265464628787e-06, "logits/chosen": -0.09478811919689178, "logits/rejected": -0.07217035442590714, "logps/chosen": -645.3310546875, "logps/rejected": -791.1564331054688, "loss": 0.3032, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2641456425189972, "rewards/margins": 0.1409159153699875, "rewards/rejected": -0.4050615727901459, "step": 290 }, { "epoch": 0.72, "learning_rate": 1.0958338528840893e-06, "logits/chosen": -0.09124918282032013, "logits/rejected": -0.03150275722146034, "logps/chosen": -711.1171875, "logps/rejected": -811.9651489257812, "loss": 0.2986, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2826155722141266, "rewards/margins": 0.11060988903045654, "rewards/rejected": -0.39322543144226074, "step": 300 }, { "epoch": 0.74, "learning_rate": 9.272430849423175e-07, "logits/chosen": -0.09305734932422638, "logits/rejected": -0.018754666671156883, "logps/chosen": -691.7221069335938, "logps/rejected": -850.1953125, "loss": 0.2714, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2833631932735443, "rewards/margins": 0.1422998309135437, "rewards/rejected": -0.4256630539894104, "step": 310 }, { "epoch": 0.77, "learning_rate": 7.697431142327633e-07, "logits/chosen": -0.08169344067573547, "logits/rejected": -0.03381634131073952, "logps/chosen": -713.6185302734375, "logps/rejected": -803.0232543945312, "loss": 0.3013, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.29118460416793823, "rewards/margins": 0.13159199059009552, "rewards/rejected": -0.42277655005455017, "step": 320 }, { "epoch": 0.79, "learning_rate": 6.244446020550182e-07, "logits/chosen": -0.08793069422245026, "logits/rejected": -0.0125564681366086, "logps/chosen": -731.2192993164062, "logps/rejected": -855.193359375, "loss": 0.2825, "rewards/accuracies": 0.75, "rewards/chosen": -0.27191871404647827, "rewards/margins": 0.169790118932724, "rewards/rejected": -0.4417088031768799, "step": 330 }, { "epoch": 0.82, "learning_rate": 4.923721672305148e-07, "logits/chosen": -0.044528841972351074, "logits/rejected": -0.047515399754047394, "logps/chosen": -716.775390625, "logps/rejected": -822.0406494140625, "loss": 0.2816, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2675023078918457, "rewards/margins": 0.13316968083381653, "rewards/rejected": -0.40067195892333984, "step": 340 }, { "epoch": 0.84, "learning_rate": 3.7445716067596506e-07, "logits/chosen": -0.07752005755901337, "logits/rejected": -0.0024734348990023136, "logps/chosen": -684.0347290039062, "logps/rejected": -747.9011840820312, "loss": 0.2932, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.27346071600914, "rewards/margins": 0.09785878658294678, "rewards/rejected": -0.3713195323944092, "step": 350 }, { "epoch": 0.86, "learning_rate": 2.7153109768518926e-07, "logits/chosen": -0.10462590306997299, "logits/rejected": -0.05941639468073845, "logps/chosen": -697.7039794921875, "logps/rejected": -795.1232299804688, "loss": 0.2713, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.2729036211967468, "rewards/margins": 0.14049656689167023, "rewards/rejected": -0.41340017318725586, "step": 360 }, { "epoch": 0.89, "learning_rate": 1.8431979423369607e-07, "logits/chosen": -0.0763811320066452, "logits/rejected": -0.008664283901453018, "logps/chosen": -696.4910888671875, "logps/rejected": -803.7508544921875, "loss": 0.2996, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.28233736753463745, "rewards/margins": 0.11500970274209976, "rewards/rejected": -0.397347092628479, "step": 370 }, { "epoch": 0.91, "learning_rate": 1.1343824865573422e-07, "logits/chosen": -0.04591577127575874, "logits/rejected": -0.00904687587171793, "logps/chosen": -658.9864501953125, "logps/rejected": -750.4421997070312, "loss": 0.276, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2625121474266052, "rewards/margins": 0.12891098856925964, "rewards/rejected": -0.39142316579818726, "step": 380 }, { "epoch": 0.94, "learning_rate": 5.9386304787299175e-08, "logits/chosen": -0.09347660839557648, "logits/rejected": -0.061296842992305756, "logps/chosen": -699.7623291015625, "logps/rejected": -830.4713134765625, "loss": 0.292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2921612858772278, "rewards/margins": 0.13317355513572693, "rewards/rejected": -0.4253348410129547, "step": 390 }, { "epoch": 0.96, "learning_rate": 2.2545127157831416e-08, "logits/chosen": -0.15901389718055725, "logits/rejected": -0.05499373748898506, "logps/chosen": -679.7059326171875, "logps/rejected": -841.7716064453125, "loss": 0.2704, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2723065912723541, "rewards/margins": 0.15460878610610962, "rewards/rejected": -0.42691534757614136, "step": 400 }, { "epoch": 0.98, "learning_rate": 3.1745130869123564e-09, "logits/chosen": -0.0728246420621872, "logits/rejected": -0.02904803678393364, "logps/chosen": -687.2608642578125, "logps/rejected": -801.3157958984375, "loss": 0.2735, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.26681455969810486, "rewards/margins": 0.1451863944530487, "rewards/rejected": -0.41200098395347595, "step": 410 }, { "epoch": 1.0, "step": 416, "total_flos": 0.0, "train_loss": 0.30559141518404853, "train_runtime": 5582.2148, "train_samples_per_second": 3.583, "train_steps_per_second": 0.075 } ], "logging_steps": 10, "max_steps": 416, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }