{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9473684210526314, "eval_steps": 10, "global_step": 21, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14035087719298245, "grad_norm": 36.83965274682509, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -0.6751565337181091, "logits/rejected": -0.680110514163971, "logps/chosen": -52.487876892089844, "logps/rejected": -58.423255920410156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.2807017543859649, "grad_norm": 36.26353717517849, "learning_rate": 6.666666666666667e-06, "logits/chosen": -0.7261925339698792, "logits/rejected": -0.7052676677703857, "logps/chosen": -51.76400375366211, "logps/rejected": -58.045860290527344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.42105263157894735, "grad_norm": 6.35145459559084, "learning_rate": 1e-05, "logits/chosen": -0.6268625259399414, "logits/rejected": -0.44894033670425415, "logps/chosen": -42.51791763305664, "logps/rejected": -77.13123321533203, "loss": 0.1248, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8370370864868164, "rewards/margins": 2.879814386367798, "rewards/rejected": -2.0427772998809814, "step": 3 }, { "epoch": 0.5614035087719298, "grad_norm": 6.045707313215622, "learning_rate": 9.924038765061042e-06, "logits/chosen": -0.49778643250465393, "logits/rejected": -0.3794001340866089, "logps/chosen": -40.093902587890625, "logps/rejected": -100.6180191040039, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 1.0513410568237305, "rewards/margins": 5.476510047912598, "rewards/rejected": -4.425168514251709, "step": 4 }, { "epoch": 0.7017543859649122, "grad_norm": 3.5732460989636903, "learning_rate": 9.698463103929542e-06, "logits/chosen": -0.1468430608510971, "logits/rejected": -0.014196997508406639, "logps/chosen": -47.64195251464844, "logps/rejected": -108.50719451904297, "loss": 0.035, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4784576892852783, "rewards/margins": 5.548664093017578, "rewards/rejected": -5.070206165313721, "step": 5 }, { "epoch": 0.8421052631578947, "grad_norm": 5.73507612352672, "learning_rate": 9.330127018922195e-06, "logits/chosen": -0.45869290828704834, "logits/rejected": -0.35349956154823303, "logps/chosen": -23.860214233398438, "logps/rejected": -72.7856216430664, "loss": 0.0381, "rewards/accuracies": 0.984375, "rewards/chosen": 2.7691407203674316, "rewards/margins": 4.281482696533203, "rewards/rejected": -1.512341856956482, "step": 6 }, { "epoch": 0.9824561403508771, "grad_norm": 0.20133266492379556, "learning_rate": 8.83022221559489e-06, "logits/chosen": 0.02529796212911606, "logits/rejected": -0.027504097670316696, "logps/chosen": -20.159576416015625, "logps/rejected": -136.86343383789062, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 3.0319323539733887, "rewards/margins": 10.98936653137207, "rewards/rejected": -7.957433700561523, "step": 7 }, { "epoch": 1.1228070175438596, "grad_norm": 1.538785447893909, "learning_rate": 8.213938048432697e-06, "logits/chosen": -0.09107446670532227, "logits/rejected": -0.030364712700247765, "logps/chosen": -31.587915420532227, "logps/rejected": -148.98675537109375, "loss": 0.0988, "rewards/accuracies": 0.984375, "rewards/chosen": 2.1359970569610596, "rewards/margins": 11.210424423217773, "rewards/rejected": -9.074427604675293, "step": 8 }, { "epoch": 1.263157894736842, "grad_norm": 2.484440900292177, "learning_rate": 7.500000000000001e-06, "logits/chosen": -0.021966181695461273, "logits/rejected": -0.035571545362472534, "logps/chosen": -22.42191505432129, "logps/rejected": -139.77471923828125, "loss": 0.0482, "rewards/accuracies": 1.0, "rewards/chosen": 2.9303808212280273, "rewards/margins": 11.172625541687012, "rewards/rejected": -8.242246627807617, "step": 9 }, { "epoch": 1.4035087719298245, "grad_norm": 2.063248053979488, "learning_rate": 6.710100716628345e-06, "logits/chosen": -0.1402013897895813, "logits/rejected": -0.09476425498723984, "logps/chosen": -22.920854568481445, "logps/rejected": -139.89112854003906, "loss": 0.1011, "rewards/accuracies": 0.984375, "rewards/chosen": 2.8295440673828125, "rewards/margins": 11.169378280639648, "rewards/rejected": -8.339835166931152, "step": 10 }, { "epoch": 1.4035087719298245, "eval_logits/chosen": -0.5006358027458191, "eval_logits/rejected": -0.5087898373603821, "eval_logps/chosen": -21.234792709350586, "eval_logps/rejected": -123.04338073730469, "eval_loss": 0.07855287194252014, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 3.130305767059326, "eval_rewards/margins": 9.576081275939941, "eval_rewards/rejected": -6.445775032043457, "eval_runtime": 16.4971, "eval_samples_per_second": 6.062, "eval_steps_per_second": 3.031, "step": 10 }, { "epoch": 1.543859649122807, "grad_norm": 2.01867368401983, "learning_rate": 5.8682408883346535e-06, "logits/chosen": -0.3766348659992218, "logits/rejected": -0.22392578423023224, "logps/chosen": -23.370777130126953, "logps/rejected": -121.3629150390625, "loss": 0.0607, "rewards/accuracies": 0.984375, "rewards/chosen": 2.7753195762634277, "rewards/margins": 9.217280387878418, "rewards/rejected": -6.441961765289307, "step": 11 }, { "epoch": 1.6842105263157894, "grad_norm": 4.079342276288671, "learning_rate": 5e-06, "logits/chosen": -0.5493069887161255, "logits/rejected": -0.3390986919403076, "logps/chosen": -26.047971725463867, "logps/rejected": -90.44429016113281, "loss": 0.0339, "rewards/accuracies": 0.984375, "rewards/chosen": 2.553872585296631, "rewards/margins": 5.881150722503662, "rewards/rejected": -3.3272786140441895, "step": 12 }, { "epoch": 1.8245614035087718, "grad_norm": 1.7556159148296524, "learning_rate": 4.131759111665349e-06, "logits/chosen": -0.4146791398525238, "logits/rejected": -0.286813884973526, "logps/chosen": -24.490224838256836, "logps/rejected": -102.24156188964844, "loss": 0.0562, "rewards/accuracies": 0.984375, "rewards/chosen": 2.6847214698791504, "rewards/margins": 7.214890003204346, "rewards/rejected": -4.5301690101623535, "step": 13 }, { "epoch": 1.9649122807017543, "grad_norm": 1.5851709952279243, "learning_rate": 3.289899283371657e-06, "logits/chosen": -0.44919806718826294, "logits/rejected": -0.38574808835983276, "logps/chosen": -21.76868438720703, "logps/rejected": -93.4135513305664, "loss": 0.0404, "rewards/accuracies": 0.984375, "rewards/chosen": 2.8682363033294678, "rewards/margins": 6.415185928344727, "rewards/rejected": -3.5469493865966797, "step": 14 }, { "epoch": 2.1052631578947367, "grad_norm": 0.8663363353210317, "learning_rate": 2.5000000000000015e-06, "logits/chosen": -0.3753425180912018, "logits/rejected": -0.3148278295993805, "logps/chosen": -27.85059928894043, "logps/rejected": -96.494140625, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 2.4423599243164062, "rewards/margins": 6.41550874710083, "rewards/rejected": -3.9731483459472656, "step": 15 }, { "epoch": 2.245614035087719, "grad_norm": 0.46918563058013124, "learning_rate": 1.7860619515673034e-06, "logits/chosen": -0.37954726815223694, "logits/rejected": -0.3316181004047394, "logps/chosen": -25.504596710205078, "logps/rejected": -102.33647155761719, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 2.5688281059265137, "rewards/margins": 7.0828657150268555, "rewards/rejected": -4.514037132263184, "step": 16 }, { "epoch": 2.3859649122807016, "grad_norm": 1.3915012816414571, "learning_rate": 1.1697777844051105e-06, "logits/chosen": -0.32976603507995605, "logits/rejected": -0.24117198586463928, "logps/chosen": -22.648927688598633, "logps/rejected": -111.1461181640625, "loss": 0.0722, "rewards/accuracies": 0.984375, "rewards/chosen": 2.808568239212036, "rewards/margins": 8.266373634338379, "rewards/rejected": -5.457805633544922, "step": 17 }, { "epoch": 2.526315789473684, "grad_norm": 0.7864294623647854, "learning_rate": 6.698729810778065e-07, "logits/chosen": -0.31580400466918945, "logits/rejected": -0.23523080348968506, "logps/chosen": -21.38547706604004, "logps/rejected": -112.5594482421875, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 2.9571423530578613, "rewards/margins": 8.440750122070312, "rewards/rejected": -5.483607769012451, "step": 18 }, { "epoch": 2.6666666666666665, "grad_norm": 1.6842586899427825, "learning_rate": 3.015368960704584e-07, "logits/chosen": -0.227472722530365, "logits/rejected": -0.19328871369361877, "logps/chosen": -18.14565086364746, "logps/rejected": -115.78607177734375, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 3.302818536758423, "rewards/margins": 9.093223571777344, "rewards/rejected": -5.790404796600342, "step": 19 }, { "epoch": 2.807017543859649, "grad_norm": 0.37240158397575107, "learning_rate": 7.59612349389599e-08, "logits/chosen": -0.28918278217315674, "logits/rejected": -0.20876720547676086, "logps/chosen": -20.228591918945312, "logps/rejected": -110.49909210205078, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 3.1952502727508545, "rewards/margins": 8.55958366394043, "rewards/rejected": -5.36433219909668, "step": 20 }, { "epoch": 2.807017543859649, "eval_logits/chosen": -0.5716415643692017, "eval_logits/rejected": -0.5896289944648743, "eval_logps/chosen": -19.139745712280273, "eval_logps/rejected": -113.21296691894531, "eval_loss": 0.07458853721618652, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 3.339810371398926, "eval_rewards/margins": 8.802544593811035, "eval_rewards/rejected": -5.462734222412109, "eval_runtime": 18.3879, "eval_samples_per_second": 5.438, "eval_steps_per_second": 2.719, "step": 20 }, { "epoch": 2.9473684210526314, "grad_norm": 2.3675721766606554, "learning_rate": 0.0, "logits/chosen": -0.3103935122489929, "logits/rejected": -0.23779892921447754, "logps/chosen": -24.52212905883789, "logps/rejected": -107.23027038574219, "loss": 0.0789, "rewards/accuracies": 0.96875, "rewards/chosen": 2.7476420402526855, "rewards/margins": 7.737358093261719, "rewards/rejected": -4.989716529846191, "step": 21 } ], "logging_steps": 1, "max_steps": 21, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1322411360256.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }