{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03685447090800203, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 28.375805268212044, "learning_rate": 2.439024390243903e-07, "loss": 1.9916, "step": 1 }, { "epoch": 0.0, "grad_norm": 35.06631979328207, "learning_rate": 4.878048780487805e-07, "loss": 2.0906, "step": 2 }, { "epoch": 0.0, "grad_norm": 38.398944345049074, "learning_rate": 7.317073170731707e-07, "loss": 2.1697, "step": 3 }, { "epoch": 0.0, "grad_norm": 31.786491229547707, "learning_rate": 9.75609756097561e-07, "loss": 1.885, "step": 4 }, { "epoch": 0.0, "grad_norm": 36.04697641354884, "learning_rate": 1.2195121951219514e-06, "loss": 2.055, "step": 5 }, { "epoch": 0.0, "grad_norm": 34.00945696139665, "learning_rate": 1.4634146341463414e-06, "loss": 1.9784, "step": 6 }, { "epoch": 0.0, "grad_norm": 26.542180094892565, "learning_rate": 1.707317073170732e-06, "loss": 1.9204, "step": 7 }, { "epoch": 0.0, "grad_norm": 31.922434847983876, "learning_rate": 1.951219512195122e-06, "loss": 1.9556, "step": 8 }, { "epoch": 0.0, "grad_norm": 29.799359182326604, "learning_rate": 2.1951219512195125e-06, "loss": 2.0238, "step": 9 }, { "epoch": 0.0, "grad_norm": 24.788943060794754, "learning_rate": 2.4390243902439027e-06, "loss": 2.0543, "step": 10 }, { "epoch": 0.0, "grad_norm": 23.62857327313063, "learning_rate": 2.682926829268293e-06, "loss": 1.974, "step": 11 }, { "epoch": 0.0, "grad_norm": 25.440796117241586, "learning_rate": 2.926829268292683e-06, "loss": 1.9558, "step": 12 }, { "epoch": 0.0, "grad_norm": 11.507873994835268, "learning_rate": 3.1707317073170736e-06, "loss": 1.554, "step": 13 }, { "epoch": 0.01, "grad_norm": 10.11918265638141, "learning_rate": 3.414634146341464e-06, "loss": 1.5119, "step": 14 }, { "epoch": 0.01, "grad_norm": 12.356401968625594, "learning_rate": 3.6585365853658537e-06, "loss": 1.6321, "step": 15 }, { "epoch": 0.01, "grad_norm": 11.132578216935597, "learning_rate": 3.902439024390244e-06, "loss": 1.5888, "step": 16 }, { "epoch": 0.01, "grad_norm": 12.240345185091753, "learning_rate": 4.146341463414634e-06, "loss": 1.5076, "step": 17 }, { "epoch": 0.01, "grad_norm": 11.968138590460152, "learning_rate": 4.390243902439025e-06, "loss": 1.3815, "step": 18 }, { "epoch": 0.01, "grad_norm": 8.169350991545556, "learning_rate": 4.634146341463416e-06, "loss": 1.3101, "step": 19 }, { "epoch": 0.01, "grad_norm": 9.505811804013986, "learning_rate": 4.8780487804878055e-06, "loss": 1.2818, "step": 20 }, { "epoch": 0.01, "grad_norm": 5.796640993993368, "learning_rate": 5.121951219512195e-06, "loss": 1.2677, "step": 21 }, { "epoch": 0.01, "grad_norm": 4.764801673099041, "learning_rate": 5.365853658536586e-06, "loss": 1.2417, "step": 22 }, { "epoch": 0.01, "grad_norm": 3.8410501167446562, "learning_rate": 5.609756097560977e-06, "loss": 1.2282, "step": 23 }, { "epoch": 0.01, "grad_norm": 3.2269719491020132, "learning_rate": 5.853658536585366e-06, "loss": 1.1735, "step": 24 }, { "epoch": 0.01, "grad_norm": 2.7362981827784276, "learning_rate": 6.0975609756097564e-06, "loss": 1.1771, "step": 25 }, { "epoch": 0.01, "grad_norm": 2.3380480940076693, "learning_rate": 6.341463414634147e-06, "loss": 1.1862, "step": 26 }, { "epoch": 0.01, "grad_norm": 2.3527882605489214, "learning_rate": 6.585365853658538e-06, "loss": 1.0824, "step": 27 }, { "epoch": 0.01, "grad_norm": 2.120763742750798, "learning_rate": 6.829268292682928e-06, "loss": 1.0907, "step": 28 }, { "epoch": 0.01, "grad_norm": 1.7082373070011942, "learning_rate": 7.0731707317073175e-06, "loss": 1.0936, "step": 29 }, { "epoch": 0.01, "grad_norm": 1.771684587866832, "learning_rate": 7.317073170731707e-06, "loss": 1.232, "step": 30 }, { "epoch": 0.01, "grad_norm": 1.6792419401200112, "learning_rate": 7.560975609756098e-06, "loss": 1.0939, "step": 31 }, { "epoch": 0.01, "grad_norm": 1.5870734246620417, "learning_rate": 7.804878048780489e-06, "loss": 1.0344, "step": 32 }, { "epoch": 0.01, "grad_norm": 1.5321693507507685, "learning_rate": 8.048780487804879e-06, "loss": 1.1247, "step": 33 }, { "epoch": 0.01, "grad_norm": 1.5384110247679068, "learning_rate": 8.292682926829268e-06, "loss": 1.0561, "step": 34 }, { "epoch": 0.01, "grad_norm": 1.6803740230388178, "learning_rate": 8.536585365853658e-06, "loss": 1.0912, "step": 35 }, { "epoch": 0.01, "grad_norm": 1.7687493200257336, "learning_rate": 8.78048780487805e-06, "loss": 1.0656, "step": 36 }, { "epoch": 0.01, "grad_norm": 1.696255950031143, "learning_rate": 9.02439024390244e-06, "loss": 1.0381, "step": 37 }, { "epoch": 0.01, "grad_norm": 1.5093882845959572, "learning_rate": 9.268292682926831e-06, "loss": 1.1241, "step": 38 }, { "epoch": 0.01, "grad_norm": 1.6975518192922392, "learning_rate": 9.51219512195122e-06, "loss": 0.9997, "step": 39 }, { "epoch": 0.01, "grad_norm": 1.5343961293035455, "learning_rate": 9.756097560975611e-06, "loss": 1.0369, "step": 40 }, { "epoch": 0.02, "grad_norm": 1.4138966954537764, "learning_rate": 1e-05, "loss": 1.014, "step": 41 }, { "epoch": 0.02, "grad_norm": 1.176007866432356, "learning_rate": 1.024390243902439e-05, "loss": 1.0021, "step": 42 }, { "epoch": 0.02, "grad_norm": 1.2381411186803413, "learning_rate": 1.0487804878048782e-05, "loss": 1.1223, "step": 43 }, { "epoch": 0.02, "grad_norm": 1.320371206343042, "learning_rate": 1.0731707317073172e-05, "loss": 1.1278, "step": 44 }, { "epoch": 0.02, "grad_norm": 1.169780930584551, "learning_rate": 1.0975609756097562e-05, "loss": 0.9749, "step": 45 }, { "epoch": 0.02, "grad_norm": 1.2215812178026708, "learning_rate": 1.1219512195121953e-05, "loss": 1.1331, "step": 46 }, { "epoch": 0.02, "grad_norm": 1.2706297862727212, "learning_rate": 1.1463414634146342e-05, "loss": 1.0683, "step": 47 }, { "epoch": 0.02, "grad_norm": 1.1432329777320256, "learning_rate": 1.1707317073170731e-05, "loss": 1.1807, "step": 48 }, { "epoch": 0.02, "grad_norm": 1.2331341664649251, "learning_rate": 1.1951219512195123e-05, "loss": 1.1285, "step": 49 }, { "epoch": 0.02, "grad_norm": 1.1741530257255526, "learning_rate": 1.2195121951219513e-05, "loss": 1.0448, "step": 50 }, { "epoch": 0.02, "grad_norm": 1.1161714391272533, "learning_rate": 1.2439024390243903e-05, "loss": 1.0628, "step": 51 }, { "epoch": 0.02, "grad_norm": 1.1971235536055034, "learning_rate": 1.2682926829268294e-05, "loss": 1.0049, "step": 52 }, { "epoch": 0.02, "grad_norm": 1.045851128188208, "learning_rate": 1.2926829268292684e-05, "loss": 1.1882, "step": 53 }, { "epoch": 0.02, "grad_norm": 1.0698067786575107, "learning_rate": 1.3170731707317076e-05, "loss": 0.9463, "step": 54 }, { "epoch": 0.02, "grad_norm": 1.172591425775864, "learning_rate": 1.3414634146341466e-05, "loss": 1.0636, "step": 55 }, { "epoch": 0.02, "grad_norm": 1.1106016799757537, "learning_rate": 1.3658536585365855e-05, "loss": 0.9946, "step": 56 }, { "epoch": 0.02, "grad_norm": 1.086675627063963, "learning_rate": 1.3902439024390244e-05, "loss": 1.1333, "step": 57 }, { "epoch": 0.02, "grad_norm": 1.1240451271188707, "learning_rate": 1.4146341463414635e-05, "loss": 1.0376, "step": 58 }, { "epoch": 0.02, "grad_norm": 1.0978604882977046, "learning_rate": 1.4390243902439025e-05, "loss": 1.0286, "step": 59 }, { "epoch": 0.02, "grad_norm": 1.1371703935435598, "learning_rate": 1.4634146341463415e-05, "loss": 1.0489, "step": 60 }, { "epoch": 0.02, "grad_norm": 1.081855866398565, "learning_rate": 1.4878048780487806e-05, "loss": 1.0347, "step": 61 }, { "epoch": 0.02, "grad_norm": 1.1018052800440352, "learning_rate": 1.5121951219512196e-05, "loss": 1.0503, "step": 62 }, { "epoch": 0.02, "grad_norm": 1.2132015921868264, "learning_rate": 1.5365853658536586e-05, "loss": 0.962, "step": 63 }, { "epoch": 0.02, "grad_norm": 1.1035822755607994, "learning_rate": 1.5609756097560978e-05, "loss": 1.0749, "step": 64 }, { "epoch": 0.02, "grad_norm": 1.0944024225848128, "learning_rate": 1.585365853658537e-05, "loss": 1.0126, "step": 65 }, { "epoch": 0.02, "grad_norm": 1.1408025250647078, "learning_rate": 1.6097560975609757e-05, "loss": 0.9841, "step": 66 }, { "epoch": 0.02, "grad_norm": 1.0549980708871876, "learning_rate": 1.6341463414634145e-05, "loss": 1.0857, "step": 67 }, { "epoch": 0.03, "grad_norm": 1.1088465780550794, "learning_rate": 1.6585365853658537e-05, "loss": 1.0189, "step": 68 }, { "epoch": 0.03, "grad_norm": 1.1030941169394408, "learning_rate": 1.682926829268293e-05, "loss": 0.9477, "step": 69 }, { "epoch": 0.03, "grad_norm": 1.0906659313859568, "learning_rate": 1.7073170731707317e-05, "loss": 1.1326, "step": 70 }, { "epoch": 0.03, "grad_norm": 1.095023300471396, "learning_rate": 1.7317073170731708e-05, "loss": 1.0073, "step": 71 }, { "epoch": 0.03, "grad_norm": 1.0603304396820843, "learning_rate": 1.75609756097561e-05, "loss": 1.0442, "step": 72 }, { "epoch": 0.03, "grad_norm": 1.0658804001104982, "learning_rate": 1.7804878048780488e-05, "loss": 1.0266, "step": 73 }, { "epoch": 0.03, "grad_norm": 1.0581434225632516, "learning_rate": 1.804878048780488e-05, "loss": 0.9944, "step": 74 }, { "epoch": 0.03, "grad_norm": 1.0325391690636108, "learning_rate": 1.829268292682927e-05, "loss": 0.9795, "step": 75 }, { "epoch": 0.03, "grad_norm": 1.0137734048869766, "learning_rate": 1.8536585365853663e-05, "loss": 0.9792, "step": 76 }, { "epoch": 0.03, "grad_norm": 1.0835263870224308, "learning_rate": 1.878048780487805e-05, "loss": 1.0396, "step": 77 }, { "epoch": 0.03, "grad_norm": 0.9724143527075428, "learning_rate": 1.902439024390244e-05, "loss": 0.9221, "step": 78 }, { "epoch": 0.03, "grad_norm": 0.9990983369485978, "learning_rate": 1.926829268292683e-05, "loss": 0.9228, "step": 79 }, { "epoch": 0.03, "grad_norm": 1.1063583109588462, "learning_rate": 1.9512195121951222e-05, "loss": 1.0854, "step": 80 }, { "epoch": 0.03, "grad_norm": 1.0512858412634352, "learning_rate": 1.975609756097561e-05, "loss": 1.0148, "step": 81 }, { "epoch": 0.03, "grad_norm": 1.0536440636090694, "learning_rate": 2e-05, "loss": 1.0287, "step": 82 }, { "epoch": 0.03, "grad_norm": 1.0803096330003004, "learning_rate": 1.999999287101006e-05, "loss": 1.1047, "step": 83 }, { "epoch": 0.03, "grad_norm": 1.0547983394060736, "learning_rate": 1.99999714840504e-05, "loss": 0.9816, "step": 84 }, { "epoch": 0.03, "grad_norm": 0.9807015967414549, "learning_rate": 1.9999935839151513e-05, "loss": 0.9548, "step": 85 }, { "epoch": 0.03, "grad_norm": 1.0711912458397403, "learning_rate": 1.999988593636423e-05, "loss": 0.9682, "step": 86 }, { "epoch": 0.03, "grad_norm": 0.9722241890066263, "learning_rate": 1.999982177575969e-05, "loss": 1.0405, "step": 87 }, { "epoch": 0.03, "grad_norm": 1.028798727103559, "learning_rate": 1.999974335742938e-05, "loss": 1.1811, "step": 88 }, { "epoch": 0.03, "grad_norm": 1.1021829611351164, "learning_rate": 1.999965068148511e-05, "loss": 0.9051, "step": 89 }, { "epoch": 0.03, "grad_norm": 1.0334726008966082, "learning_rate": 1.9999543748059012e-05, "loss": 1.0484, "step": 90 }, { "epoch": 0.03, "grad_norm": 1.066221360284238, "learning_rate": 1.9999422557303553e-05, "loss": 1.0307, "step": 91 }, { "epoch": 0.03, "grad_norm": 1.054488043267414, "learning_rate": 1.999928710939153e-05, "loss": 1.1487, "step": 92 }, { "epoch": 0.03, "grad_norm": 1.1079480240688568, "learning_rate": 1.9999137404516062e-05, "loss": 0.9761, "step": 93 }, { "epoch": 0.03, "grad_norm": 1.070030561141793, "learning_rate": 1.99989734428906e-05, "loss": 1.0988, "step": 94 }, { "epoch": 0.04, "grad_norm": 1.0564078940337887, "learning_rate": 1.9998795224748916e-05, "loss": 1.0195, "step": 95 }, { "epoch": 0.04, "grad_norm": 1.1531641916007334, "learning_rate": 1.9998602750345113e-05, "loss": 1.0807, "step": 96 }, { "epoch": 0.04, "grad_norm": 1.116837722786228, "learning_rate": 1.9998396019953627e-05, "loss": 0.9621, "step": 97 }, { "epoch": 0.04, "grad_norm": 1.0677298051233879, "learning_rate": 1.9998175033869205e-05, "loss": 0.9151, "step": 98 }, { "epoch": 0.04, "grad_norm": 1.1242469864164173, "learning_rate": 1.9997939792406936e-05, "loss": 0.9819, "step": 99 }, { "epoch": 0.04, "grad_norm": 1.16872009332317, "learning_rate": 1.9997690295902225e-05, "loss": 1.1617, "step": 100 } ], "logging_steps": 1.0, "max_steps": 2713, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 134491228471296.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }