{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9398359161349132, "eval_steps": 1.0, "global_step": 129, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 9.694531367351816, "learning_rate": 0.0, "loss": 0.9609, "step": 1 }, { "epoch": 0.05, "grad_norm": 6.0698578450559015, "learning_rate": 5e-06, "loss": 0.8693, "step": 2 }, { "epoch": 0.07, "grad_norm": 6.5557897846625535, "learning_rate": 7.924812503605782e-06, "loss": 0.8571, "step": 3 }, { "epoch": 0.09, "grad_norm": 5.8249565635426235, "learning_rate": 1e-05, "loss": 0.8098, "step": 4 }, { "epoch": 0.11, "grad_norm": 7.157097194857861, "learning_rate": 1e-05, "loss": 0.7421, "step": 5 }, { "epoch": 0.14, "grad_norm": 6.7490063693246185, "learning_rate": 1e-05, "loss": 0.7184, "step": 6 }, { "epoch": 0.16, "grad_norm": 6.989582963023535, "learning_rate": 1e-05, "loss": 0.7101, "step": 7 }, { "epoch": 0.18, "grad_norm": 4.856325406983797, "learning_rate": 1e-05, "loss": 0.6815, "step": 8 }, { "epoch": 0.21, "grad_norm": 4.893753189326447, "learning_rate": 1e-05, "loss": 0.6975, "step": 9 }, { "epoch": 0.23, "grad_norm": 4.8564394287744745, "learning_rate": 1e-05, "loss": 0.65, "step": 10 }, { "epoch": 0.25, "grad_norm": 4.638777805285931, "learning_rate": 1e-05, "loss": 0.5817, "step": 11 }, { "epoch": 0.27, "grad_norm": 3.701804268792858, "learning_rate": 1e-05, "loss": 0.6005, "step": 12 }, { "epoch": 0.3, "grad_norm": 3.08244950250501, "learning_rate": 1e-05, "loss": 0.5529, "step": 13 }, { "epoch": 0.32, "grad_norm": 3.210541622720776, "learning_rate": 1e-05, "loss": 0.5654, "step": 14 }, { "epoch": 0.34, "grad_norm": 3.0384959005478867, "learning_rate": 1e-05, "loss": 0.5184, "step": 15 }, { "epoch": 0.36, "grad_norm": 2.6850393439015092, "learning_rate": 1e-05, "loss": 0.5234, "step": 16 }, { "epoch": 0.39, "grad_norm": 2.8596154120661748, "learning_rate": 1e-05, "loss": 0.5005, "step": 17 }, { "epoch": 0.41, "grad_norm": 2.726391191331952, "learning_rate": 1e-05, "loss": 0.4691, "step": 18 }, { "epoch": 0.43, "grad_norm": 2.934188314387199, "learning_rate": 1e-05, "loss": 0.5265, "step": 19 }, { "epoch": 0.46, "grad_norm": 2.496046815968877, "learning_rate": 1e-05, "loss": 0.46, "step": 20 }, { "epoch": 0.48, "grad_norm": 2.6527493698931703, "learning_rate": 1e-05, "loss": 0.4633, "step": 21 }, { "epoch": 0.5, "grad_norm": 2.3371216207823364, "learning_rate": 1e-05, "loss": 0.4512, "step": 22 }, { "epoch": 0.52, "grad_norm": 2.5029789662415576, "learning_rate": 1e-05, "loss": 0.4538, "step": 23 }, { "epoch": 0.55, "grad_norm": 2.3654775885807435, "learning_rate": 1e-05, "loss": 0.4503, "step": 24 }, { "epoch": 0.57, "grad_norm": 2.520897440850751, "learning_rate": 1e-05, "loss": 0.4393, "step": 25 }, { "epoch": 0.59, "grad_norm": 2.2973148314047998, "learning_rate": 1e-05, "loss": 0.4246, "step": 26 }, { "epoch": 0.62, "grad_norm": 2.064310434148654, "learning_rate": 1e-05, "loss": 0.4466, "step": 27 }, { "epoch": 0.64, "grad_norm": 2.1518579274793614, "learning_rate": 1e-05, "loss": 0.4157, "step": 28 }, { "epoch": 0.66, "grad_norm": 2.101663348817292, "learning_rate": 1e-05, "loss": 0.4142, "step": 29 }, { "epoch": 0.68, "grad_norm": 2.0792549392682624, "learning_rate": 1e-05, "loss": 0.3814, "step": 30 }, { "epoch": 0.71, "grad_norm": 2.4317303715104868, "learning_rate": 1e-05, "loss": 0.4042, "step": 31 }, { "epoch": 0.73, "grad_norm": 2.6323607639867443, "learning_rate": 1e-05, "loss": 0.3745, "step": 32 }, { "epoch": 0.75, "grad_norm": 2.2663973989673987, "learning_rate": 1e-05, "loss": 0.3781, "step": 33 }, { "epoch": 0.77, "grad_norm": 2.2288967526596193, "learning_rate": 1e-05, "loss": 0.3894, "step": 34 }, { "epoch": 0.8, "grad_norm": 2.2598649631650223, "learning_rate": 1e-05, "loss": 0.3734, "step": 35 }, { "epoch": 0.82, "grad_norm": 2.046710491098762, "learning_rate": 1e-05, "loss": 0.3436, "step": 36 }, { "epoch": 0.84, "grad_norm": 2.104660502205773, "learning_rate": 1e-05, "loss": 0.3636, "step": 37 }, { "epoch": 0.87, "grad_norm": 2.2194274529750335, "learning_rate": 1e-05, "loss": 0.3687, "step": 38 }, { "epoch": 0.89, "grad_norm": 2.380578087514372, "learning_rate": 1e-05, "loss": 0.3386, "step": 39 }, { "epoch": 0.91, "grad_norm": 2.1539059012744675, "learning_rate": 1e-05, "loss": 0.3663, "step": 40 }, { "epoch": 0.93, "grad_norm": 2.2975008924550884, "learning_rate": 1e-05, "loss": 0.3529, "step": 41 }, { "epoch": 0.96, "grad_norm": 2.0215009917813864, "learning_rate": 1e-05, "loss": 0.3255, "step": 42 }, { "epoch": 0.98, "grad_norm": 2.260279893370542, "learning_rate": 1e-05, "loss": 0.3496, "step": 43 }, { "epoch": 1.0, "grad_norm": 2.1998031778618596, "learning_rate": 1e-05, "loss": 0.3295, "step": 44 }, { "epoch": 1.03, "grad_norm": 1.9950078809811038, "learning_rate": 1e-05, "loss": 0.2919, "step": 45 }, { "epoch": 1.05, "grad_norm": 1.8802987071836594, "learning_rate": 1e-05, "loss": 0.2774, "step": 46 }, { "epoch": 1.07, "grad_norm": 2.330358996793001, "learning_rate": 1e-05, "loss": 0.2953, "step": 47 }, { "epoch": 1.09, "grad_norm": 2.187255642034782, "learning_rate": 1e-05, "loss": 0.2637, "step": 48 }, { "epoch": 1.12, "grad_norm": 2.246935471483204, "learning_rate": 1e-05, "loss": 0.2806, "step": 49 }, { "epoch": 1.14, "grad_norm": 2.129600005729623, "learning_rate": 1e-05, "loss": 0.2566, "step": 50 }, { "epoch": 1.16, "grad_norm": 2.1613544928929347, "learning_rate": 1e-05, "loss": 0.2703, "step": 51 }, { "epoch": 1.19, "grad_norm": 2.097651271845284, "learning_rate": 1e-05, "loss": 0.2578, "step": 52 }, { "epoch": 1.21, "grad_norm": 2.1698187501885404, "learning_rate": 1e-05, "loss": 0.266, "step": 53 }, { "epoch": 1.23, "grad_norm": 2.0534602028652733, "learning_rate": 1e-05, "loss": 0.2617, "step": 54 }, { "epoch": 1.25, "grad_norm": 2.280247094380378, "learning_rate": 1e-05, "loss": 0.2725, "step": 55 }, { "epoch": 1.28, "grad_norm": 2.282981855579156, "learning_rate": 1e-05, "loss": 0.2696, "step": 56 }, { "epoch": 1.3, "grad_norm": 2.1723199295603246, "learning_rate": 1e-05, "loss": 0.2801, "step": 57 }, { "epoch": 1.32, "grad_norm": 2.1013790638047833, "learning_rate": 1e-05, "loss": 0.2462, "step": 58 }, { "epoch": 1.34, "grad_norm": 2.1486192844939187, "learning_rate": 1e-05, "loss": 0.2634, "step": 59 }, { "epoch": 1.37, "grad_norm": 2.2011921615871874, "learning_rate": 1e-05, "loss": 0.2644, "step": 60 }, { "epoch": 1.39, "grad_norm": 2.0994036223733907, "learning_rate": 1e-05, "loss": 0.2366, "step": 61 }, { "epoch": 1.41, "grad_norm": 1.8682173502881247, "learning_rate": 1e-05, "loss": 0.2369, "step": 62 }, { "epoch": 1.44, "grad_norm": 1.9469094762833548, "learning_rate": 1e-05, "loss": 0.243, "step": 63 }, { "epoch": 1.46, "grad_norm": 2.198810958517713, "learning_rate": 1e-05, "loss": 0.2538, "step": 64 }, { "epoch": 1.48, "grad_norm": 1.9614171222986219, "learning_rate": 1e-05, "loss": 0.2209, "step": 65 }, { "epoch": 1.5, "grad_norm": 2.1909792689278924, "learning_rate": 1e-05, "loss": 0.2289, "step": 66 }, { "epoch": 1.53, "grad_norm": 2.219018585854138, "learning_rate": 1e-05, "loss": 0.2438, "step": 67 }, { "epoch": 1.55, "grad_norm": 2.095328922178155, "learning_rate": 1e-05, "loss": 0.2384, "step": 68 }, { "epoch": 1.57, "grad_norm": 2.1047747885459596, "learning_rate": 1e-05, "loss": 0.2511, "step": 69 }, { "epoch": 1.6, "grad_norm": 2.217218508465867, "learning_rate": 1e-05, "loss": 0.2326, "step": 70 }, { "epoch": 1.62, "grad_norm": 2.2940614424982364, "learning_rate": 1e-05, "loss": 0.2095, "step": 71 }, { "epoch": 1.64, "grad_norm": 2.0023231563553012, "learning_rate": 1e-05, "loss": 0.2178, "step": 72 }, { "epoch": 1.66, "grad_norm": 1.8803020902826912, "learning_rate": 1e-05, "loss": 0.2299, "step": 73 }, { "epoch": 1.69, "grad_norm": 1.9874036980658476, "learning_rate": 1e-05, "loss": 0.2085, "step": 74 }, { "epoch": 1.71, "grad_norm": 1.9993267804137187, "learning_rate": 1e-05, "loss": 0.2189, "step": 75 }, { "epoch": 1.73, "grad_norm": 2.075032021080106, "learning_rate": 1e-05, "loss": 0.2122, "step": 76 }, { "epoch": 1.75, "grad_norm": 1.9908316147368204, "learning_rate": 1e-05, "loss": 0.2169, "step": 77 }, { "epoch": 1.78, "grad_norm": 1.9737029328293805, "learning_rate": 1e-05, "loss": 0.1976, "step": 78 }, { "epoch": 1.8, "grad_norm": 2.069169880586868, "learning_rate": 1e-05, "loss": 0.2193, "step": 79 }, { "epoch": 1.82, "grad_norm": 1.9730042900890021, "learning_rate": 1e-05, "loss": 0.1959, "step": 80 }, { "epoch": 1.85, "grad_norm": 1.9487990391437768, "learning_rate": 1e-05, "loss": 0.2099, "step": 81 }, { "epoch": 1.87, "grad_norm": 2.2136709023857923, "learning_rate": 1e-05, "loss": 0.2064, "step": 82 }, { "epoch": 1.89, "grad_norm": 1.867453149915136, "learning_rate": 1e-05, "loss": 0.2148, "step": 83 }, { "epoch": 1.91, "grad_norm": 2.2341488312462374, "learning_rate": 1e-05, "loss": 0.2152, "step": 84 }, { "epoch": 1.94, "grad_norm": 1.917448307480571, "learning_rate": 1e-05, "loss": 0.2087, "step": 85 }, { "epoch": 1.96, "grad_norm": 2.1010824249510938, "learning_rate": 1e-05, "loss": 0.1989, "step": 86 }, { "epoch": 1.98, "grad_norm": 1.9948658919746771, "learning_rate": 1e-05, "loss": 0.184, "step": 87 }, { "epoch": 2.01, "grad_norm": 1.7700018111142861, "learning_rate": 1e-05, "loss": 0.1755, "step": 88 }, { "epoch": 2.03, "grad_norm": 1.7706928580089987, "learning_rate": 1e-05, "loss": 0.1524, "step": 89 }, { "epoch": 2.05, "grad_norm": 1.924950952533756, "learning_rate": 1e-05, "loss": 0.1639, "step": 90 }, { "epoch": 2.07, "grad_norm": 1.8520437499999318, "learning_rate": 1e-05, "loss": 0.1628, "step": 91 }, { "epoch": 2.1, "grad_norm": 1.9282660088754877, "learning_rate": 1e-05, "loss": 0.1738, "step": 92 }, { "epoch": 2.12, "grad_norm": 2.152060500211321, "learning_rate": 1e-05, "loss": 0.1533, "step": 93 }, { "epoch": 2.14, "grad_norm": 2.1072066102911204, "learning_rate": 1e-05, "loss": 0.1699, "step": 94 }, { "epoch": 2.16, "grad_norm": 1.9056526630981285, "learning_rate": 1e-05, "loss": 0.1542, "step": 95 }, { "epoch": 2.19, "grad_norm": 2.0541561878206815, "learning_rate": 1e-05, "loss": 0.1654, "step": 96 }, { "epoch": 2.21, "grad_norm": 1.9010874038699952, "learning_rate": 1e-05, "loss": 0.1284, "step": 97 }, { "epoch": 2.23, "grad_norm": 1.8180452474072457, "learning_rate": 1e-05, "loss": 0.1294, "step": 98 }, { "epoch": 2.26, "grad_norm": 1.9510507921351066, "learning_rate": 1e-05, "loss": 0.1555, "step": 99 }, { "epoch": 2.28, "grad_norm": 2.16385305667214, "learning_rate": 1e-05, "loss": 0.1568, "step": 100 }, { "epoch": 2.3, "grad_norm": 1.778578736374391, "learning_rate": 1e-05, "loss": 0.1493, "step": 101 }, { "epoch": 2.32, "grad_norm": 1.7020886640236346, "learning_rate": 1e-05, "loss": 0.1365, "step": 102 }, { "epoch": 2.35, "grad_norm": 1.9239197373726553, "learning_rate": 1e-05, "loss": 0.1529, "step": 103 }, { "epoch": 2.37, "grad_norm": 1.8300125885434804, "learning_rate": 1e-05, "loss": 0.1507, "step": 104 }, { "epoch": 2.39, "grad_norm": 1.7740533757030397, "learning_rate": 1e-05, "loss": 0.1404, "step": 105 }, { "epoch": 2.42, "grad_norm": 1.921607088125746, "learning_rate": 1e-05, "loss": 0.1403, "step": 106 }, { "epoch": 2.44, "grad_norm": 1.7244219031686432, "learning_rate": 1e-05, "loss": 0.1493, "step": 107 }, { "epoch": 2.46, "grad_norm": 1.840637334327782, "learning_rate": 1e-05, "loss": 0.1555, "step": 108 }, { "epoch": 2.48, "grad_norm": 1.7138834314865232, "learning_rate": 1e-05, "loss": 0.1507, "step": 109 }, { "epoch": 2.51, "grad_norm": 1.8869772327906467, "learning_rate": 1e-05, "loss": 0.1519, "step": 110 }, { "epoch": 2.53, "grad_norm": 1.722031710475424, "learning_rate": 1e-05, "loss": 0.1388, "step": 111 }, { "epoch": 2.55, "grad_norm": 1.8294129933303556, "learning_rate": 1e-05, "loss": 0.1429, "step": 112 }, { "epoch": 2.58, "grad_norm": 1.913994003850658, "learning_rate": 1e-05, "loss": 0.1391, "step": 113 }, { "epoch": 2.6, "grad_norm": 1.9673824351683147, "learning_rate": 1e-05, "loss": 0.1374, "step": 114 }, { "epoch": 2.62, "grad_norm": 1.9673399496447597, "learning_rate": 1e-05, "loss": 0.1458, "step": 115 }, { "epoch": 2.64, "grad_norm": 1.8027392959029291, "learning_rate": 1e-05, "loss": 0.1351, "step": 116 }, { "epoch": 2.67, "grad_norm": 1.667565922975832, "learning_rate": 1e-05, "loss": 0.1455, "step": 117 }, { "epoch": 2.69, "grad_norm": 1.8116940749067432, "learning_rate": 1e-05, "loss": 0.1502, "step": 118 }, { "epoch": 2.71, "grad_norm": 1.9922161139058774, "learning_rate": 1e-05, "loss": 0.1422, "step": 119 }, { "epoch": 2.73, "grad_norm": 2.0124481055349044, "learning_rate": 1e-05, "loss": 0.1373, "step": 120 }, { "epoch": 2.76, "grad_norm": 1.800014793589776, "learning_rate": 1e-05, "loss": 0.1342, "step": 121 }, { "epoch": 2.78, "grad_norm": 1.7532349977825892, "learning_rate": 1e-05, "loss": 0.1262, "step": 122 }, { "epoch": 2.8, "grad_norm": 1.7857219830778754, "learning_rate": 1e-05, "loss": 0.1375, "step": 123 }, { "epoch": 2.83, "grad_norm": 1.7910722257498661, "learning_rate": 1e-05, "loss": 0.148, "step": 124 }, { "epoch": 2.85, "grad_norm": 1.6958921797341877, "learning_rate": 1e-05, "loss": 0.1287, "step": 125 }, { "epoch": 2.87, "grad_norm": 1.9268954167687131, "learning_rate": 1e-05, "loss": 0.1334, "step": 126 }, { "epoch": 2.89, "grad_norm": 1.7925408289852067, "learning_rate": 1e-05, "loss": 0.1445, "step": 127 }, { "epoch": 2.92, "grad_norm": 1.9380903694792881, "learning_rate": 1e-05, "loss": 0.1478, "step": 128 }, { "epoch": 2.94, "grad_norm": 1.8550832947077076, "learning_rate": 1e-05, "loss": 0.1185, "step": 129 }, { "epoch": 2.94, "step": 129, "total_flos": 56540883877888.0, "train_loss": 0.30087860679441647, "train_runtime": 2488.5787, "train_samples_per_second": 6.613, "train_steps_per_second": 0.052 } ], "logging_steps": 1.0, "max_steps": 129, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1.0, "total_flos": 56540883877888.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }