{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.935672514619883, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 2.949489116668701, "learning_rate": 0.0002, "loss": 6.8731, "step": 1 }, { "epoch": 0.02, "grad_norm": 1.4809715747833252, "learning_rate": 0.0002, "loss": 5.7442, "step": 2 }, { "epoch": 0.03, "grad_norm": 2.701913595199585, "learning_rate": 0.0002, "loss": 5.3638, "step": 3 }, { "epoch": 0.04, "grad_norm": 2.2592666149139404, "learning_rate": 0.0002, "loss": 4.7664, "step": 4 }, { "epoch": 0.05, "grad_norm": 1.6057344675064087, "learning_rate": 0.0002, "loss": 4.7581, "step": 5 }, { "epoch": 0.06, "grad_norm": 1.9201135635375977, "learning_rate": 0.0002, "loss": 4.5088, "step": 6 }, { "epoch": 0.07, "grad_norm": 2.3088886737823486, "learning_rate": 0.0002, "loss": 4.1351, "step": 7 }, { "epoch": 0.07, "grad_norm": 3.622396945953369, "learning_rate": 0.0002, "loss": 4.5207, "step": 8 }, { "epoch": 0.08, "grad_norm": 5.80634880065918, "learning_rate": 0.0002, "loss": 4.1157, "step": 9 }, { "epoch": 0.09, "grad_norm": 4.79710578918457, "learning_rate": 0.0002, "loss": 3.3101, "step": 10 }, { "epoch": 0.1, "grad_norm": 7.330948352813721, "learning_rate": 0.0002, "loss": 3.4536, "step": 11 }, { "epoch": 0.11, "grad_norm": 6.117642879486084, "learning_rate": 0.0002, "loss": 3.1297, "step": 12 }, { "epoch": 0.12, "grad_norm": 4.200116157531738, "learning_rate": 0.0002, "loss": 2.5508, "step": 13 }, { "epoch": 0.13, "grad_norm": 3.4331789016723633, "learning_rate": 0.0002, "loss": 2.2032, "step": 14 }, { "epoch": 0.14, "grad_norm": 0.8927509784698486, "learning_rate": 0.0002, "loss": 2.1005, "step": 15 }, { "epoch": 0.15, "grad_norm": 0.9786716103553772, "learning_rate": 0.0002, "loss": 2.1406, "step": 16 }, { "epoch": 0.16, "grad_norm": 0.8695776462554932, "learning_rate": 0.0002, "loss": 1.9371, "step": 17 }, { "epoch": 0.17, "grad_norm": 0.8738767504692078, "learning_rate": 0.0002, "loss": 1.9745, "step": 18 }, { "epoch": 0.18, "grad_norm": 7.5855231285095215, "learning_rate": 0.0002, "loss": 2.0879, "step": 19 }, { "epoch": 0.19, "grad_norm": 1.1819020509719849, "learning_rate": 0.0002, "loss": 1.903, "step": 20 }, { "epoch": 0.2, "grad_norm": 1.0826493501663208, "learning_rate": 0.0002, "loss": 1.9642, "step": 21 }, { "epoch": 0.21, "grad_norm": 0.9060399532318115, "learning_rate": 0.0002, "loss": 1.9963, "step": 22 }, { "epoch": 0.22, "grad_norm": 0.7423937916755676, "learning_rate": 0.0002, "loss": 1.9192, "step": 23 }, { "epoch": 0.22, "grad_norm": 0.6615855097770691, "learning_rate": 0.0002, "loss": 1.8811, "step": 24 }, { "epoch": 0.23, "grad_norm": 0.700908362865448, "learning_rate": 0.0002, "loss": 1.8524, "step": 25 }, { "epoch": 0.24, "grad_norm": 0.5277789831161499, "learning_rate": 0.0002, "loss": 1.8696, "step": 26 }, { "epoch": 0.25, "grad_norm": 0.42331141233444214, "learning_rate": 0.0002, "loss": 1.7229, "step": 27 }, { "epoch": 0.26, "grad_norm": 0.42249181866645813, "learning_rate": 0.0002, "loss": 1.7733, "step": 28 }, { "epoch": 0.27, "grad_norm": 0.4419895112514496, "learning_rate": 0.0002, "loss": 1.7343, "step": 29 }, { "epoch": 0.28, "grad_norm": 0.5018438100814819, "learning_rate": 0.0002, "loss": 1.6586, "step": 30 }, { "epoch": 0.29, "grad_norm": 0.5385031700134277, "learning_rate": 0.0002, "loss": 1.7018, "step": 31 }, { "epoch": 0.3, "grad_norm": 0.4651014506816864, "learning_rate": 0.0002, "loss": 1.7329, "step": 32 }, { "epoch": 0.31, "grad_norm": 0.6264488101005554, "learning_rate": 0.0002, "loss": 1.5911, "step": 33 }, { "epoch": 0.32, "grad_norm": 0.46918559074401855, "learning_rate": 0.0002, "loss": 1.6609, "step": 34 }, { "epoch": 0.33, "grad_norm": 0.6417805552482605, "learning_rate": 0.0002, "loss": 1.6976, "step": 35 }, { "epoch": 0.34, "grad_norm": 0.5029424428939819, "learning_rate": 0.0002, "loss": 1.7227, "step": 36 }, { "epoch": 0.35, "grad_norm": 0.6494549512863159, "learning_rate": 0.0002, "loss": 1.6332, "step": 37 }, { "epoch": 0.36, "grad_norm": 0.5361642241477966, "learning_rate": 0.0002, "loss": 1.6417, "step": 38 }, { "epoch": 0.36, "grad_norm": 0.8852269649505615, "learning_rate": 0.0002, "loss": 1.7352, "step": 39 }, { "epoch": 0.37, "grad_norm": 0.72622150182724, "learning_rate": 0.0002, "loss": 1.6965, "step": 40 }, { "epoch": 0.38, "grad_norm": 0.578904390335083, "learning_rate": 0.0002, "loss": 1.7407, "step": 41 }, { "epoch": 0.39, "grad_norm": 0.5193885564804077, "learning_rate": 0.0002, "loss": 1.6437, "step": 42 }, { "epoch": 0.4, "grad_norm": 0.5035353302955627, "learning_rate": 0.0002, "loss": 1.5735, "step": 43 }, { "epoch": 0.41, "grad_norm": 0.8882142305374146, "learning_rate": 0.0002, "loss": 1.7808, "step": 44 }, { "epoch": 0.42, "grad_norm": 0.5434024333953857, "learning_rate": 0.0002, "loss": 1.638, "step": 45 }, { "epoch": 0.43, "grad_norm": 0.7970778346061707, "learning_rate": 0.0002, "loss": 1.621, "step": 46 }, { "epoch": 0.44, "grad_norm": 0.5033506751060486, "learning_rate": 0.0002, "loss": 1.6199, "step": 47 }, { "epoch": 0.45, "grad_norm": 0.8881465196609497, "learning_rate": 0.0002, "loss": 1.7907, "step": 48 }, { "epoch": 0.46, "grad_norm": 0.741858184337616, "learning_rate": 0.0002, "loss": 1.6585, "step": 49 }, { "epoch": 0.47, "grad_norm": 0.8896893858909607, "learning_rate": 0.0002, "loss": 1.7583, "step": 50 }, { "epoch": 0.48, "grad_norm": 0.6767799854278564, "learning_rate": 0.0002, "loss": 1.6116, "step": 51 }, { "epoch": 0.49, "grad_norm": 0.6831919550895691, "learning_rate": 0.0002, "loss": 1.5806, "step": 52 }, { "epoch": 0.5, "grad_norm": 0.6259914040565491, "learning_rate": 0.0002, "loss": 1.6619, "step": 53 }, { "epoch": 0.51, "grad_norm": 0.9376181960105896, "learning_rate": 0.0002, "loss": 1.6375, "step": 54 }, { "epoch": 0.51, "grad_norm": 0.7372838854789734, "learning_rate": 0.0002, "loss": 1.5599, "step": 55 }, { "epoch": 0.52, "grad_norm": 0.5478132367134094, "learning_rate": 0.0002, "loss": 1.6093, "step": 56 }, { "epoch": 0.53, "grad_norm": 0.9254269599914551, "learning_rate": 0.0002, "loss": 1.7051, "step": 57 }, { "epoch": 0.54, "grad_norm": 0.7650561332702637, "learning_rate": 0.0002, "loss": 1.6095, "step": 58 }, { "epoch": 0.55, "grad_norm": 0.612433135509491, "learning_rate": 0.0002, "loss": 1.5595, "step": 59 }, { "epoch": 0.56, "grad_norm": 0.4704805314540863, "learning_rate": 0.0002, "loss": 1.6785, "step": 60 }, { "epoch": 0.57, "grad_norm": 0.705121636390686, "learning_rate": 0.0002, "loss": 1.6318, "step": 61 }, { "epoch": 0.58, "grad_norm": 0.7150620222091675, "learning_rate": 0.0002, "loss": 1.5746, "step": 62 }, { "epoch": 0.59, "grad_norm": 0.5674924254417419, "learning_rate": 0.0002, "loss": 1.5223, "step": 63 }, { "epoch": 0.6, "grad_norm": 0.5296141505241394, "learning_rate": 0.0002, "loss": 1.5045, "step": 64 }, { "epoch": 0.61, "grad_norm": 0.43603238463401794, "learning_rate": 0.0002, "loss": 1.6686, "step": 65 }, { "epoch": 0.62, "grad_norm": 0.3482309877872467, "learning_rate": 0.0002, "loss": 1.6189, "step": 66 }, { "epoch": 0.63, "grad_norm": 0.4482012987136841, "learning_rate": 0.0002, "loss": 1.6339, "step": 67 }, { "epoch": 0.64, "grad_norm": 0.47700467705726624, "learning_rate": 0.0002, "loss": 1.5819, "step": 68 }, { "epoch": 0.65, "grad_norm": 0.5465943813323975, "learning_rate": 0.0002, "loss": 1.5073, "step": 69 }, { "epoch": 0.65, "grad_norm": 0.41377145051956177, "learning_rate": 0.0002, "loss": 1.5552, "step": 70 }, { "epoch": 0.66, "grad_norm": 0.5941596031188965, "learning_rate": 0.0002, "loss": 1.505, "step": 71 }, { "epoch": 0.67, "grad_norm": 0.6071710586547852, "learning_rate": 0.0002, "loss": 1.558, "step": 72 }, { "epoch": 0.68, "grad_norm": 0.5148259401321411, "learning_rate": 0.0002, "loss": 1.4633, "step": 73 }, { "epoch": 0.69, "grad_norm": 1.30810546875, "learning_rate": 0.0002, "loss": 1.4482, "step": 74 }, { "epoch": 0.7, "grad_norm": 1.4432346820831299, "learning_rate": 0.0002, "loss": 1.5734, "step": 75 }, { "epoch": 0.71, "grad_norm": 0.689092218875885, "learning_rate": 0.0002, "loss": 1.5621, "step": 76 }, { "epoch": 0.72, "grad_norm": 0.6650224328041077, "learning_rate": 0.0002, "loss": 1.5063, "step": 77 }, { "epoch": 0.73, "grad_norm": 0.728783905506134, "learning_rate": 0.0002, "loss": 1.518, "step": 78 }, { "epoch": 0.74, "grad_norm": 0.7941334247589111, "learning_rate": 0.0002, "loss": 1.5284, "step": 79 }, { "epoch": 0.75, "grad_norm": 0.8526914119720459, "learning_rate": 0.0002, "loss": 1.4954, "step": 80 }, { "epoch": 0.76, "grad_norm": 0.9384956359863281, "learning_rate": 0.0002, "loss": 1.5753, "step": 81 }, { "epoch": 0.77, "grad_norm": 0.5513901114463806, "learning_rate": 0.0002, "loss": 1.5513, "step": 82 }, { "epoch": 0.78, "grad_norm": 0.7507538795471191, "learning_rate": 0.0002, "loss": 1.5172, "step": 83 }, { "epoch": 0.79, "grad_norm": 0.40390703082084656, "learning_rate": 0.0002, "loss": 1.5035, "step": 84 }, { "epoch": 0.8, "grad_norm": 0.8725360631942749, "learning_rate": 0.0002, "loss": 1.6118, "step": 85 }, { "epoch": 0.8, "grad_norm": 0.4920073449611664, "learning_rate": 0.0002, "loss": 1.5111, "step": 86 }, { "epoch": 0.81, "grad_norm": 0.5495926737785339, "learning_rate": 0.0002, "loss": 1.5781, "step": 87 }, { "epoch": 0.82, "grad_norm": 0.5326025485992432, "learning_rate": 0.0002, "loss": 1.6479, "step": 88 }, { "epoch": 0.83, "grad_norm": 0.3922731578350067, "learning_rate": 0.0002, "loss": 1.6432, "step": 89 }, { "epoch": 0.84, "grad_norm": 1.0725717544555664, "learning_rate": 0.0002, "loss": 1.6326, "step": 90 }, { "epoch": 0.85, "grad_norm": 0.849716305732727, "learning_rate": 0.0002, "loss": 1.5013, "step": 91 }, { "epoch": 0.86, "grad_norm": 0.3658522963523865, "learning_rate": 0.0002, "loss": 1.5405, "step": 92 }, { "epoch": 0.87, "grad_norm": 0.7757679224014282, "learning_rate": 0.0002, "loss": 1.5823, "step": 93 }, { "epoch": 0.88, "grad_norm": 0.5468668341636658, "learning_rate": 0.0002, "loss": 1.5345, "step": 94 }, { "epoch": 0.89, "grad_norm": 0.28030768036842346, "learning_rate": 0.0002, "loss": 1.4525, "step": 95 }, { "epoch": 0.9, "grad_norm": 0.636431097984314, "learning_rate": 0.0002, "loss": 1.5195, "step": 96 }, { "epoch": 0.91, "grad_norm": 0.5017719268798828, "learning_rate": 0.0002, "loss": 1.4132, "step": 97 }, { "epoch": 0.92, "grad_norm": 0.443892240524292, "learning_rate": 0.0002, "loss": 1.506, "step": 98 }, { "epoch": 0.93, "grad_norm": 0.575356125831604, "learning_rate": 0.0002, "loss": 1.5295, "step": 99 }, { "epoch": 0.94, "grad_norm": 0.4003719687461853, "learning_rate": 0.0002, "loss": 1.491, "step": 100 } ], "logging_steps": 1, "max_steps": 106, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 1.563282263506944e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }