{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.001564945226917,
  "eval_steps": 500,
  "global_step": 160,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006259780907668232,
      "grad_norm": 1.98288817639941,
      "learning_rate": 5.000000000000001e-07,
      "loss": 0.6599,
      "step": 1
    },
    {
      "epoch": 0.012519561815336464,
      "grad_norm": 2.008513351833145,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.6744,
      "step": 2
    },
    {
      "epoch": 0.018779342723004695,
      "grad_norm": 2.03144664277006,
      "learning_rate": 1.5e-06,
      "loss": 0.6721,
      "step": 3
    },
    {
      "epoch": 0.025039123630672927,
      "grad_norm": 1.9480725202469245,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.6577,
      "step": 4
    },
    {
      "epoch": 0.03129890453834116,
      "grad_norm": 1.8678118004054254,
      "learning_rate": 2.5e-06,
      "loss": 0.6484,
      "step": 5
    },
    {
      "epoch": 0.03755868544600939,
      "grad_norm": 1.6583787538868422,
      "learning_rate": 3e-06,
      "loss": 0.6174,
      "step": 6
    },
    {
      "epoch": 0.04381846635367762,
      "grad_norm": 1.5614405714896737,
      "learning_rate": 3.5000000000000004e-06,
      "loss": 0.5896,
      "step": 7
    },
    {
      "epoch": 0.050078247261345854,
      "grad_norm": 0.5773143053283745,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.5557,
      "step": 8
    },
    {
      "epoch": 0.056338028169014086,
      "grad_norm": 0.3043811484340276,
      "learning_rate": 4.5e-06,
      "loss": 0.541,
      "step": 9
    },
    {
      "epoch": 0.06259780907668232,
      "grad_norm": 0.8131531353366078,
      "learning_rate": 5e-06,
      "loss": 0.5595,
      "step": 10
    },
    {
      "epoch": 0.06885758998435054,
      "grad_norm": 0.424180567084822,
      "learning_rate": 5.500000000000001e-06,
      "loss": 0.5427,
      "step": 11
    },
    {
      "epoch": 0.07511737089201878,
      "grad_norm": 0.2913041969769501,
      "learning_rate": 6e-06,
      "loss": 0.5274,
      "step": 12
    },
    {
      "epoch": 0.081377151799687,
      "grad_norm": 0.34524917385772347,
      "learning_rate": 6.5000000000000004e-06,
      "loss": 0.5337,
      "step": 13
    },
    {
      "epoch": 0.08763693270735524,
      "grad_norm": 0.36469195568794854,
      "learning_rate": 7.000000000000001e-06,
      "loss": 0.5279,
      "step": 14
    },
    {
      "epoch": 0.09389671361502347,
      "grad_norm": 0.35209082489157323,
      "learning_rate": 7.5e-06,
      "loss": 0.5296,
      "step": 15
    },
    {
      "epoch": 0.10015649452269171,
      "grad_norm": 0.28086156745404856,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.5319,
      "step": 16
    },
    {
      "epoch": 0.10641627543035993,
      "grad_norm": 0.5457849868763605,
      "learning_rate": 8.500000000000002e-06,
      "loss": 0.5199,
      "step": 17
    },
    {
      "epoch": 0.11267605633802817,
      "grad_norm": 0.264594169690208,
      "learning_rate": 9e-06,
      "loss": 0.5234,
      "step": 18
    },
    {
      "epoch": 0.1189358372456964,
      "grad_norm": 0.2472097021778676,
      "learning_rate": 9.5e-06,
      "loss": 0.5248,
      "step": 19
    },
    {
      "epoch": 0.12519561815336464,
      "grad_norm": 0.2560549908847749,
      "learning_rate": 1e-05,
      "loss": 0.5159,
      "step": 20
    },
    {
      "epoch": 0.13145539906103287,
      "grad_norm": 0.4101523009554862,
      "learning_rate": 1.05e-05,
      "loss": 0.5058,
      "step": 21
    },
    {
      "epoch": 0.13771517996870108,
      "grad_norm": 0.22290433425318873,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 0.5099,
      "step": 22
    },
    {
      "epoch": 0.14397496087636932,
      "grad_norm": 0.2600145857043661,
      "learning_rate": 1.1500000000000002e-05,
      "loss": 0.5076,
      "step": 23
    },
    {
      "epoch": 0.15023474178403756,
      "grad_norm": 1.1584269063197106,
      "learning_rate": 1.2e-05,
      "loss": 0.5133,
      "step": 24
    },
    {
      "epoch": 0.1564945226917058,
      "grad_norm": 0.21303015786105067,
      "learning_rate": 1.25e-05,
      "loss": 0.5009,
      "step": 25
    },
    {
      "epoch": 0.162754303599374,
      "grad_norm": 2.5709430754104345,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 0.5067,
      "step": 26
    },
    {
      "epoch": 0.16901408450704225,
      "grad_norm": 0.42260631876680255,
      "learning_rate": 1.3500000000000001e-05,
      "loss": 0.4951,
      "step": 27
    },
    {
      "epoch": 0.1752738654147105,
      "grad_norm": 0.2122989372030049,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 0.4968,
      "step": 28
    },
    {
      "epoch": 0.18153364632237873,
      "grad_norm": 0.36382001881720555,
      "learning_rate": 1.45e-05,
      "loss": 0.5035,
      "step": 29
    },
    {
      "epoch": 0.18779342723004694,
      "grad_norm": 0.22094603076455596,
      "learning_rate": 1.5e-05,
      "loss": 0.5049,
      "step": 30
    },
    {
      "epoch": 0.19405320813771518,
      "grad_norm": 0.17188920546056902,
      "learning_rate": 1.55e-05,
      "loss": 0.4979,
      "step": 31
    },
    {
      "epoch": 0.20031298904538342,
      "grad_norm": 0.18515458685485783,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.4916,
      "step": 32
    },
    {
      "epoch": 0.20657276995305165,
      "grad_norm": 0.783356101762532,
      "learning_rate": 1.65e-05,
      "loss": 0.4929,
      "step": 33
    },
    {
      "epoch": 0.21283255086071987,
      "grad_norm": 0.19059224326067628,
      "learning_rate": 1.7000000000000003e-05,
      "loss": 0.4945,
      "step": 34
    },
    {
      "epoch": 0.2190923317683881,
      "grad_norm": 0.2275442577977743,
      "learning_rate": 1.75e-05,
      "loss": 0.4936,
      "step": 35
    },
    {
      "epoch": 0.22535211267605634,
      "grad_norm": 0.24798149507141237,
      "learning_rate": 1.8e-05,
      "loss": 0.4898,
      "step": 36
    },
    {
      "epoch": 0.23161189358372458,
      "grad_norm": 0.20682357544778035,
      "learning_rate": 1.85e-05,
      "loss": 0.4888,
      "step": 37
    },
    {
      "epoch": 0.2378716744913928,
      "grad_norm": 0.19518819682961547,
      "learning_rate": 1.9e-05,
      "loss": 0.4899,
      "step": 38
    },
    {
      "epoch": 0.24413145539906103,
      "grad_norm": 0.18423871547579748,
      "learning_rate": 1.9500000000000003e-05,
      "loss": 0.4868,
      "step": 39
    },
    {
      "epoch": 0.25039123630672927,
      "grad_norm": 0.1714820355275791,
      "learning_rate": 2e-05,
      "loss": 0.4795,
      "step": 40
    },
    {
      "epoch": 0.2566510172143975,
      "grad_norm": 0.19187618384155788,
      "learning_rate": 2.05e-05,
      "loss": 0.4821,
      "step": 41
    },
    {
      "epoch": 0.26291079812206575,
      "grad_norm": 0.1422378326228944,
      "learning_rate": 2.1e-05,
      "loss": 0.4829,
      "step": 42
    },
    {
      "epoch": 0.26917057902973396,
      "grad_norm": 0.14724977757162294,
      "learning_rate": 2.15e-05,
      "loss": 0.4811,
      "step": 43
    },
    {
      "epoch": 0.27543035993740217,
      "grad_norm": 0.16077227738580077,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 0.477,
      "step": 44
    },
    {
      "epoch": 0.28169014084507044,
      "grad_norm": 0.15993679259901028,
      "learning_rate": 2.25e-05,
      "loss": 0.4789,
      "step": 45
    },
    {
      "epoch": 0.28794992175273865,
      "grad_norm": 0.14385134377084383,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 0.4641,
      "step": 46
    },
    {
      "epoch": 0.2942097026604069,
      "grad_norm": 0.14244559356804792,
      "learning_rate": 2.35e-05,
      "loss": 0.4767,
      "step": 47
    },
    {
      "epoch": 0.3004694835680751,
      "grad_norm": 0.1481660114240819,
      "learning_rate": 2.4e-05,
      "loss": 0.4759,
      "step": 48
    },
    {
      "epoch": 0.30672926447574334,
      "grad_norm": 0.14195363156015162,
      "learning_rate": 2.45e-05,
      "loss": 0.471,
      "step": 49
    },
    {
      "epoch": 0.3129890453834116,
      "grad_norm": 0.15220552720898642,
      "learning_rate": 2.5e-05,
      "loss": 0.4715,
      "step": 50
    },
    {
      "epoch": 0.3192488262910798,
      "grad_norm": 0.13409784658365015,
      "learning_rate": 2.5500000000000003e-05,
      "loss": 0.4692,
      "step": 51
    },
    {
      "epoch": 0.325508607198748,
      "grad_norm": 0.13766694658848178,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 0.47,
      "step": 52
    },
    {
      "epoch": 0.3317683881064163,
      "grad_norm": 0.13097864679643595,
      "learning_rate": 2.6500000000000004e-05,
      "loss": 0.4651,
      "step": 53
    },
    {
      "epoch": 0.3380281690140845,
      "grad_norm": 0.13207003285729219,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 0.4714,
      "step": 54
    },
    {
      "epoch": 0.3442879499217527,
      "grad_norm": 0.14128427173382038,
      "learning_rate": 2.7500000000000004e-05,
      "loss": 0.4719,
      "step": 55
    },
    {
      "epoch": 0.350547730829421,
      "grad_norm": 0.13599048333974484,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 0.4657,
      "step": 56
    },
    {
      "epoch": 0.3568075117370892,
      "grad_norm": 0.1547358977814178,
      "learning_rate": 2.8499999999999998e-05,
      "loss": 0.4599,
      "step": 57
    },
    {
      "epoch": 0.36306729264475746,
      "grad_norm": 0.1357320992255676,
      "learning_rate": 2.9e-05,
      "loss": 0.4615,
      "step": 58
    },
    {
      "epoch": 0.36932707355242567,
      "grad_norm": 0.14465717873045295,
      "learning_rate": 2.95e-05,
      "loss": 0.4738,
      "step": 59
    },
    {
      "epoch": 0.3755868544600939,
      "grad_norm": 0.5900603203611421,
      "learning_rate": 3e-05,
      "loss": 0.4702,
      "step": 60
    },
    {
      "epoch": 0.38184663536776214,
      "grad_norm": 0.17729474902277623,
      "learning_rate": 3.05e-05,
      "loss": 0.4592,
      "step": 61
    },
    {
      "epoch": 0.38810641627543035,
      "grad_norm": 0.22055664690525556,
      "learning_rate": 3.1e-05,
      "loss": 0.47,
      "step": 62
    },
    {
      "epoch": 0.39436619718309857,
      "grad_norm": 0.22917133262033845,
      "learning_rate": 3.15e-05,
      "loss": 0.4668,
      "step": 63
    },
    {
      "epoch": 0.40062597809076683,
      "grad_norm": 0.23278911760289017,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 0.4691,
      "step": 64
    },
    {
      "epoch": 0.40688575899843504,
      "grad_norm": 0.23911939507472177,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 0.4662,
      "step": 65
    },
    {
      "epoch": 0.4131455399061033,
      "grad_norm": 0.19447041878105836,
      "learning_rate": 3.3e-05,
      "loss": 0.4633,
      "step": 66
    },
    {
      "epoch": 0.4194053208137715,
      "grad_norm": 0.17498726962496755,
      "learning_rate": 3.35e-05,
      "loss": 0.4654,
      "step": 67
    },
    {
      "epoch": 0.42566510172143973,
      "grad_norm": 0.24918375228266929,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 0.477,
      "step": 68
    },
    {
      "epoch": 0.431924882629108,
      "grad_norm": 0.2850664865678729,
      "learning_rate": 3.45e-05,
      "loss": 0.4648,
      "step": 69
    },
    {
      "epoch": 0.4381846635367762,
      "grad_norm": 0.27562629972396513,
      "learning_rate": 3.5e-05,
      "loss": 0.4667,
      "step": 70
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.22637202856522412,
      "learning_rate": 3.55e-05,
      "loss": 0.4653,
      "step": 71
    },
    {
      "epoch": 0.4507042253521127,
      "grad_norm": 0.2295442026728235,
      "learning_rate": 3.6e-05,
      "loss": 0.4622,
      "step": 72
    },
    {
      "epoch": 0.4569640062597809,
      "grad_norm": 0.26572612655057165,
      "learning_rate": 3.65e-05,
      "loss": 0.4673,
      "step": 73
    },
    {
      "epoch": 0.46322378716744916,
      "grad_norm": 0.2496817546620412,
      "learning_rate": 3.7e-05,
      "loss": 0.4611,
      "step": 74
    },
    {
      "epoch": 0.4694835680751174,
      "grad_norm": 0.21430723659191686,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.4637,
      "step": 75
    },
    {
      "epoch": 0.4757433489827856,
      "grad_norm": 0.1799606207168491,
      "learning_rate": 3.8e-05,
      "loss": 0.4612,
      "step": 76
    },
    {
      "epoch": 0.48200312989045385,
      "grad_norm": 0.2329269891744439,
      "learning_rate": 3.85e-05,
      "loss": 0.4569,
      "step": 77
    },
    {
      "epoch": 0.48826291079812206,
      "grad_norm": 0.2859704851548014,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 0.4677,
      "step": 78
    },
    {
      "epoch": 0.4945226917057903,
      "grad_norm": 0.3153100598444141,
      "learning_rate": 3.9500000000000005e-05,
      "loss": 0.465,
      "step": 79
    },
    {
      "epoch": 0.5007824726134585,
      "grad_norm": 0.3165950932566608,
      "learning_rate": 4e-05,
      "loss": 0.4755,
      "step": 80
    },
    {
      "epoch": 0.5070422535211268,
      "grad_norm": 0.3018577292754275,
      "learning_rate": 4.05e-05,
      "loss": 0.464,
      "step": 81
    },
    {
      "epoch": 0.513302034428795,
      "grad_norm": 0.39363558044861696,
      "learning_rate": 4.1e-05,
      "loss": 0.4701,
      "step": 82
    },
    {
      "epoch": 0.5195618153364632,
      "grad_norm": 0.44171413078007776,
      "learning_rate": 4.15e-05,
      "loss": 0.4697,
      "step": 83
    },
    {
      "epoch": 0.5258215962441315,
      "grad_norm": 0.4086449510625894,
      "learning_rate": 4.2e-05,
      "loss": 0.4611,
      "step": 84
    },
    {
      "epoch": 0.5320813771517997,
      "grad_norm": 0.3156689305434587,
      "learning_rate": 4.25e-05,
      "loss": 0.4633,
      "step": 85
    },
    {
      "epoch": 0.5383411580594679,
      "grad_norm": 0.37582415992669976,
      "learning_rate": 4.3e-05,
      "loss": 0.4689,
      "step": 86
    },
    {
      "epoch": 0.5446009389671361,
      "grad_norm": 0.3751728997948819,
      "learning_rate": 4.35e-05,
      "loss": 0.4658,
      "step": 87
    },
    {
      "epoch": 0.5508607198748043,
      "grad_norm": 0.2622604607003995,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 0.4641,
      "step": 88
    },
    {
      "epoch": 0.5571205007824727,
      "grad_norm": 0.27806769516567914,
      "learning_rate": 4.4500000000000004e-05,
      "loss": 0.4689,
      "step": 89
    },
    {
      "epoch": 0.5633802816901409,
      "grad_norm": 0.37193892514568727,
      "learning_rate": 4.5e-05,
      "loss": 0.4645,
      "step": 90
    },
    {
      "epoch": 0.5696400625978091,
      "grad_norm": 0.319234610988282,
      "learning_rate": 4.55e-05,
      "loss": 0.4697,
      "step": 91
    },
    {
      "epoch": 0.5758998435054773,
      "grad_norm": 0.24391835650924631,
      "learning_rate": 4.600000000000001e-05,
      "loss": 0.4605,
      "step": 92
    },
    {
      "epoch": 0.5821596244131455,
      "grad_norm": 0.3860119064167233,
      "learning_rate": 4.6500000000000005e-05,
      "loss": 0.4721,
      "step": 93
    },
    {
      "epoch": 0.5884194053208138,
      "grad_norm": 0.43978262147491526,
      "learning_rate": 4.7e-05,
      "loss": 0.4692,
      "step": 94
    },
    {
      "epoch": 0.594679186228482,
      "grad_norm": 0.2869109051387356,
      "learning_rate": 4.75e-05,
      "loss": 0.4644,
      "step": 95
    },
    {
      "epoch": 0.6009389671361502,
      "grad_norm": 0.33046074741721215,
      "learning_rate": 4.8e-05,
      "loss": 0.4711,
      "step": 96
    },
    {
      "epoch": 0.6071987480438185,
      "grad_norm": 0.3874189152162858,
      "learning_rate": 4.85e-05,
      "loss": 0.4694,
      "step": 97
    },
    {
      "epoch": 0.6134585289514867,
      "grad_norm": 0.46318630797414556,
      "learning_rate": 4.9e-05,
      "loss": 0.4741,
      "step": 98
    },
    {
      "epoch": 0.6197183098591549,
      "grad_norm": 0.6037444606802089,
      "learning_rate": 4.9500000000000004e-05,
      "loss": 0.4754,
      "step": 99
    },
    {
      "epoch": 0.6259780907668232,
      "grad_norm": 0.5037059436389102,
      "learning_rate": 5e-05,
      "loss": 0.4739,
      "step": 100
    },
    {
      "epoch": 0.6322378716744914,
      "grad_norm": 0.5631190436137139,
      "learning_rate": 4.9997404092249336e-05,
      "loss": 0.4699,
      "step": 101
    },
    {
      "epoch": 0.6384976525821596,
      "grad_norm": 0.39119483297638863,
      "learning_rate": 4.998961690809628e-05,
      "loss": 0.4703,
      "step": 102
    },
    {
      "epoch": 0.6447574334898278,
      "grad_norm": 0.40196303529424704,
      "learning_rate": 4.997664006472579e-05,
      "loss": 0.4749,
      "step": 103
    },
    {
      "epoch": 0.651017214397496,
      "grad_norm": 0.3397733110278162,
      "learning_rate": 4.9958476257072914e-05,
      "loss": 0.4654,
      "step": 104
    },
    {
      "epoch": 0.6572769953051644,
      "grad_norm": 0.2670846226151608,
      "learning_rate": 4.993512925726319e-05,
      "loss": 0.4716,
      "step": 105
    },
    {
      "epoch": 0.6635367762128326,
      "grad_norm": 0.36681659702689784,
      "learning_rate": 4.990660391382923e-05,
      "loss": 0.4704,
      "step": 106
    },
    {
      "epoch": 0.6697965571205008,
      "grad_norm": 0.26058292855009557,
      "learning_rate": 4.987290615070385e-05,
      "loss": 0.4647,
      "step": 107
    },
    {
      "epoch": 0.676056338028169,
      "grad_norm": 0.25205128219384887,
      "learning_rate": 4.983404296598979e-05,
      "loss": 0.4725,
      "step": 108
    },
    {
      "epoch": 0.6823161189358372,
      "grad_norm": 0.3208687051782515,
      "learning_rate": 4.9790022430506463e-05,
      "loss": 0.471,
      "step": 109
    },
    {
      "epoch": 0.6885758998435054,
      "grad_norm": 0.2306209439140453,
      "learning_rate": 4.974085368611381e-05,
      "loss": 0.473,
      "step": 110
    },
    {
      "epoch": 0.6948356807511737,
      "grad_norm": 0.21458192536569118,
      "learning_rate": 4.968654694381379e-05,
      "loss": 0.4692,
      "step": 111
    },
    {
      "epoch": 0.701095461658842,
      "grad_norm": 0.24400329234341836,
      "learning_rate": 4.962711348162987e-05,
      "loss": 0.4742,
      "step": 112
    },
    {
      "epoch": 0.7073552425665102,
      "grad_norm": 0.5445701250609367,
      "learning_rate": 4.956256564226487e-05,
      "loss": 0.4677,
      "step": 113
    },
    {
      "epoch": 0.7136150234741784,
      "grad_norm": 0.2485591152431222,
      "learning_rate": 4.949291683053769e-05,
      "loss": 0.478,
      "step": 114
    },
    {
      "epoch": 0.7198748043818466,
      "grad_norm": 0.2683190648451619,
      "learning_rate": 4.941818151059956e-05,
      "loss": 0.468,
      "step": 115
    },
    {
      "epoch": 0.7261345852895149,
      "grad_norm": 0.17377296116604452,
      "learning_rate": 4.933837520293017e-05,
      "loss": 0.4682,
      "step": 116
    },
    {
      "epoch": 0.7323943661971831,
      "grad_norm": 0.19892874090328266,
      "learning_rate": 4.9253514481114535e-05,
      "loss": 0.4716,
      "step": 117
    },
    {
      "epoch": 0.7386541471048513,
      "grad_norm": 0.22470516800088272,
      "learning_rate": 4.91636169684011e-05,
      "loss": 0.4807,
      "step": 118
    },
    {
      "epoch": 0.7449139280125195,
      "grad_norm": 0.23033947133081567,
      "learning_rate": 4.906870133404187e-05,
      "loss": 0.4721,
      "step": 119
    },
    {
      "epoch": 0.7511737089201878,
      "grad_norm": 0.2764527709442302,
      "learning_rate": 4.896878728941531e-05,
      "loss": 0.4693,
      "step": 120
    },
    {
      "epoch": 0.7574334898278561,
      "grad_norm": 0.28746556965081915,
      "learning_rate": 4.8863895583932836e-05,
      "loss": 0.4767,
      "step": 121
    },
    {
      "epoch": 0.7636932707355243,
      "grad_norm": 0.32061574884194566,
      "learning_rate": 4.875404800072977e-05,
      "loss": 0.4643,
      "step": 122
    },
    {
      "epoch": 0.7699530516431925,
      "grad_norm": 0.34181281337669966,
      "learning_rate": 4.86392673521415e-05,
      "loss": 0.4602,
      "step": 123
    },
    {
      "epoch": 0.7762128325508607,
      "grad_norm": 0.30941984507586506,
      "learning_rate": 4.8519577474966074e-05,
      "loss": 0.4711,
      "step": 124
    },
    {
      "epoch": 0.7824726134585289,
      "grad_norm": 0.23600978038755785,
      "learning_rate": 4.839500322551386e-05,
      "loss": 0.4696,
      "step": 125
    },
    {
      "epoch": 0.7887323943661971,
      "grad_norm": 0.2577164285099203,
      "learning_rate": 4.8265570474445636e-05,
      "loss": 0.4644,
      "step": 126
    },
    {
      "epoch": 0.7949921752738655,
      "grad_norm": 0.27823451721774306,
      "learning_rate": 4.813130610139994e-05,
      "loss": 0.479,
      "step": 127
    },
    {
      "epoch": 0.8012519561815337,
      "grad_norm": 0.22061524932206344,
      "learning_rate": 4.7992237989410904e-05,
      "loss": 0.4711,
      "step": 128
    },
    {
      "epoch": 0.8075117370892019,
      "grad_norm": 0.20216340578684158,
      "learning_rate": 4.784839501911771e-05,
      "loss": 0.468,
      "step": 129
    },
    {
      "epoch": 0.8137715179968701,
      "grad_norm": 0.27542745611047786,
      "learning_rate": 4.7699807062766876e-05,
      "loss": 0.4754,
      "step": 130
    },
    {
      "epoch": 0.8200312989045383,
      "grad_norm": 0.21954180738847087,
      "learning_rate": 4.75465049780086e-05,
      "loss": 0.4595,
      "step": 131
    },
    {
      "epoch": 0.8262910798122066,
      "grad_norm": 0.19430624161738624,
      "learning_rate": 4.738852060148849e-05,
      "loss": 0.4747,
      "step": 132
    },
    {
      "epoch": 0.8325508607198748,
      "grad_norm": 0.1884671644058954,
      "learning_rate": 4.722588674223594e-05,
      "loss": 0.4748,
      "step": 133
    },
    {
      "epoch": 0.838810641627543,
      "grad_norm": 0.20913369047927102,
      "learning_rate": 4.7058637174850604e-05,
      "loss": 0.4653,
      "step": 134
    },
    {
      "epoch": 0.8450704225352113,
      "grad_norm": 0.19564021089464265,
      "learning_rate": 4.688680663248837e-05,
      "loss": 0.4644,
      "step": 135
    },
    {
      "epoch": 0.8513302034428795,
      "grad_norm": 0.17437877798570775,
      "learning_rate": 4.671043079964815e-05,
      "loss": 0.4666,
      "step": 136
    },
    {
      "epoch": 0.8575899843505478,
      "grad_norm": 0.18658537333186465,
      "learning_rate": 4.652954630476127e-05,
      "loss": 0.463,
      "step": 137
    },
    {
      "epoch": 0.863849765258216,
      "grad_norm": 0.1916983418252378,
      "learning_rate": 4.634419071258472e-05,
      "loss": 0.4801,
      "step": 138
    },
    {
      "epoch": 0.8701095461658842,
      "grad_norm": 0.18269150591223743,
      "learning_rate": 4.615440251639995e-05,
      "loss": 0.465,
      "step": 139
    },
    {
      "epoch": 0.8763693270735524,
      "grad_norm": 0.19124021712384207,
      "learning_rate": 4.5960221130018946e-05,
      "loss": 0.4624,
      "step": 140
    },
    {
      "epoch": 0.8826291079812206,
      "grad_norm": 0.17751289300487907,
      "learning_rate": 4.576168687959895e-05,
      "loss": 0.4667,
      "step": 141
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.16256598664527863,
      "learning_rate": 4.555884099526794e-05,
      "loss": 0.4724,
      "step": 142
    },
    {
      "epoch": 0.8951486697965572,
      "grad_norm": 0.17306660659668968,
      "learning_rate": 4.535172560256218e-05,
      "loss": 0.4764,
      "step": 143
    },
    {
      "epoch": 0.9014084507042254,
      "grad_norm": 0.15311694878287935,
      "learning_rate": 4.5140383713677916e-05,
      "loss": 0.4633,
      "step": 144
    },
    {
      "epoch": 0.9076682316118936,
      "grad_norm": 0.16327033693685952,
      "learning_rate": 4.492485921853894e-05,
      "loss": 0.4626,
      "step": 145
    },
    {
      "epoch": 0.9139280125195618,
      "grad_norm": 0.1577015015575217,
      "learning_rate": 4.4705196875681854e-05,
      "loss": 0.465,
      "step": 146
    },
    {
      "epoch": 0.92018779342723,
      "grad_norm": 0.14976303947345634,
      "learning_rate": 4.448144230296093e-05,
      "loss": 0.4732,
      "step": 147
    },
    {
      "epoch": 0.9264475743348983,
      "grad_norm": 0.1799041852337434,
      "learning_rate": 4.425364196807451e-05,
      "loss": 0.4638,
      "step": 148
    },
    {
      "epoch": 0.9327073552425665,
      "grad_norm": 0.25582934784311545,
      "learning_rate": 4.402184317891501e-05,
      "loss": 0.4687,
      "step": 149
    },
    {
      "epoch": 0.9389671361502347,
      "grad_norm": 0.14767269207211267,
      "learning_rate": 4.37860940737443e-05,
      "loss": 0.4622,
      "step": 150
    },
    {
      "epoch": 0.945226917057903,
      "grad_norm": 0.18510146862998086,
      "learning_rate": 4.354644361119672e-05,
      "loss": 0.4714,
      "step": 151
    },
    {
      "epoch": 0.9514866979655712,
      "grad_norm": 0.1834113544053396,
      "learning_rate": 4.330294156011172e-05,
      "loss": 0.4665,
      "step": 152
    },
    {
      "epoch": 0.9577464788732394,
      "grad_norm": 0.16106024098596552,
      "learning_rate": 4.305563848919824e-05,
      "loss": 0.4612,
      "step": 153
    },
    {
      "epoch": 0.9640062597809077,
      "grad_norm": 0.1582714001537092,
      "learning_rate": 4.2804585756532965e-05,
      "loss": 0.4656,
      "step": 154
    },
    {
      "epoch": 0.9702660406885759,
      "grad_norm": 0.1838011411088347,
      "learning_rate": 4.254983549889467e-05,
      "loss": 0.4585,
      "step": 155
    },
    {
      "epoch": 0.9765258215962441,
      "grad_norm": 0.22256207898681857,
      "learning_rate": 4.2291440620936796e-05,
      "loss": 0.4712,
      "step": 156
    },
    {
      "epoch": 0.9827856025039123,
      "grad_norm": 0.16842112143070276,
      "learning_rate": 4.2029454784200676e-05,
      "loss": 0.4691,
      "step": 157
    },
    {
      "epoch": 0.9890453834115805,
      "grad_norm": 0.15122887832488566,
      "learning_rate": 4.176393239597144e-05,
      "loss": 0.4778,
      "step": 158
    },
    {
      "epoch": 0.9953051643192489,
      "grad_norm": 0.1902639072378955,
      "learning_rate": 4.149492859797912e-05,
      "loss": 0.4688,
      "step": 159
    },
    {
      "epoch": 1.001564945226917,
      "grad_norm": 0.17200971150006397,
      "learning_rate": 4.122249925494726e-05,
      "loss": 0.464,
      "step": 160
    }
  ],
  "logging_steps": 1,
  "max_steps": 318,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 16,
  "total_flos": 1.0356139229184e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|