{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1548,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01937984496124031,
      "grad_norm": 3.144579840345173,
      "learning_rate": 2e-06,
      "loss": 0.7225,
      "step": 10
    },
    {
      "epoch": 0.03875968992248062,
      "grad_norm": 1.4635286655772066,
      "learning_rate": 2e-06,
      "loss": 0.6518,
      "step": 20
    },
    {
      "epoch": 0.05813953488372093,
      "grad_norm": 1.3391923941713981,
      "learning_rate": 2e-06,
      "loss": 0.6267,
      "step": 30
    },
    {
      "epoch": 0.07751937984496124,
      "grad_norm": 1.6579265991584125,
      "learning_rate": 2e-06,
      "loss": 0.6192,
      "step": 40
    },
    {
      "epoch": 0.09689922480620156,
      "grad_norm": 1.4605383147970585,
      "learning_rate": 2e-06,
      "loss": 0.6139,
      "step": 50
    },
    {
      "epoch": 0.11627906976744186,
      "grad_norm": 3.0164024994264924,
      "learning_rate": 2e-06,
      "loss": 0.6069,
      "step": 60
    },
    {
      "epoch": 0.13565891472868216,
      "grad_norm": 3.0735559741358043,
      "learning_rate": 2e-06,
      "loss": 0.6026,
      "step": 70
    },
    {
      "epoch": 0.15503875968992248,
      "grad_norm": 2.179805880693448,
      "learning_rate": 2e-06,
      "loss": 0.6052,
      "step": 80
    },
    {
      "epoch": 0.1744186046511628,
      "grad_norm": 1.362604075543814,
      "learning_rate": 2e-06,
      "loss": 0.5906,
      "step": 90
    },
    {
      "epoch": 0.1937984496124031,
      "grad_norm": 1.3800184946057543,
      "learning_rate": 2e-06,
      "loss": 0.5926,
      "step": 100
    },
    {
      "epoch": 0.2131782945736434,
      "grad_norm": 1.5751853135874878,
      "learning_rate": 2e-06,
      "loss": 0.5956,
      "step": 110
    },
    {
      "epoch": 0.23255813953488372,
      "grad_norm": 1.3581579120999256,
      "learning_rate": 2e-06,
      "loss": 0.587,
      "step": 120
    },
    {
      "epoch": 0.25193798449612403,
      "grad_norm": 1.4725915485732346,
      "learning_rate": 2e-06,
      "loss": 0.5969,
      "step": 130
    },
    {
      "epoch": 0.2713178294573643,
      "grad_norm": 2.3468819417923297,
      "learning_rate": 2e-06,
      "loss": 0.5835,
      "step": 140
    },
    {
      "epoch": 0.29069767441860467,
      "grad_norm": 1.7250281157703675,
      "learning_rate": 2e-06,
      "loss": 0.5835,
      "step": 150
    },
    {
      "epoch": 0.31007751937984496,
      "grad_norm": 1.697210370789681,
      "learning_rate": 2e-06,
      "loss": 0.5818,
      "step": 160
    },
    {
      "epoch": 0.32945736434108525,
      "grad_norm": 1.4878874477377635,
      "learning_rate": 2e-06,
      "loss": 0.5766,
      "step": 170
    },
    {
      "epoch": 0.3488372093023256,
      "grad_norm": 1.4577800027661107,
      "learning_rate": 2e-06,
      "loss": 0.581,
      "step": 180
    },
    {
      "epoch": 0.3682170542635659,
      "grad_norm": 1.5593039073057922,
      "learning_rate": 2e-06,
      "loss": 0.5743,
      "step": 190
    },
    {
      "epoch": 0.3875968992248062,
      "grad_norm": 1.5848277223563478,
      "learning_rate": 2e-06,
      "loss": 0.5818,
      "step": 200
    },
    {
      "epoch": 0.4069767441860465,
      "grad_norm": 1.8749305561105472,
      "learning_rate": 2e-06,
      "loss": 0.5794,
      "step": 210
    },
    {
      "epoch": 0.4263565891472868,
      "grad_norm": 1.9776238010410143,
      "learning_rate": 2e-06,
      "loss": 0.568,
      "step": 220
    },
    {
      "epoch": 0.44573643410852715,
      "grad_norm": 1.8367468157602185,
      "learning_rate": 2e-06,
      "loss": 0.5663,
      "step": 230
    },
    {
      "epoch": 0.46511627906976744,
      "grad_norm": 1.4796763755300497,
      "learning_rate": 2e-06,
      "loss": 0.5701,
      "step": 240
    },
    {
      "epoch": 0.4844961240310077,
      "grad_norm": 1.3277578070972216,
      "learning_rate": 2e-06,
      "loss": 0.5709,
      "step": 250
    },
    {
      "epoch": 0.5038759689922481,
      "grad_norm": 1.3619348918199545,
      "learning_rate": 2e-06,
      "loss": 0.5705,
      "step": 260
    },
    {
      "epoch": 0.5232558139534884,
      "grad_norm": 1.3242792342849865,
      "learning_rate": 2e-06,
      "loss": 0.5712,
      "step": 270
    },
    {
      "epoch": 0.5426356589147286,
      "grad_norm": 1.208195333886979,
      "learning_rate": 2e-06,
      "loss": 0.5666,
      "step": 280
    },
    {
      "epoch": 0.562015503875969,
      "grad_norm": 1.2067351750756312,
      "learning_rate": 2e-06,
      "loss": 0.5689,
      "step": 290
    },
    {
      "epoch": 0.5813953488372093,
      "grad_norm": 1.4904068289049572,
      "learning_rate": 2e-06,
      "loss": 0.5771,
      "step": 300
    },
    {
      "epoch": 0.6007751937984496,
      "grad_norm": 1.2308496450990478,
      "learning_rate": 2e-06,
      "loss": 0.5641,
      "step": 310
    },
    {
      "epoch": 0.6201550387596899,
      "grad_norm": 1.646171689094913,
      "learning_rate": 2e-06,
      "loss": 0.5615,
      "step": 320
    },
    {
      "epoch": 0.6395348837209303,
      "grad_norm": 1.486911287484255,
      "learning_rate": 2e-06,
      "loss": 0.5632,
      "step": 330
    },
    {
      "epoch": 0.6589147286821705,
      "grad_norm": 1.367582942415251,
      "learning_rate": 2e-06,
      "loss": 0.5592,
      "step": 340
    },
    {
      "epoch": 0.6782945736434108,
      "grad_norm": 1.6042437679665664,
      "learning_rate": 2e-06,
      "loss": 0.5563,
      "step": 350
    },
    {
      "epoch": 0.6976744186046512,
      "grad_norm": 1.3440772301905874,
      "learning_rate": 2e-06,
      "loss": 0.5611,
      "step": 360
    },
    {
      "epoch": 0.7170542635658915,
      "grad_norm": 1.5512652472297026,
      "learning_rate": 2e-06,
      "loss": 0.5665,
      "step": 370
    },
    {
      "epoch": 0.7364341085271318,
      "grad_norm": 1.2275175025786087,
      "learning_rate": 2e-06,
      "loss": 0.5672,
      "step": 380
    },
    {
      "epoch": 0.7558139534883721,
      "grad_norm": 1.584906298076075,
      "learning_rate": 2e-06,
      "loss": 0.5566,
      "step": 390
    },
    {
      "epoch": 0.7751937984496124,
      "grad_norm": 1.2472749697240508,
      "learning_rate": 2e-06,
      "loss": 0.5576,
      "step": 400
    },
    {
      "epoch": 0.7945736434108527,
      "grad_norm": 1.3014141414308267,
      "learning_rate": 2e-06,
      "loss": 0.5629,
      "step": 410
    },
    {
      "epoch": 0.813953488372093,
      "grad_norm": 1.1170221459908387,
      "learning_rate": 2e-06,
      "loss": 0.5579,
      "step": 420
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 1.2375437974864687,
      "learning_rate": 2e-06,
      "loss": 0.561,
      "step": 430
    },
    {
      "epoch": 0.8527131782945736,
      "grad_norm": 1.2886237743320625,
      "learning_rate": 2e-06,
      "loss": 0.5593,
      "step": 440
    },
    {
      "epoch": 0.872093023255814,
      "grad_norm": 1.2840806861492629,
      "learning_rate": 2e-06,
      "loss": 0.563,
      "step": 450
    },
    {
      "epoch": 0.8914728682170543,
      "grad_norm": 1.1374554276601674,
      "learning_rate": 2e-06,
      "loss": 0.5518,
      "step": 460
    },
    {
      "epoch": 0.9108527131782945,
      "grad_norm": 1.5460088934957399,
      "learning_rate": 2e-06,
      "loss": 0.5581,
      "step": 470
    },
    {
      "epoch": 0.9302325581395349,
      "grad_norm": 1.3649006878214347,
      "learning_rate": 2e-06,
      "loss": 0.5586,
      "step": 480
    },
    {
      "epoch": 0.9496124031007752,
      "grad_norm": 1.4127688073512672,
      "learning_rate": 2e-06,
      "loss": 0.552,
      "step": 490
    },
    {
      "epoch": 0.9689922480620154,
      "grad_norm": 1.6637865739686932,
      "learning_rate": 2e-06,
      "loss": 0.5529,
      "step": 500
    },
    {
      "epoch": 0.9883720930232558,
      "grad_norm": 1.1603715309408054,
      "learning_rate": 2e-06,
      "loss": 0.5575,
      "step": 510
    },
    {
      "epoch": 1.0077519379844961,
      "grad_norm": 1.9471029922347483,
      "learning_rate": 2e-06,
      "loss": 0.5363,
      "step": 520
    },
    {
      "epoch": 1.0271317829457365,
      "grad_norm": 1.501943317728709,
      "learning_rate": 2e-06,
      "loss": 0.5103,
      "step": 530
    },
    {
      "epoch": 1.0465116279069768,
      "grad_norm": 2.210385554433354,
      "learning_rate": 2e-06,
      "loss": 0.5083,
      "step": 540
    },
    {
      "epoch": 1.0658914728682172,
      "grad_norm": 1.4456437528958064,
      "learning_rate": 2e-06,
      "loss": 0.5059,
      "step": 550
    },
    {
      "epoch": 1.0852713178294573,
      "grad_norm": 1.3747977066368522,
      "learning_rate": 2e-06,
      "loss": 0.5169,
      "step": 560
    },
    {
      "epoch": 1.1046511627906976,
      "grad_norm": 1.3671143459062107,
      "learning_rate": 2e-06,
      "loss": 0.5102,
      "step": 570
    },
    {
      "epoch": 1.124031007751938,
      "grad_norm": 1.6112053859893298,
      "learning_rate": 2e-06,
      "loss": 0.5044,
      "step": 580
    },
    {
      "epoch": 1.1434108527131783,
      "grad_norm": 1.2311741203329047,
      "learning_rate": 2e-06,
      "loss": 0.5036,
      "step": 590
    },
    {
      "epoch": 1.1627906976744187,
      "grad_norm": 1.2615707450710407,
      "learning_rate": 2e-06,
      "loss": 0.5037,
      "step": 600
    },
    {
      "epoch": 1.1821705426356588,
      "grad_norm": 1.247560640905585,
      "learning_rate": 2e-06,
      "loss": 0.5135,
      "step": 610
    },
    {
      "epoch": 1.2015503875968991,
      "grad_norm": 1.4713651285134284,
      "learning_rate": 2e-06,
      "loss": 0.5078,
      "step": 620
    },
    {
      "epoch": 1.2209302325581395,
      "grad_norm": 1.8788503215188699,
      "learning_rate": 2e-06,
      "loss": 0.5078,
      "step": 630
    },
    {
      "epoch": 1.2403100775193798,
      "grad_norm": 1.9570944633210832,
      "learning_rate": 2e-06,
      "loss": 0.5088,
      "step": 640
    },
    {
      "epoch": 1.2596899224806202,
      "grad_norm": 1.519042994554157,
      "learning_rate": 2e-06,
      "loss": 0.5098,
      "step": 650
    },
    {
      "epoch": 1.2790697674418605,
      "grad_norm": 1.7929648758582426,
      "learning_rate": 2e-06,
      "loss": 0.5067,
      "step": 660
    },
    {
      "epoch": 1.2984496124031009,
      "grad_norm": 1.4987560500572878,
      "learning_rate": 2e-06,
      "loss": 0.5122,
      "step": 670
    },
    {
      "epoch": 1.3178294573643412,
      "grad_norm": 1.2086756990648462,
      "learning_rate": 2e-06,
      "loss": 0.5121,
      "step": 680
    },
    {
      "epoch": 1.3372093023255813,
      "grad_norm": 1.3866302425582229,
      "learning_rate": 2e-06,
      "loss": 0.5088,
      "step": 690
    },
    {
      "epoch": 1.3565891472868217,
      "grad_norm": 1.4943167093290222,
      "learning_rate": 2e-06,
      "loss": 0.5099,
      "step": 700
    },
    {
      "epoch": 1.375968992248062,
      "grad_norm": 1.4190704125897742,
      "learning_rate": 2e-06,
      "loss": 0.5136,
      "step": 710
    },
    {
      "epoch": 1.3953488372093024,
      "grad_norm": 1.201854415277148,
      "learning_rate": 2e-06,
      "loss": 0.513,
      "step": 720
    },
    {
      "epoch": 1.4147286821705427,
      "grad_norm": 1.5552807283991774,
      "learning_rate": 2e-06,
      "loss": 0.5087,
      "step": 730
    },
    {
      "epoch": 1.4341085271317828,
      "grad_norm": 1.2711570348118688,
      "learning_rate": 2e-06,
      "loss": 0.5148,
      "step": 740
    },
    {
      "epoch": 1.4534883720930232,
      "grad_norm": 1.5027858680147197,
      "learning_rate": 2e-06,
      "loss": 0.5101,
      "step": 750
    },
    {
      "epoch": 1.4728682170542635,
      "grad_norm": 1.3009800268853209,
      "learning_rate": 2e-06,
      "loss": 0.5075,
      "step": 760
    },
    {
      "epoch": 1.4922480620155039,
      "grad_norm": 1.195133004986583,
      "learning_rate": 2e-06,
      "loss": 0.5111,
      "step": 770
    },
    {
      "epoch": 1.5116279069767442,
      "grad_norm": 1.3544355872789209,
      "learning_rate": 2e-06,
      "loss": 0.5136,
      "step": 780
    },
    {
      "epoch": 1.5310077519379846,
      "grad_norm": 1.22732892168229,
      "learning_rate": 2e-06,
      "loss": 0.5107,
      "step": 790
    },
    {
      "epoch": 1.550387596899225,
      "grad_norm": 1.2500964364741391,
      "learning_rate": 2e-06,
      "loss": 0.5076,
      "step": 800
    },
    {
      "epoch": 1.5697674418604652,
      "grad_norm": 1.2443445799817128,
      "learning_rate": 2e-06,
      "loss": 0.5059,
      "step": 810
    },
    {
      "epoch": 1.5891472868217056,
      "grad_norm": 1.3642427980887037,
      "learning_rate": 2e-06,
      "loss": 0.5047,
      "step": 820
    },
    {
      "epoch": 1.6085271317829457,
      "grad_norm": 1.3132643052702235,
      "learning_rate": 2e-06,
      "loss": 0.5126,
      "step": 830
    },
    {
      "epoch": 1.627906976744186,
      "grad_norm": 1.245951139796731,
      "learning_rate": 2e-06,
      "loss": 0.5072,
      "step": 840
    },
    {
      "epoch": 1.6472868217054264,
      "grad_norm": 1.151992957120151,
      "learning_rate": 2e-06,
      "loss": 0.5119,
      "step": 850
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 1.44482076559048,
      "learning_rate": 2e-06,
      "loss": 0.5143,
      "step": 860
    },
    {
      "epoch": 1.6860465116279069,
      "grad_norm": 1.2014633120651383,
      "learning_rate": 2e-06,
      "loss": 0.5065,
      "step": 870
    },
    {
      "epoch": 1.7054263565891472,
      "grad_norm": 1.1421759011867347,
      "learning_rate": 2e-06,
      "loss": 0.5112,
      "step": 880
    },
    {
      "epoch": 1.7248062015503876,
      "grad_norm": 1.3662761429999075,
      "learning_rate": 2e-06,
      "loss": 0.5105,
      "step": 890
    },
    {
      "epoch": 1.744186046511628,
      "grad_norm": 1.731081249332488,
      "learning_rate": 2e-06,
      "loss": 0.5145,
      "step": 900
    },
    {
      "epoch": 1.7635658914728682,
      "grad_norm": 1.8386868390815672,
      "learning_rate": 2e-06,
      "loss": 0.5064,
      "step": 910
    },
    {
      "epoch": 1.7829457364341086,
      "grad_norm": 1.945733381736797,
      "learning_rate": 2e-06,
      "loss": 0.5092,
      "step": 920
    },
    {
      "epoch": 1.802325581395349,
      "grad_norm": 1.1658591371712888,
      "learning_rate": 2e-06,
      "loss": 0.5102,
      "step": 930
    },
    {
      "epoch": 1.8217054263565893,
      "grad_norm": 1.2964175995286236,
      "learning_rate": 2e-06,
      "loss": 0.5084,
      "step": 940
    },
    {
      "epoch": 1.8410852713178296,
      "grad_norm": 1.368071275379517,
      "learning_rate": 2e-06,
      "loss": 0.5109,
      "step": 950
    },
    {
      "epoch": 1.8604651162790697,
      "grad_norm": 1.150103079484035,
      "learning_rate": 2e-06,
      "loss": 0.5129,
      "step": 960
    },
    {
      "epoch": 1.87984496124031,
      "grad_norm": 1.741489344099722,
      "learning_rate": 2e-06,
      "loss": 0.5102,
      "step": 970
    },
    {
      "epoch": 1.8992248062015504,
      "grad_norm": 1.4368003237003146,
      "learning_rate": 2e-06,
      "loss": 0.5063,
      "step": 980
    },
    {
      "epoch": 1.9186046511627906,
      "grad_norm": 1.198714228722542,
      "learning_rate": 2e-06,
      "loss": 0.5125,
      "step": 990
    },
    {
      "epoch": 1.937984496124031,
      "grad_norm": 1.2050714331345398,
      "learning_rate": 2e-06,
      "loss": 0.5098,
      "step": 1000
    },
    {
      "epoch": 1.9573643410852712,
      "grad_norm": 1.2467265326889772,
      "learning_rate": 2e-06,
      "loss": 0.5111,
      "step": 1010
    },
    {
      "epoch": 1.9767441860465116,
      "grad_norm": 1.2245457081729416,
      "learning_rate": 2e-06,
      "loss": 0.5064,
      "step": 1020
    },
    {
      "epoch": 1.996124031007752,
      "grad_norm": 1.3046841459793526,
      "learning_rate": 2e-06,
      "loss": 0.5104,
      "step": 1030
    },
    {
      "epoch": 2.0155038759689923,
      "grad_norm": 1.69236342694069,
      "learning_rate": 2e-06,
      "loss": 0.4698,
      "step": 1040
    },
    {
      "epoch": 2.0348837209302326,
      "grad_norm": 1.482227404306843,
      "learning_rate": 2e-06,
      "loss": 0.4592,
      "step": 1050
    },
    {
      "epoch": 2.054263565891473,
      "grad_norm": 1.472873774310932,
      "learning_rate": 2e-06,
      "loss": 0.458,
      "step": 1060
    },
    {
      "epoch": 2.0736434108527133,
      "grad_norm": 1.5478690868120868,
      "learning_rate": 2e-06,
      "loss": 0.4595,
      "step": 1070
    },
    {
      "epoch": 2.0930232558139537,
      "grad_norm": 1.3832474558503782,
      "learning_rate": 2e-06,
      "loss": 0.4544,
      "step": 1080
    },
    {
      "epoch": 2.112403100775194,
      "grad_norm": 1.4739915975741762,
      "learning_rate": 2e-06,
      "loss": 0.4575,
      "step": 1090
    },
    {
      "epoch": 2.1317829457364343,
      "grad_norm": 1.4093716459091368,
      "learning_rate": 2e-06,
      "loss": 0.4574,
      "step": 1100
    },
    {
      "epoch": 2.1511627906976742,
      "grad_norm": 1.4597325989125942,
      "learning_rate": 2e-06,
      "loss": 0.4604,
      "step": 1110
    },
    {
      "epoch": 2.1705426356589146,
      "grad_norm": 1.2596618218820486,
      "learning_rate": 2e-06,
      "loss": 0.4622,
      "step": 1120
    },
    {
      "epoch": 2.189922480620155,
      "grad_norm": 1.8149063252784816,
      "learning_rate": 2e-06,
      "loss": 0.4592,
      "step": 1130
    },
    {
      "epoch": 2.2093023255813953,
      "grad_norm": 1.552162424271938,
      "learning_rate": 2e-06,
      "loss": 0.4572,
      "step": 1140
    },
    {
      "epoch": 2.2286821705426356,
      "grad_norm": 1.4310808330644234,
      "learning_rate": 2e-06,
      "loss": 0.4596,
      "step": 1150
    },
    {
      "epoch": 2.248062015503876,
      "grad_norm": 1.1994256997594486,
      "learning_rate": 2e-06,
      "loss": 0.4576,
      "step": 1160
    },
    {
      "epoch": 2.2674418604651163,
      "grad_norm": 1.539587341902846,
      "learning_rate": 2e-06,
      "loss": 0.4584,
      "step": 1170
    },
    {
      "epoch": 2.2868217054263567,
      "grad_norm": 1.333352666406639,
      "learning_rate": 2e-06,
      "loss": 0.4654,
      "step": 1180
    },
    {
      "epoch": 2.306201550387597,
      "grad_norm": 1.443832387313447,
      "learning_rate": 2e-06,
      "loss": 0.4606,
      "step": 1190
    },
    {
      "epoch": 2.3255813953488373,
      "grad_norm": 1.2947150626163912,
      "learning_rate": 2e-06,
      "loss": 0.4569,
      "step": 1200
    },
    {
      "epoch": 2.3449612403100777,
      "grad_norm": 1.2733884019792792,
      "learning_rate": 2e-06,
      "loss": 0.461,
      "step": 1210
    },
    {
      "epoch": 2.3643410852713176,
      "grad_norm": 1.2445241956347013,
      "learning_rate": 2e-06,
      "loss": 0.4646,
      "step": 1220
    },
    {
      "epoch": 2.383720930232558,
      "grad_norm": 1.3643853680636246,
      "learning_rate": 2e-06,
      "loss": 0.4598,
      "step": 1230
    },
    {
      "epoch": 2.4031007751937983,
      "grad_norm": 1.3508030176182113,
      "learning_rate": 2e-06,
      "loss": 0.4659,
      "step": 1240
    },
    {
      "epoch": 2.4224806201550386,
      "grad_norm": 1.4315788663602833,
      "learning_rate": 2e-06,
      "loss": 0.4606,
      "step": 1250
    },
    {
      "epoch": 2.441860465116279,
      "grad_norm": 1.2723100725181913,
      "learning_rate": 2e-06,
      "loss": 0.4629,
      "step": 1260
    },
    {
      "epoch": 2.4612403100775193,
      "grad_norm": 1.3005627010479606,
      "learning_rate": 2e-06,
      "loss": 0.4643,
      "step": 1270
    },
    {
      "epoch": 2.4806201550387597,
      "grad_norm": 1.3858018789769277,
      "learning_rate": 2e-06,
      "loss": 0.4647,
      "step": 1280
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.441079130557343,
      "learning_rate": 2e-06,
      "loss": 0.458,
      "step": 1290
    },
    {
      "epoch": 2.5193798449612403,
      "grad_norm": 1.20329198310741,
      "learning_rate": 2e-06,
      "loss": 0.4651,
      "step": 1300
    },
    {
      "epoch": 2.5387596899224807,
      "grad_norm": 1.2598982072621057,
      "learning_rate": 2e-06,
      "loss": 0.4584,
      "step": 1310
    },
    {
      "epoch": 2.558139534883721,
      "grad_norm": 1.2079452297185667,
      "learning_rate": 2e-06,
      "loss": 0.4707,
      "step": 1320
    },
    {
      "epoch": 2.5775193798449614,
      "grad_norm": 1.2529237840070613,
      "learning_rate": 2e-06,
      "loss": 0.4628,
      "step": 1330
    },
    {
      "epoch": 2.5968992248062017,
      "grad_norm": 1.524625077175533,
      "learning_rate": 2e-06,
      "loss": 0.4716,
      "step": 1340
    },
    {
      "epoch": 2.616279069767442,
      "grad_norm": 1.32372209621441,
      "learning_rate": 2e-06,
      "loss": 0.4629,
      "step": 1350
    },
    {
      "epoch": 2.6356589147286824,
      "grad_norm": 1.5832090394160376,
      "learning_rate": 2e-06,
      "loss": 0.4595,
      "step": 1360
    },
    {
      "epoch": 2.6550387596899228,
      "grad_norm": 1.2622980294640742,
      "learning_rate": 2e-06,
      "loss": 0.4605,
      "step": 1370
    },
    {
      "epoch": 2.6744186046511627,
      "grad_norm": 1.235197532731858,
      "learning_rate": 2e-06,
      "loss": 0.4687,
      "step": 1380
    },
    {
      "epoch": 2.693798449612403,
      "grad_norm": 1.2791276926102568,
      "learning_rate": 2e-06,
      "loss": 0.4603,
      "step": 1390
    },
    {
      "epoch": 2.7131782945736433,
      "grad_norm": 1.2282558958052718,
      "learning_rate": 2e-06,
      "loss": 0.4683,
      "step": 1400
    },
    {
      "epoch": 2.7325581395348837,
      "grad_norm": 1.3538612143623057,
      "learning_rate": 2e-06,
      "loss": 0.4601,
      "step": 1410
    },
    {
      "epoch": 2.751937984496124,
      "grad_norm": 1.3291955159874183,
      "learning_rate": 2e-06,
      "loss": 0.4617,
      "step": 1420
    },
    {
      "epoch": 2.7713178294573644,
      "grad_norm": 1.2058531615141939,
      "learning_rate": 2e-06,
      "loss": 0.4671,
      "step": 1430
    },
    {
      "epoch": 2.7906976744186047,
      "grad_norm": 1.3306346445143116,
      "learning_rate": 2e-06,
      "loss": 0.466,
      "step": 1440
    },
    {
      "epoch": 2.810077519379845,
      "grad_norm": 1.4572481853030812,
      "learning_rate": 2e-06,
      "loss": 0.4637,
      "step": 1450
    },
    {
      "epoch": 2.8294573643410854,
      "grad_norm": 1.4181222197142325,
      "learning_rate": 2e-06,
      "loss": 0.4707,
      "step": 1460
    },
    {
      "epoch": 2.8488372093023253,
      "grad_norm": 1.2329063854513693,
      "learning_rate": 2e-06,
      "loss": 0.4666,
      "step": 1470
    },
    {
      "epoch": 2.8682170542635657,
      "grad_norm": 1.4499016075739342,
      "learning_rate": 2e-06,
      "loss": 0.467,
      "step": 1480
    },
    {
      "epoch": 2.887596899224806,
      "grad_norm": 1.4376308105469073,
      "learning_rate": 2e-06,
      "loss": 0.4699,
      "step": 1490
    },
    {
      "epoch": 2.9069767441860463,
      "grad_norm": 1.3931682854746266,
      "learning_rate": 2e-06,
      "loss": 0.4592,
      "step": 1500
    },
    {
      "epoch": 2.9263565891472867,
      "grad_norm": 1.3848029694873079,
      "learning_rate": 2e-06,
      "loss": 0.4633,
      "step": 1510
    },
    {
      "epoch": 2.945736434108527,
      "grad_norm": 1.2870479115717015,
      "learning_rate": 2e-06,
      "loss": 0.4684,
      "step": 1520
    },
    {
      "epoch": 2.9651162790697674,
      "grad_norm": 1.348746763089945,
      "learning_rate": 2e-06,
      "loss": 0.4635,
      "step": 1530
    },
    {
      "epoch": 2.9844961240310077,
      "grad_norm": 1.2816015840213701,
      "learning_rate": 2e-06,
      "loss": 0.4687,
      "step": 1540
    },
    {
      "epoch": 3.0,
      "step": 1548,
      "total_flos": 2592120137318400.0,
      "train_loss": 0.5171875406913363,
      "train_runtime": 22202.3543,
      "train_samples_per_second": 35.668,
      "train_steps_per_second": 0.07
    }
  ],
  "logging_steps": 10,
  "max_steps": 1548,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2592120137318400.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}