diff --git "a/checkpoint-13000/trainer_state.json" "b/checkpoint-13000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-13000/trainer_state.json" @@ -0,0 +1,9285 @@ +{ + "best_metric": 0.3308734893798828, + "best_model_checkpoint": "results_mt5XLSum_augmented/checkpoint-13000", + "epoch": 9.285714285714286, + "eval_steps": 1000, + "global_step": 13000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007142857142857143, + "grad_norm": 10.661067962646484, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.8382, + "step": 10 + }, + { + "epoch": 0.014285714285714285, + "grad_norm": 11.73471450805664, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.6838, + "step": 20 + }, + { + "epoch": 0.02142857142857143, + "grad_norm": 13.098968505859375, + "learning_rate": 3e-06, + "loss": 1.86, + "step": 30 + }, + { + "epoch": 0.02857142857142857, + "grad_norm": 10.79481315612793, + "learning_rate": 4.000000000000001e-06, + "loss": 1.7851, + "step": 40 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 11.62800121307373, + "learning_rate": 5e-06, + "loss": 1.4648, + "step": 50 + }, + { + "epoch": 0.04285714285714286, + "grad_norm": 9.00180721282959, + "learning_rate": 6e-06, + "loss": 1.4355, + "step": 60 + }, + { + "epoch": 0.05, + "grad_norm": 11.218201637268066, + "learning_rate": 7.000000000000001e-06, + "loss": 1.3977, + "step": 70 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 7.159872531890869, + "learning_rate": 8.000000000000001e-06, + "loss": 0.9774, + "step": 80 + }, + { + "epoch": 0.06428571428571428, + "grad_norm": 6.163649559020996, + "learning_rate": 9e-06, + "loss": 0.8556, + "step": 90 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 4.800461292266846, + "learning_rate": 1e-05, + "loss": 0.8627, + "step": 100 + }, + { + "epoch": 0.07857142857142857, + "grad_norm": 4.373474597930908, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.7674, + "step": 110 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 4.36292839050293, + "learning_rate": 1.2e-05, + "loss": 0.7035, + "step": 120 + }, + { + "epoch": 0.09285714285714286, + "grad_norm": 4.634104251861572, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.9197, + "step": 130 + }, + { + "epoch": 0.1, + "grad_norm": 4.442883491516113, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.7712, + "step": 140 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 3.7063419818878174, + "learning_rate": 1.5e-05, + "loss": 0.8602, + "step": 150 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 3.7267696857452393, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.6758, + "step": 160 + }, + { + "epoch": 0.12142857142857143, + "grad_norm": 3.7582225799560547, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.8091, + "step": 170 + }, + { + "epoch": 0.12857142857142856, + "grad_norm": 2.829885482788086, + "learning_rate": 1.8e-05, + "loss": 0.8014, + "step": 180 + }, + { + "epoch": 0.1357142857142857, + "grad_norm": 3.4555258750915527, + "learning_rate": 1.9e-05, + "loss": 0.5562, + "step": 190 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 3.120464563369751, + "learning_rate": 2e-05, + "loss": 0.6391, + "step": 200 + }, + { + "epoch": 0.15, + "grad_norm": 2.8185417652130127, + "learning_rate": 2.1e-05, + "loss": 0.6501, + "step": 210 + }, + { + "epoch": 0.15714285714285714, + "grad_norm": 2.7110323905944824, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.9029, + "step": 220 + }, + { + "epoch": 0.16428571428571428, + "grad_norm": 3.336864709854126, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.6938, + "step": 230 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 2.9769392013549805, + "learning_rate": 2.4e-05, + "loss": 0.6322, + "step": 240 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 2.5426135063171387, + "learning_rate": 2.5e-05, + "loss": 0.752, + "step": 250 + }, + { + "epoch": 0.18571428571428572, + "grad_norm": 3.2473714351654053, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.5993, + "step": 260 + }, + { + "epoch": 0.19285714285714287, + "grad_norm": 2.9979186058044434, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.5928, + "step": 270 + }, + { + "epoch": 0.2, + "grad_norm": 3.1635003089904785, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.4335, + "step": 280 + }, + { + "epoch": 0.20714285714285716, + "grad_norm": 4.114761829376221, + "learning_rate": 2.9e-05, + "loss": 0.5023, + "step": 290 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 2.0567097663879395, + "learning_rate": 3e-05, + "loss": 0.5124, + "step": 300 + }, + { + "epoch": 0.22142857142857142, + "grad_norm": 3.0209622383117676, + "learning_rate": 3.1e-05, + "loss": 0.5092, + "step": 310 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 1.8497462272644043, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.6075, + "step": 320 + }, + { + "epoch": 0.2357142857142857, + "grad_norm": 1.6237268447875977, + "learning_rate": 3.3e-05, + "loss": 0.5343, + "step": 330 + }, + { + "epoch": 0.24285714285714285, + "grad_norm": 2.9820289611816406, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.5516, + "step": 340 + }, + { + "epoch": 0.25, + "grad_norm": 1.676515817642212, + "learning_rate": 3.5e-05, + "loss": 0.6621, + "step": 350 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 3.4376354217529297, + "learning_rate": 3.6e-05, + "loss": 0.4768, + "step": 360 + }, + { + "epoch": 0.2642857142857143, + "grad_norm": 3.2355964183807373, + "learning_rate": 3.7e-05, + "loss": 0.6184, + "step": 370 + }, + { + "epoch": 0.2714285714285714, + "grad_norm": 2.2971713542938232, + "learning_rate": 3.8e-05, + "loss": 0.7827, + "step": 380 + }, + { + "epoch": 0.2785714285714286, + "grad_norm": 2.442052125930786, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.5901, + "step": 390 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 2.3172521591186523, + "learning_rate": 4e-05, + "loss": 0.6067, + "step": 400 + }, + { + "epoch": 0.29285714285714287, + "grad_norm": 2.06640887260437, + "learning_rate": 4.1e-05, + "loss": 0.6589, + "step": 410 + }, + { + "epoch": 0.3, + "grad_norm": 2.416149854660034, + "learning_rate": 4.2e-05, + "loss": 0.6489, + "step": 420 + }, + { + "epoch": 0.30714285714285716, + "grad_norm": 2.340235471725464, + "learning_rate": 4.3e-05, + "loss": 0.8339, + "step": 430 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 1.9825040102005005, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.7415, + "step": 440 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 1.8823323249816895, + "learning_rate": 4.5e-05, + "loss": 0.611, + "step": 450 + }, + { + "epoch": 0.32857142857142857, + "grad_norm": 2.3207123279571533, + "learning_rate": 4.600000000000001e-05, + "loss": 0.6172, + "step": 460 + }, + { + "epoch": 0.3357142857142857, + "grad_norm": 1.2963736057281494, + "learning_rate": 4.7e-05, + "loss": 0.4114, + "step": 470 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 2.191009044647217, + "learning_rate": 4.8e-05, + "loss": 0.6251, + "step": 480 + }, + { + "epoch": 0.35, + "grad_norm": 2.1893374919891357, + "learning_rate": 4.9e-05, + "loss": 0.5966, + "step": 490 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 2.214414596557617, + "learning_rate": 5e-05, + "loss": 0.7495, + "step": 500 + }, + { + "epoch": 0.36428571428571427, + "grad_norm": 1.8343987464904785, + "learning_rate": 4.9962962962962964e-05, + "loss": 0.6179, + "step": 510 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 2.675177574157715, + "learning_rate": 4.9925925925925926e-05, + "loss": 0.7262, + "step": 520 + }, + { + "epoch": 0.37857142857142856, + "grad_norm": 1.7133303880691528, + "learning_rate": 4.9888888888888894e-05, + "loss": 0.546, + "step": 530 + }, + { + "epoch": 0.38571428571428573, + "grad_norm": 1.4926049709320068, + "learning_rate": 4.9851851851851855e-05, + "loss": 0.4057, + "step": 540 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 2.0434134006500244, + "learning_rate": 4.981481481481482e-05, + "loss": 1.014, + "step": 550 + }, + { + "epoch": 0.4, + "grad_norm": 2.037074089050293, + "learning_rate": 4.977777777777778e-05, + "loss": 0.6088, + "step": 560 + }, + { + "epoch": 0.40714285714285714, + "grad_norm": 2.6729607582092285, + "learning_rate": 4.974074074074074e-05, + "loss": 0.5986, + "step": 570 + }, + { + "epoch": 0.4142857142857143, + "grad_norm": 1.8161852359771729, + "learning_rate": 4.970370370370371e-05, + "loss": 0.4681, + "step": 580 + }, + { + "epoch": 0.42142857142857143, + "grad_norm": 2.140554666519165, + "learning_rate": 4.966666666666667e-05, + "loss": 0.5865, + "step": 590 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 2.3027639389038086, + "learning_rate": 4.962962962962963e-05, + "loss": 0.4166, + "step": 600 + }, + { + "epoch": 0.4357142857142857, + "grad_norm": 2.973132371902466, + "learning_rate": 4.959259259259259e-05, + "loss": 0.6394, + "step": 610 + }, + { + "epoch": 0.44285714285714284, + "grad_norm": 2.898897886276245, + "learning_rate": 4.955555555555556e-05, + "loss": 0.5572, + "step": 620 + }, + { + "epoch": 0.45, + "grad_norm": 2.100752353668213, + "learning_rate": 4.951851851851852e-05, + "loss": 0.4788, + "step": 630 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 2.4735984802246094, + "learning_rate": 4.9481481481481485e-05, + "loss": 0.756, + "step": 640 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 1.5895243883132935, + "learning_rate": 4.9444444444444446e-05, + "loss": 0.5265, + "step": 650 + }, + { + "epoch": 0.4714285714285714, + "grad_norm": 2.067650079727173, + "learning_rate": 4.940740740740741e-05, + "loss": 0.6079, + "step": 660 + }, + { + "epoch": 0.4785714285714286, + "grad_norm": 1.6676874160766602, + "learning_rate": 4.937037037037037e-05, + "loss": 0.5196, + "step": 670 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 1.6084502935409546, + "learning_rate": 4.933333333333334e-05, + "loss": 0.431, + "step": 680 + }, + { + "epoch": 0.4928571428571429, + "grad_norm": 2.8858065605163574, + "learning_rate": 4.92962962962963e-05, + "loss": 0.5329, + "step": 690 + }, + { + "epoch": 0.5, + "grad_norm": 2.657158613204956, + "learning_rate": 4.925925925925926e-05, + "loss": 0.5092, + "step": 700 + }, + { + "epoch": 0.5071428571428571, + "grad_norm": 2.636237144470215, + "learning_rate": 4.922222222222222e-05, + "loss": 0.4074, + "step": 710 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 2.5960123538970947, + "learning_rate": 4.918518518518519e-05, + "loss": 0.424, + "step": 720 + }, + { + "epoch": 0.5214285714285715, + "grad_norm": 1.7363989353179932, + "learning_rate": 4.9148148148148145e-05, + "loss": 0.7226, + "step": 730 + }, + { + "epoch": 0.5285714285714286, + "grad_norm": 2.8367726802825928, + "learning_rate": 4.9111111111111114e-05, + "loss": 0.5535, + "step": 740 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 2.1372838020324707, + "learning_rate": 4.9074074074074075e-05, + "loss": 0.6546, + "step": 750 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 1.9456530809402466, + "learning_rate": 4.903703703703704e-05, + "loss": 0.5718, + "step": 760 + }, + { + "epoch": 0.55, + "grad_norm": 3.146430015563965, + "learning_rate": 4.9e-05, + "loss": 0.486, + "step": 770 + }, + { + "epoch": 0.5571428571428572, + "grad_norm": 1.633537769317627, + "learning_rate": 4.896296296296297e-05, + "loss": 0.4602, + "step": 780 + }, + { + "epoch": 0.5642857142857143, + "grad_norm": 3.580615282058716, + "learning_rate": 4.892592592592593e-05, + "loss": 0.5991, + "step": 790 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 2.354482889175415, + "learning_rate": 4.888888888888889e-05, + "loss": 0.639, + "step": 800 + }, + { + "epoch": 0.5785714285714286, + "grad_norm": 1.701165795326233, + "learning_rate": 4.885185185185185e-05, + "loss": 0.4895, + "step": 810 + }, + { + "epoch": 0.5857142857142857, + "grad_norm": 1.7530277967453003, + "learning_rate": 4.881481481481482e-05, + "loss": 0.5029, + "step": 820 + }, + { + "epoch": 0.5928571428571429, + "grad_norm": 1.4377954006195068, + "learning_rate": 4.8777777777777775e-05, + "loss": 0.4668, + "step": 830 + }, + { + "epoch": 0.6, + "grad_norm": 1.9733954668045044, + "learning_rate": 4.874074074074074e-05, + "loss": 0.6434, + "step": 840 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 1.5659862756729126, + "learning_rate": 4.8703703703703704e-05, + "loss": 0.4719, + "step": 850 + }, + { + "epoch": 0.6142857142857143, + "grad_norm": 1.9549959897994995, + "learning_rate": 4.866666666666667e-05, + "loss": 0.5003, + "step": 860 + }, + { + "epoch": 0.6214285714285714, + "grad_norm": 2.0998220443725586, + "learning_rate": 4.862962962962963e-05, + "loss": 0.4666, + "step": 870 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 1.6551822423934937, + "learning_rate": 4.8592592592592596e-05, + "loss": 0.5508, + "step": 880 + }, + { + "epoch": 0.6357142857142857, + "grad_norm": 2.268826723098755, + "learning_rate": 4.855555555555556e-05, + "loss": 0.5333, + "step": 890 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 2.170297861099243, + "learning_rate": 4.851851851851852e-05, + "loss": 0.4724, + "step": 900 + }, + { + "epoch": 0.65, + "grad_norm": 2.3737900257110596, + "learning_rate": 4.848148148148148e-05, + "loss": 0.5938, + "step": 910 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 1.5697389841079712, + "learning_rate": 4.844444444444445e-05, + "loss": 0.357, + "step": 920 + }, + { + "epoch": 0.6642857142857143, + "grad_norm": 1.4354273080825806, + "learning_rate": 4.840740740740741e-05, + "loss": 0.2648, + "step": 930 + }, + { + "epoch": 0.6714285714285714, + "grad_norm": 1.1631938219070435, + "learning_rate": 4.837037037037037e-05, + "loss": 0.4647, + "step": 940 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 2.594999313354492, + "learning_rate": 4.8333333333333334e-05, + "loss": 0.6831, + "step": 950 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 2.2979557514190674, + "learning_rate": 4.82962962962963e-05, + "loss": 0.4363, + "step": 960 + }, + { + "epoch": 0.6928571428571428, + "grad_norm": 3.0777416229248047, + "learning_rate": 4.825925925925926e-05, + "loss": 0.5995, + "step": 970 + }, + { + "epoch": 0.7, + "grad_norm": 2.430807113647461, + "learning_rate": 4.8222222222222225e-05, + "loss": 0.6433, + "step": 980 + }, + { + "epoch": 0.7071428571428572, + "grad_norm": 1.7465846538543701, + "learning_rate": 4.818518518518519e-05, + "loss": 0.4973, + "step": 990 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 2.805053472518921, + "learning_rate": 4.814814814814815e-05, + "loss": 0.556, + "step": 1000 + }, + { + "epoch": 0.7142857142857143, + "eval_loss": 0.4978465139865875, + "eval_rouge1": 0.8844, + "eval_rouge2": 0.8183, + "eval_rougeL": 0.8811, + "eval_runtime": 122.1438, + "eval_samples_per_second": 11.462, + "eval_steps_per_second": 5.731, + "step": 1000 + }, + { + "epoch": 0.7214285714285714, + "grad_norm": 1.6127879619598389, + "learning_rate": 4.811111111111111e-05, + "loss": 0.5748, + "step": 1010 + }, + { + "epoch": 0.7285714285714285, + "grad_norm": 1.1071356534957886, + "learning_rate": 4.807407407407408e-05, + "loss": 0.5246, + "step": 1020 + }, + { + "epoch": 0.7357142857142858, + "grad_norm": 1.9362713098526, + "learning_rate": 4.803703703703704e-05, + "loss": 0.563, + "step": 1030 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 2.037553548812866, + "learning_rate": 4.8e-05, + "loss": 0.466, + "step": 1040 + }, + { + "epoch": 0.75, + "grad_norm": 2.196617841720581, + "learning_rate": 4.796296296296296e-05, + "loss": 0.5647, + "step": 1050 + }, + { + "epoch": 0.7571428571428571, + "grad_norm": 1.278428077697754, + "learning_rate": 4.792592592592593e-05, + "loss": 0.4821, + "step": 1060 + }, + { + "epoch": 0.7642857142857142, + "grad_norm": 1.3506104946136475, + "learning_rate": 4.7888888888888886e-05, + "loss": 0.5194, + "step": 1070 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 2.3870656490325928, + "learning_rate": 4.7851851851851854e-05, + "loss": 0.7373, + "step": 1080 + }, + { + "epoch": 0.7785714285714286, + "grad_norm": 2.071242094039917, + "learning_rate": 4.7814814814814816e-05, + "loss": 0.5598, + "step": 1090 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 1.8460086584091187, + "learning_rate": 4.7777777777777784e-05, + "loss": 0.6184, + "step": 1100 + }, + { + "epoch": 0.7928571428571428, + "grad_norm": 3.804724931716919, + "learning_rate": 4.774074074074074e-05, + "loss": 0.5978, + "step": 1110 + }, + { + "epoch": 0.8, + "grad_norm": 2.614772081375122, + "learning_rate": 4.770370370370371e-05, + "loss": 0.6203, + "step": 1120 + }, + { + "epoch": 0.8071428571428572, + "grad_norm": 2.068122386932373, + "learning_rate": 4.766666666666667e-05, + "loss": 0.6149, + "step": 1130 + }, + { + "epoch": 0.8142857142857143, + "grad_norm": 1.675881266593933, + "learning_rate": 4.762962962962963e-05, + "loss": 0.4437, + "step": 1140 + }, + { + "epoch": 0.8214285714285714, + "grad_norm": 1.865435004234314, + "learning_rate": 4.759259259259259e-05, + "loss": 0.5166, + "step": 1150 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 1.0480509996414185, + "learning_rate": 4.755555555555556e-05, + "loss": 0.3827, + "step": 1160 + }, + { + "epoch": 0.8357142857142857, + "grad_norm": 2.218554735183716, + "learning_rate": 4.751851851851852e-05, + "loss": 0.6641, + "step": 1170 + }, + { + "epoch": 0.8428571428571429, + "grad_norm": 2.510831832885742, + "learning_rate": 4.7481481481481483e-05, + "loss": 0.496, + "step": 1180 + }, + { + "epoch": 0.85, + "grad_norm": 1.8328824043273926, + "learning_rate": 4.7444444444444445e-05, + "loss": 0.5466, + "step": 1190 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.5480728149414062, + "learning_rate": 4.740740740740741e-05, + "loss": 0.4992, + "step": 1200 + }, + { + "epoch": 0.8642857142857143, + "grad_norm": 1.3723492622375488, + "learning_rate": 4.737037037037037e-05, + "loss": 0.5014, + "step": 1210 + }, + { + "epoch": 0.8714285714285714, + "grad_norm": 1.7510666847229004, + "learning_rate": 4.7333333333333336e-05, + "loss": 0.5471, + "step": 1220 + }, + { + "epoch": 0.8785714285714286, + "grad_norm": 2.2057995796203613, + "learning_rate": 4.72962962962963e-05, + "loss": 0.6142, + "step": 1230 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 1.7922954559326172, + "learning_rate": 4.7259259259259266e-05, + "loss": 0.5199, + "step": 1240 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 1.9541053771972656, + "learning_rate": 4.722222222222222e-05, + "loss": 0.44, + "step": 1250 + }, + { + "epoch": 0.9, + "grad_norm": 1.2869590520858765, + "learning_rate": 4.718518518518519e-05, + "loss": 0.5157, + "step": 1260 + }, + { + "epoch": 0.9071428571428571, + "grad_norm": 1.7564722299575806, + "learning_rate": 4.714814814814815e-05, + "loss": 0.4985, + "step": 1270 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 1.0782675743103027, + "learning_rate": 4.711111111111111e-05, + "loss": 0.3195, + "step": 1280 + }, + { + "epoch": 0.9214285714285714, + "grad_norm": 1.7535449266433716, + "learning_rate": 4.7074074074074074e-05, + "loss": 0.376, + "step": 1290 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 1.581485629081726, + "learning_rate": 4.703703703703704e-05, + "loss": 0.5975, + "step": 1300 + }, + { + "epoch": 0.9357142857142857, + "grad_norm": 2.739900827407837, + "learning_rate": 4.7e-05, + "loss": 0.457, + "step": 1310 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 2.382187604904175, + "learning_rate": 4.6962962962962966e-05, + "loss": 0.5424, + "step": 1320 + }, + { + "epoch": 0.95, + "grad_norm": 1.75946843624115, + "learning_rate": 4.692592592592593e-05, + "loss": 0.3563, + "step": 1330 + }, + { + "epoch": 0.9571428571428572, + "grad_norm": 1.8159079551696777, + "learning_rate": 4.6888888888888895e-05, + "loss": 0.4502, + "step": 1340 + }, + { + "epoch": 0.9642857142857143, + "grad_norm": 2.605283260345459, + "learning_rate": 4.685185185185185e-05, + "loss": 0.4779, + "step": 1350 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 2.594231605529785, + "learning_rate": 4.681481481481482e-05, + "loss": 0.4901, + "step": 1360 + }, + { + "epoch": 0.9785714285714285, + "grad_norm": 2.109367609024048, + "learning_rate": 4.677777777777778e-05, + "loss": 0.5378, + "step": 1370 + }, + { + "epoch": 0.9857142857142858, + "grad_norm": 1.960496425628662, + "learning_rate": 4.674074074074074e-05, + "loss": 0.6129, + "step": 1380 + }, + { + "epoch": 0.9928571428571429, + "grad_norm": 3.4135870933532715, + "learning_rate": 4.67037037037037e-05, + "loss": 0.7069, + "step": 1390 + }, + { + "epoch": 1.0, + "grad_norm": 1.441308617591858, + "learning_rate": 4.666666666666667e-05, + "loss": 0.4686, + "step": 1400 + }, + { + "epoch": 1.0071428571428571, + "grad_norm": 1.9842432737350464, + "learning_rate": 4.662962962962963e-05, + "loss": 0.604, + "step": 1410 + }, + { + "epoch": 1.0142857142857142, + "grad_norm": 1.3867950439453125, + "learning_rate": 4.6592592592592595e-05, + "loss": 0.4168, + "step": 1420 + }, + { + "epoch": 1.0214285714285714, + "grad_norm": 2.118037462234497, + "learning_rate": 4.6555555555555556e-05, + "loss": 0.6484, + "step": 1430 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 1.4064522981643677, + "learning_rate": 4.6518518518518525e-05, + "loss": 0.5275, + "step": 1440 + }, + { + "epoch": 1.0357142857142858, + "grad_norm": 2.644491672515869, + "learning_rate": 4.648148148148148e-05, + "loss": 0.5361, + "step": 1450 + }, + { + "epoch": 1.042857142857143, + "grad_norm": 1.4005937576293945, + "learning_rate": 4.644444444444445e-05, + "loss": 0.4497, + "step": 1460 + }, + { + "epoch": 1.05, + "grad_norm": 1.773334264755249, + "learning_rate": 4.640740740740741e-05, + "loss": 0.4372, + "step": 1470 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 2.1667587757110596, + "learning_rate": 4.637037037037038e-05, + "loss": 0.5211, + "step": 1480 + }, + { + "epoch": 1.0642857142857143, + "grad_norm": 1.1993277072906494, + "learning_rate": 4.633333333333333e-05, + "loss": 0.3694, + "step": 1490 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 1.5526480674743652, + "learning_rate": 4.62962962962963e-05, + "loss": 0.686, + "step": 1500 + }, + { + "epoch": 1.0785714285714285, + "grad_norm": 1.5041449069976807, + "learning_rate": 4.625925925925926e-05, + "loss": 0.4536, + "step": 1510 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 1.719254970550537, + "learning_rate": 4.6222222222222224e-05, + "loss": 0.4687, + "step": 1520 + }, + { + "epoch": 1.092857142857143, + "grad_norm": 1.9565083980560303, + "learning_rate": 4.6185185185185185e-05, + "loss": 0.4054, + "step": 1530 + }, + { + "epoch": 1.1, + "grad_norm": 1.2271467447280884, + "learning_rate": 4.6148148148148154e-05, + "loss": 0.4189, + "step": 1540 + }, + { + "epoch": 1.1071428571428572, + "grad_norm": 1.731244683265686, + "learning_rate": 4.6111111111111115e-05, + "loss": 0.4519, + "step": 1550 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 1.3039075136184692, + "learning_rate": 4.607407407407408e-05, + "loss": 0.3911, + "step": 1560 + }, + { + "epoch": 1.1214285714285714, + "grad_norm": 1.3420417308807373, + "learning_rate": 4.603703703703704e-05, + "loss": 0.4239, + "step": 1570 + }, + { + "epoch": 1.1285714285714286, + "grad_norm": 2.2307205200195312, + "learning_rate": 4.600000000000001e-05, + "loss": 0.4675, + "step": 1580 + }, + { + "epoch": 1.1357142857142857, + "grad_norm": 2.384147882461548, + "learning_rate": 4.596296296296296e-05, + "loss": 0.3963, + "step": 1590 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.6016713380813599, + "learning_rate": 4.592592592592593e-05, + "loss": 0.4561, + "step": 1600 + }, + { + "epoch": 1.15, + "grad_norm": 1.4093197584152222, + "learning_rate": 4.588888888888889e-05, + "loss": 0.4708, + "step": 1610 + }, + { + "epoch": 1.157142857142857, + "grad_norm": 1.9773272275924683, + "learning_rate": 4.585185185185185e-05, + "loss": 0.5259, + "step": 1620 + }, + { + "epoch": 1.1642857142857144, + "grad_norm": 1.169757604598999, + "learning_rate": 4.5814814814814815e-05, + "loss": 0.3413, + "step": 1630 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 2.1033947467803955, + "learning_rate": 4.577777777777778e-05, + "loss": 0.4888, + "step": 1640 + }, + { + "epoch": 1.1785714285714286, + "grad_norm": 1.2455283403396606, + "learning_rate": 4.5740740740740745e-05, + "loss": 0.5935, + "step": 1650 + }, + { + "epoch": 1.1857142857142857, + "grad_norm": 1.283308982849121, + "learning_rate": 4.5703703703703706e-05, + "loss": 0.3946, + "step": 1660 + }, + { + "epoch": 1.1928571428571428, + "grad_norm": 1.9639955759048462, + "learning_rate": 4.566666666666667e-05, + "loss": 0.519, + "step": 1670 + }, + { + "epoch": 1.2, + "grad_norm": 0.9380689263343811, + "learning_rate": 4.5629629629629636e-05, + "loss": 0.3357, + "step": 1680 + }, + { + "epoch": 1.207142857142857, + "grad_norm": 2.330310344696045, + "learning_rate": 4.559259259259259e-05, + "loss": 0.5135, + "step": 1690 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 1.5911920070648193, + "learning_rate": 4.555555555555556e-05, + "loss": 0.4165, + "step": 1700 + }, + { + "epoch": 1.2214285714285715, + "grad_norm": 1.7522234916687012, + "learning_rate": 4.551851851851852e-05, + "loss": 0.5797, + "step": 1710 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 2.265571355819702, + "learning_rate": 4.548148148148149e-05, + "loss": 0.3943, + "step": 1720 + }, + { + "epoch": 1.2357142857142858, + "grad_norm": 2.530675172805786, + "learning_rate": 4.5444444444444444e-05, + "loss": 0.6279, + "step": 1730 + }, + { + "epoch": 1.2428571428571429, + "grad_norm": 2.072864055633545, + "learning_rate": 4.540740740740741e-05, + "loss": 0.512, + "step": 1740 + }, + { + "epoch": 1.25, + "grad_norm": 1.5505369901657104, + "learning_rate": 4.5370370370370374e-05, + "loss": 0.3494, + "step": 1750 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 1.9888116121292114, + "learning_rate": 4.5333333333333335e-05, + "loss": 0.5841, + "step": 1760 + }, + { + "epoch": 1.2642857142857142, + "grad_norm": 1.6056774854660034, + "learning_rate": 4.52962962962963e-05, + "loss": 0.5611, + "step": 1770 + }, + { + "epoch": 1.2714285714285714, + "grad_norm": 1.7950221300125122, + "learning_rate": 4.5259259259259265e-05, + "loss": 0.6097, + "step": 1780 + }, + { + "epoch": 1.2785714285714285, + "grad_norm": 1.8906399011611938, + "learning_rate": 4.522222222222223e-05, + "loss": 0.4837, + "step": 1790 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.3988184928894043, + "learning_rate": 4.518518518518519e-05, + "loss": 0.4806, + "step": 1800 + }, + { + "epoch": 1.292857142857143, + "grad_norm": 1.160243272781372, + "learning_rate": 4.514814814814815e-05, + "loss": 0.5224, + "step": 1810 + }, + { + "epoch": 1.3, + "grad_norm": 1.0152113437652588, + "learning_rate": 4.511111111111112e-05, + "loss": 0.4115, + "step": 1820 + }, + { + "epoch": 1.3071428571428572, + "grad_norm": 1.6176999807357788, + "learning_rate": 4.507407407407407e-05, + "loss": 0.4458, + "step": 1830 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 1.904784917831421, + "learning_rate": 4.503703703703704e-05, + "loss": 0.5552, + "step": 1840 + }, + { + "epoch": 1.3214285714285714, + "grad_norm": 1.0539710521697998, + "learning_rate": 4.5e-05, + "loss": 0.438, + "step": 1850 + }, + { + "epoch": 1.3285714285714285, + "grad_norm": 1.3552178144454956, + "learning_rate": 4.496296296296297e-05, + "loss": 0.2862, + "step": 1860 + }, + { + "epoch": 1.3357142857142856, + "grad_norm": 1.3787767887115479, + "learning_rate": 4.4925925925925926e-05, + "loss": 0.5173, + "step": 1870 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 2.570422649383545, + "learning_rate": 4.4888888888888894e-05, + "loss": 0.4581, + "step": 1880 + }, + { + "epoch": 1.35, + "grad_norm": 1.5974104404449463, + "learning_rate": 4.4851851851851856e-05, + "loss": 0.4599, + "step": 1890 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 1.4105775356292725, + "learning_rate": 4.481481481481482e-05, + "loss": 0.3823, + "step": 1900 + }, + { + "epoch": 1.3642857142857143, + "grad_norm": 2.1751532554626465, + "learning_rate": 4.477777777777778e-05, + "loss": 0.4421, + "step": 1910 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 1.9956297874450684, + "learning_rate": 4.474074074074075e-05, + "loss": 0.4082, + "step": 1920 + }, + { + "epoch": 1.3785714285714286, + "grad_norm": 1.6159803867340088, + "learning_rate": 4.47037037037037e-05, + "loss": 0.3961, + "step": 1930 + }, + { + "epoch": 1.3857142857142857, + "grad_norm": 1.4909430742263794, + "learning_rate": 4.466666666666667e-05, + "loss": 0.4635, + "step": 1940 + }, + { + "epoch": 1.3928571428571428, + "grad_norm": 1.5630055665969849, + "learning_rate": 4.462962962962963e-05, + "loss": 0.5968, + "step": 1950 + }, + { + "epoch": 1.4, + "grad_norm": 1.2496933937072754, + "learning_rate": 4.4592592592592594e-05, + "loss": 0.4546, + "step": 1960 + }, + { + "epoch": 1.407142857142857, + "grad_norm": 1.6497224569320679, + "learning_rate": 4.4555555555555555e-05, + "loss": 0.354, + "step": 1970 + }, + { + "epoch": 1.4142857142857144, + "grad_norm": 2.069955587387085, + "learning_rate": 4.4518518518518523e-05, + "loss": 0.4388, + "step": 1980 + }, + { + "epoch": 1.4214285714285715, + "grad_norm": 1.6338075399398804, + "learning_rate": 4.4481481481481485e-05, + "loss": 0.5459, + "step": 1990 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.3558902740478516, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.3139, + "step": 2000 + }, + { + "epoch": 1.4285714285714286, + "eval_loss": 0.43829917907714844, + "eval_rouge1": 0.8911, + "eval_rouge2": 0.8251, + "eval_rougeL": 0.8873, + "eval_runtime": 121.8873, + "eval_samples_per_second": 11.486, + "eval_steps_per_second": 5.743, + "step": 2000 + }, + { + "epoch": 1.4357142857142857, + "grad_norm": 1.8123821020126343, + "learning_rate": 4.440740740740741e-05, + "loss": 0.5864, + "step": 2010 + }, + { + "epoch": 1.4428571428571428, + "grad_norm": 0.8494770526885986, + "learning_rate": 4.4370370370370376e-05, + "loss": 0.3284, + "step": 2020 + }, + { + "epoch": 1.45, + "grad_norm": 2.2536141872406006, + "learning_rate": 4.433333333333334e-05, + "loss": 0.3738, + "step": 2030 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 2.971925735473633, + "learning_rate": 4.42962962962963e-05, + "loss": 0.5294, + "step": 2040 + }, + { + "epoch": 1.4642857142857144, + "grad_norm": 1.7820425033569336, + "learning_rate": 4.425925925925926e-05, + "loss": 0.505, + "step": 2050 + }, + { + "epoch": 1.4714285714285715, + "grad_norm": 1.196044683456421, + "learning_rate": 4.422222222222222e-05, + "loss": 0.3921, + "step": 2060 + }, + { + "epoch": 1.4785714285714286, + "grad_norm": 0.9053621888160706, + "learning_rate": 4.4185185185185184e-05, + "loss": 0.2386, + "step": 2070 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 1.8388108015060425, + "learning_rate": 4.414814814814815e-05, + "loss": 0.4309, + "step": 2080 + }, + { + "epoch": 1.4928571428571429, + "grad_norm": 2.25136137008667, + "learning_rate": 4.4111111111111114e-05, + "loss": 0.3918, + "step": 2090 + }, + { + "epoch": 1.5, + "grad_norm": 1.960864782333374, + "learning_rate": 4.4074074074074076e-05, + "loss": 0.4754, + "step": 2100 + }, + { + "epoch": 1.5071428571428571, + "grad_norm": 2.4653213024139404, + "learning_rate": 4.403703703703704e-05, + "loss": 0.4545, + "step": 2110 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 1.8694462776184082, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.3707, + "step": 2120 + }, + { + "epoch": 1.5214285714285714, + "grad_norm": 2.240447521209717, + "learning_rate": 4.396296296296297e-05, + "loss": 0.5174, + "step": 2130 + }, + { + "epoch": 1.5285714285714285, + "grad_norm": 0.8589600920677185, + "learning_rate": 4.392592592592593e-05, + "loss": 0.3802, + "step": 2140 + }, + { + "epoch": 1.5357142857142856, + "grad_norm": 2.495075225830078, + "learning_rate": 4.388888888888889e-05, + "loss": 0.4735, + "step": 2150 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 1.5384397506713867, + "learning_rate": 4.385185185185185e-05, + "loss": 0.4844, + "step": 2160 + }, + { + "epoch": 1.55, + "grad_norm": 1.2120758295059204, + "learning_rate": 4.381481481481482e-05, + "loss": 0.3029, + "step": 2170 + }, + { + "epoch": 1.5571428571428572, + "grad_norm": 2.0210671424865723, + "learning_rate": 4.377777777777778e-05, + "loss": 0.688, + "step": 2180 + }, + { + "epoch": 1.5642857142857143, + "grad_norm": 2.322322368621826, + "learning_rate": 4.374074074074074e-05, + "loss": 0.4673, + "step": 2190 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 1.8948960304260254, + "learning_rate": 4.3703703703703705e-05, + "loss": 0.3698, + "step": 2200 + }, + { + "epoch": 1.5785714285714287, + "grad_norm": 1.776141881942749, + "learning_rate": 4.3666666666666666e-05, + "loss": 0.3611, + "step": 2210 + }, + { + "epoch": 1.5857142857142859, + "grad_norm": 2.8628015518188477, + "learning_rate": 4.3629629629629635e-05, + "loss": 0.4504, + "step": 2220 + }, + { + "epoch": 1.592857142857143, + "grad_norm": 1.8579275608062744, + "learning_rate": 4.3592592592592596e-05, + "loss": 0.5131, + "step": 2230 + }, + { + "epoch": 1.6, + "grad_norm": 1.1070181131362915, + "learning_rate": 4.355555555555556e-05, + "loss": 0.4187, + "step": 2240 + }, + { + "epoch": 1.6071428571428572, + "grad_norm": 1.3833059072494507, + "learning_rate": 4.351851851851852e-05, + "loss": 0.4301, + "step": 2250 + }, + { + "epoch": 1.6142857142857143, + "grad_norm": 1.6870567798614502, + "learning_rate": 4.348148148148148e-05, + "loss": 0.5542, + "step": 2260 + }, + { + "epoch": 1.6214285714285714, + "grad_norm": 1.582582712173462, + "learning_rate": 4.344444444444445e-05, + "loss": 0.5192, + "step": 2270 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 3.3700509071350098, + "learning_rate": 4.340740740740741e-05, + "loss": 0.3268, + "step": 2280 + }, + { + "epoch": 1.6357142857142857, + "grad_norm": 3.0057899951934814, + "learning_rate": 4.337037037037037e-05, + "loss": 0.3787, + "step": 2290 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 1.302416205406189, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.4793, + "step": 2300 + }, + { + "epoch": 1.65, + "grad_norm": 2.42720103263855, + "learning_rate": 4.3296296296296296e-05, + "loss": 0.4124, + "step": 2310 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 1.455609917640686, + "learning_rate": 4.325925925925926e-05, + "loss": 0.272, + "step": 2320 + }, + { + "epoch": 1.6642857142857141, + "grad_norm": 2.1332924365997314, + "learning_rate": 4.3222222222222226e-05, + "loss": 0.4859, + "step": 2330 + }, + { + "epoch": 1.6714285714285713, + "grad_norm": 1.977156162261963, + "learning_rate": 4.318518518518519e-05, + "loss": 0.4017, + "step": 2340 + }, + { + "epoch": 1.6785714285714286, + "grad_norm": 1.7197158336639404, + "learning_rate": 4.314814814814815e-05, + "loss": 0.376, + "step": 2350 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.8615891933441162, + "learning_rate": 4.311111111111111e-05, + "loss": 0.3383, + "step": 2360 + }, + { + "epoch": 1.6928571428571428, + "grad_norm": 1.2501980066299438, + "learning_rate": 4.307407407407408e-05, + "loss": 0.3999, + "step": 2370 + }, + { + "epoch": 1.7, + "grad_norm": 1.7977019548416138, + "learning_rate": 4.303703703703704e-05, + "loss": 0.3424, + "step": 2380 + }, + { + "epoch": 1.7071428571428573, + "grad_norm": 2.265807867050171, + "learning_rate": 4.3e-05, + "loss": 0.4345, + "step": 2390 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 2.348353624343872, + "learning_rate": 4.296296296296296e-05, + "loss": 0.4395, + "step": 2400 + }, + { + "epoch": 1.7214285714285715, + "grad_norm": 2.585843801498413, + "learning_rate": 4.292592592592593e-05, + "loss": 0.5208, + "step": 2410 + }, + { + "epoch": 1.7285714285714286, + "grad_norm": 1.1487417221069336, + "learning_rate": 4.2888888888888886e-05, + "loss": 0.4635, + "step": 2420 + }, + { + "epoch": 1.7357142857142858, + "grad_norm": 1.206634521484375, + "learning_rate": 4.2851851851851855e-05, + "loss": 0.3521, + "step": 2430 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 2.136702299118042, + "learning_rate": 4.2814814814814816e-05, + "loss": 0.4582, + "step": 2440 + }, + { + "epoch": 1.75, + "grad_norm": 1.2831017971038818, + "learning_rate": 4.277777777777778e-05, + "loss": 0.397, + "step": 2450 + }, + { + "epoch": 1.7571428571428571, + "grad_norm": 2.313405990600586, + "learning_rate": 4.274074074074074e-05, + "loss": 0.431, + "step": 2460 + }, + { + "epoch": 1.7642857142857142, + "grad_norm": 1.8922353982925415, + "learning_rate": 4.270370370370371e-05, + "loss": 0.4396, + "step": 2470 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 1.735303521156311, + "learning_rate": 4.266666666666667e-05, + "loss": 0.4019, + "step": 2480 + }, + { + "epoch": 1.7785714285714285, + "grad_norm": 1.1989376544952393, + "learning_rate": 4.262962962962963e-05, + "loss": 0.3317, + "step": 2490 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 1.709370732307434, + "learning_rate": 4.259259259259259e-05, + "loss": 0.4721, + "step": 2500 + }, + { + "epoch": 1.7928571428571427, + "grad_norm": 1.3655775785446167, + "learning_rate": 4.255555555555556e-05, + "loss": 0.464, + "step": 2510 + }, + { + "epoch": 1.8, + "grad_norm": 1.2292691469192505, + "learning_rate": 4.2518518518518515e-05, + "loss": 0.3765, + "step": 2520 + }, + { + "epoch": 1.8071428571428572, + "grad_norm": 2.6490797996520996, + "learning_rate": 4.2481481481481484e-05, + "loss": 0.4989, + "step": 2530 + }, + { + "epoch": 1.8142857142857143, + "grad_norm": 1.8564647436141968, + "learning_rate": 4.2444444444444445e-05, + "loss": 0.3776, + "step": 2540 + }, + { + "epoch": 1.8214285714285714, + "grad_norm": 1.9681627750396729, + "learning_rate": 4.240740740740741e-05, + "loss": 0.4717, + "step": 2550 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 2.1326770782470703, + "learning_rate": 4.237037037037037e-05, + "loss": 0.3212, + "step": 2560 + }, + { + "epoch": 1.8357142857142859, + "grad_norm": 1.8122767210006714, + "learning_rate": 4.233333333333334e-05, + "loss": 0.3619, + "step": 2570 + }, + { + "epoch": 1.842857142857143, + "grad_norm": 1.4822399616241455, + "learning_rate": 4.22962962962963e-05, + "loss": 0.4631, + "step": 2580 + }, + { + "epoch": 1.85, + "grad_norm": 2.278700828552246, + "learning_rate": 4.225925925925926e-05, + "loss": 0.3018, + "step": 2590 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 2.3148486614227295, + "learning_rate": 4.222222222222222e-05, + "loss": 0.4084, + "step": 2600 + }, + { + "epoch": 1.8642857142857143, + "grad_norm": 1.5277279615402222, + "learning_rate": 4.218518518518519e-05, + "loss": 0.5295, + "step": 2610 + }, + { + "epoch": 1.8714285714285714, + "grad_norm": 1.3603259325027466, + "learning_rate": 4.2148148148148145e-05, + "loss": 0.4727, + "step": 2620 + }, + { + "epoch": 1.8785714285714286, + "grad_norm": 1.9577744007110596, + "learning_rate": 4.211111111111111e-05, + "loss": 0.5915, + "step": 2630 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 1.0424437522888184, + "learning_rate": 4.2074074074074075e-05, + "loss": 0.3386, + "step": 2640 + }, + { + "epoch": 1.8928571428571428, + "grad_norm": 2.7555553913116455, + "learning_rate": 4.203703703703704e-05, + "loss": 0.5003, + "step": 2650 + }, + { + "epoch": 1.9, + "grad_norm": 1.9913907051086426, + "learning_rate": 4.2e-05, + "loss": 0.3875, + "step": 2660 + }, + { + "epoch": 1.907142857142857, + "grad_norm": 1.8053233623504639, + "learning_rate": 4.1962962962962966e-05, + "loss": 0.2318, + "step": 2670 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 1.7686558961868286, + "learning_rate": 4.192592592592593e-05, + "loss": 0.3772, + "step": 2680 + }, + { + "epoch": 1.9214285714285713, + "grad_norm": 1.4202839136123657, + "learning_rate": 4.188888888888889e-05, + "loss": 0.3742, + "step": 2690 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 2.7964282035827637, + "learning_rate": 4.185185185185185e-05, + "loss": 0.2901, + "step": 2700 + }, + { + "epoch": 1.9357142857142857, + "grad_norm": 2.0525360107421875, + "learning_rate": 4.181481481481482e-05, + "loss": 0.3146, + "step": 2710 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 1.937103033065796, + "learning_rate": 4.177777777777778e-05, + "loss": 0.3871, + "step": 2720 + }, + { + "epoch": 1.95, + "grad_norm": 2.1534152030944824, + "learning_rate": 4.174074074074074e-05, + "loss": 0.5309, + "step": 2730 + }, + { + "epoch": 1.9571428571428573, + "grad_norm": 1.60648512840271, + "learning_rate": 4.1703703703703704e-05, + "loss": 0.3859, + "step": 2740 + }, + { + "epoch": 1.9642857142857144, + "grad_norm": 2.2782654762268066, + "learning_rate": 4.166666666666667e-05, + "loss": 0.3859, + "step": 2750 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 1.9134782552719116, + "learning_rate": 4.162962962962963e-05, + "loss": 0.565, + "step": 2760 + }, + { + "epoch": 1.9785714285714286, + "grad_norm": 1.4029120206832886, + "learning_rate": 4.1592592592592595e-05, + "loss": 0.3373, + "step": 2770 + }, + { + "epoch": 1.9857142857142858, + "grad_norm": 1.9651641845703125, + "learning_rate": 4.155555555555556e-05, + "loss": 0.3847, + "step": 2780 + }, + { + "epoch": 1.9928571428571429, + "grad_norm": 1.4134501218795776, + "learning_rate": 4.1518518518518525e-05, + "loss": 0.4465, + "step": 2790 + }, + { + "epoch": 2.0, + "grad_norm": 1.9682196378707886, + "learning_rate": 4.148148148148148e-05, + "loss": 0.545, + "step": 2800 + }, + { + "epoch": 2.007142857142857, + "grad_norm": 2.259190559387207, + "learning_rate": 4.144444444444445e-05, + "loss": 0.3058, + "step": 2810 + }, + { + "epoch": 2.0142857142857142, + "grad_norm": 1.6861268281936646, + "learning_rate": 4.140740740740741e-05, + "loss": 0.4567, + "step": 2820 + }, + { + "epoch": 2.0214285714285714, + "grad_norm": 1.5168589353561401, + "learning_rate": 4.137037037037037e-05, + "loss": 0.5687, + "step": 2830 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 1.1756591796875, + "learning_rate": 4.133333333333333e-05, + "loss": 0.4118, + "step": 2840 + }, + { + "epoch": 2.0357142857142856, + "grad_norm": 1.4381860494613647, + "learning_rate": 4.12962962962963e-05, + "loss": 0.5473, + "step": 2850 + }, + { + "epoch": 2.0428571428571427, + "grad_norm": 1.710028052330017, + "learning_rate": 4.1259259259259256e-05, + "loss": 0.3362, + "step": 2860 + }, + { + "epoch": 2.05, + "grad_norm": 1.3261126279830933, + "learning_rate": 4.1222222222222224e-05, + "loss": 0.4497, + "step": 2870 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 1.5397872924804688, + "learning_rate": 4.1185185185185186e-05, + "loss": 0.3985, + "step": 2880 + }, + { + "epoch": 2.064285714285714, + "grad_norm": 2.1463019847869873, + "learning_rate": 4.1148148148148154e-05, + "loss": 0.3423, + "step": 2890 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 1.3202670812606812, + "learning_rate": 4.111111111111111e-05, + "loss": 0.3422, + "step": 2900 + }, + { + "epoch": 2.0785714285714287, + "grad_norm": 1.6832393407821655, + "learning_rate": 4.107407407407408e-05, + "loss": 0.3243, + "step": 2910 + }, + { + "epoch": 2.085714285714286, + "grad_norm": 1.7872291803359985, + "learning_rate": 4.103703703703704e-05, + "loss": 0.4452, + "step": 2920 + }, + { + "epoch": 2.092857142857143, + "grad_norm": 2.649644613265991, + "learning_rate": 4.1e-05, + "loss": 0.4125, + "step": 2930 + }, + { + "epoch": 2.1, + "grad_norm": 1.6862508058547974, + "learning_rate": 4.096296296296296e-05, + "loss": 0.3503, + "step": 2940 + }, + { + "epoch": 2.107142857142857, + "grad_norm": 1.0415781736373901, + "learning_rate": 4.092592592592593e-05, + "loss": 0.2464, + "step": 2950 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 1.3061981201171875, + "learning_rate": 4.088888888888889e-05, + "loss": 0.2232, + "step": 2960 + }, + { + "epoch": 2.1214285714285714, + "grad_norm": 1.0442795753479004, + "learning_rate": 4.0851851851851853e-05, + "loss": 0.3512, + "step": 2970 + }, + { + "epoch": 2.1285714285714286, + "grad_norm": 2.4381885528564453, + "learning_rate": 4.0814814814814815e-05, + "loss": 0.4678, + "step": 2980 + }, + { + "epoch": 2.1357142857142857, + "grad_norm": 1.5145925283432007, + "learning_rate": 4.0777777777777783e-05, + "loss": 0.3655, + "step": 2990 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 3.514470338821411, + "learning_rate": 4.074074074074074e-05, + "loss": 0.3538, + "step": 3000 + }, + { + "epoch": 2.142857142857143, + "eval_loss": 0.42481884360313416, + "eval_rouge1": 0.8943, + "eval_rouge2": 0.8295, + "eval_rougeL": 0.8911, + "eval_runtime": 122.0137, + "eval_samples_per_second": 11.474, + "eval_steps_per_second": 5.737, + "step": 3000 + }, + { + "epoch": 2.15, + "grad_norm": 1.9778640270233154, + "learning_rate": 4.0703703703703707e-05, + "loss": 0.4804, + "step": 3010 + }, + { + "epoch": 2.157142857142857, + "grad_norm": 2.8497660160064697, + "learning_rate": 4.066666666666667e-05, + "loss": 0.523, + "step": 3020 + }, + { + "epoch": 2.164285714285714, + "grad_norm": 1.317818284034729, + "learning_rate": 4.0629629629629636e-05, + "loss": 0.3694, + "step": 3030 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 1.1630916595458984, + "learning_rate": 4.059259259259259e-05, + "loss": 0.3546, + "step": 3040 + }, + { + "epoch": 2.1785714285714284, + "grad_norm": 2.114527940750122, + "learning_rate": 4.055555555555556e-05, + "loss": 0.4838, + "step": 3050 + }, + { + "epoch": 2.185714285714286, + "grad_norm": 1.771263599395752, + "learning_rate": 4.051851851851852e-05, + "loss": 0.2938, + "step": 3060 + }, + { + "epoch": 2.192857142857143, + "grad_norm": 3.463986396789551, + "learning_rate": 4.048148148148148e-05, + "loss": 0.455, + "step": 3070 + }, + { + "epoch": 2.2, + "grad_norm": 2.023069381713867, + "learning_rate": 4.0444444444444444e-05, + "loss": 0.4449, + "step": 3080 + }, + { + "epoch": 2.2071428571428573, + "grad_norm": 2.9855751991271973, + "learning_rate": 4.040740740740741e-05, + "loss": 0.5374, + "step": 3090 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 2.422739267349243, + "learning_rate": 4.0370370370370374e-05, + "loss": 0.4203, + "step": 3100 + }, + { + "epoch": 2.2214285714285715, + "grad_norm": 2.097543478012085, + "learning_rate": 4.0333333333333336e-05, + "loss": 0.364, + "step": 3110 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 2.2496302127838135, + "learning_rate": 4.02962962962963e-05, + "loss": 0.4135, + "step": 3120 + }, + { + "epoch": 2.2357142857142858, + "grad_norm": 2.3347012996673584, + "learning_rate": 4.0259259259259266e-05, + "loss": 0.4795, + "step": 3130 + }, + { + "epoch": 2.242857142857143, + "grad_norm": 1.506218433380127, + "learning_rate": 4.022222222222222e-05, + "loss": 0.5228, + "step": 3140 + }, + { + "epoch": 2.25, + "grad_norm": 1.160443663597107, + "learning_rate": 4.018518518518519e-05, + "loss": 0.3439, + "step": 3150 + }, + { + "epoch": 2.257142857142857, + "grad_norm": 1.8678144216537476, + "learning_rate": 4.014814814814815e-05, + "loss": 0.404, + "step": 3160 + }, + { + "epoch": 2.2642857142857142, + "grad_norm": 1.1315560340881348, + "learning_rate": 4.011111111111111e-05, + "loss": 0.3111, + "step": 3170 + }, + { + "epoch": 2.2714285714285714, + "grad_norm": 1.8081461191177368, + "learning_rate": 4.007407407407407e-05, + "loss": 0.2717, + "step": 3180 + }, + { + "epoch": 2.2785714285714285, + "grad_norm": 1.6636005640029907, + "learning_rate": 4.003703703703704e-05, + "loss": 0.3382, + "step": 3190 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 1.3334009647369385, + "learning_rate": 4e-05, + "loss": 0.38, + "step": 3200 + }, + { + "epoch": 2.2928571428571427, + "grad_norm": 1.4873621463775635, + "learning_rate": 3.9962962962962965e-05, + "loss": 0.2979, + "step": 3210 + }, + { + "epoch": 2.3, + "grad_norm": 1.17378568649292, + "learning_rate": 3.9925925925925926e-05, + "loss": 0.3628, + "step": 3220 + }, + { + "epoch": 2.307142857142857, + "grad_norm": 1.3241777420043945, + "learning_rate": 3.9888888888888895e-05, + "loss": 0.3119, + "step": 3230 + }, + { + "epoch": 2.314285714285714, + "grad_norm": 1.9823285341262817, + "learning_rate": 3.985185185185185e-05, + "loss": 0.4647, + "step": 3240 + }, + { + "epoch": 2.3214285714285716, + "grad_norm": 1.6918193101882935, + "learning_rate": 3.981481481481482e-05, + "loss": 0.3695, + "step": 3250 + }, + { + "epoch": 2.3285714285714287, + "grad_norm": 2.1902389526367188, + "learning_rate": 3.977777777777778e-05, + "loss": 0.2468, + "step": 3260 + }, + { + "epoch": 2.335714285714286, + "grad_norm": 1.3570506572723389, + "learning_rate": 3.974074074074075e-05, + "loss": 0.3209, + "step": 3270 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 1.951711654663086, + "learning_rate": 3.97037037037037e-05, + "loss": 0.5175, + "step": 3280 + }, + { + "epoch": 2.35, + "grad_norm": 1.741243839263916, + "learning_rate": 3.966666666666667e-05, + "loss": 0.2934, + "step": 3290 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 1.5889472961425781, + "learning_rate": 3.962962962962963e-05, + "loss": 0.3026, + "step": 3300 + }, + { + "epoch": 2.3642857142857143, + "grad_norm": 1.4606213569641113, + "learning_rate": 3.9592592592592594e-05, + "loss": 0.4508, + "step": 3310 + }, + { + "epoch": 2.3714285714285714, + "grad_norm": 1.5021477937698364, + "learning_rate": 3.9555555555555556e-05, + "loss": 0.2589, + "step": 3320 + }, + { + "epoch": 2.3785714285714286, + "grad_norm": 1.8877885341644287, + "learning_rate": 3.9518518518518524e-05, + "loss": 0.4115, + "step": 3330 + }, + { + "epoch": 2.3857142857142857, + "grad_norm": 1.809822678565979, + "learning_rate": 3.9481481481481485e-05, + "loss": 0.3844, + "step": 3340 + }, + { + "epoch": 2.392857142857143, + "grad_norm": 1.2999638319015503, + "learning_rate": 3.944444444444445e-05, + "loss": 0.4028, + "step": 3350 + }, + { + "epoch": 2.4, + "grad_norm": 1.4639837741851807, + "learning_rate": 3.940740740740741e-05, + "loss": 0.3551, + "step": 3360 + }, + { + "epoch": 2.407142857142857, + "grad_norm": 1.1001754999160767, + "learning_rate": 3.937037037037038e-05, + "loss": 0.4001, + "step": 3370 + }, + { + "epoch": 2.414285714285714, + "grad_norm": 2.272892713546753, + "learning_rate": 3.933333333333333e-05, + "loss": 0.3048, + "step": 3380 + }, + { + "epoch": 2.4214285714285713, + "grad_norm": 2.085908889770508, + "learning_rate": 3.92962962962963e-05, + "loss": 0.5788, + "step": 3390 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 1.317700743675232, + "learning_rate": 3.925925925925926e-05, + "loss": 0.2922, + "step": 3400 + }, + { + "epoch": 2.435714285714286, + "grad_norm": 2.372558832168579, + "learning_rate": 3.922222222222223e-05, + "loss": 0.4581, + "step": 3410 + }, + { + "epoch": 2.442857142857143, + "grad_norm": 1.3307292461395264, + "learning_rate": 3.9185185185185185e-05, + "loss": 0.4553, + "step": 3420 + }, + { + "epoch": 2.45, + "grad_norm": 1.9228068590164185, + "learning_rate": 3.914814814814815e-05, + "loss": 0.497, + "step": 3430 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 1.071590542793274, + "learning_rate": 3.9111111111111115e-05, + "loss": 0.4532, + "step": 3440 + }, + { + "epoch": 2.4642857142857144, + "grad_norm": 1.9603391885757446, + "learning_rate": 3.9074074074074076e-05, + "loss": 0.3808, + "step": 3450 + }, + { + "epoch": 2.4714285714285715, + "grad_norm": 1.2152074575424194, + "learning_rate": 3.903703703703704e-05, + "loss": 0.3011, + "step": 3460 + }, + { + "epoch": 2.4785714285714286, + "grad_norm": 1.532478928565979, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.3403, + "step": 3470 + }, + { + "epoch": 2.4857142857142858, + "grad_norm": 1.5086220502853394, + "learning_rate": 3.896296296296296e-05, + "loss": 0.4835, + "step": 3480 + }, + { + "epoch": 2.492857142857143, + "grad_norm": 1.0601118803024292, + "learning_rate": 3.892592592592593e-05, + "loss": 0.4057, + "step": 3490 + }, + { + "epoch": 2.5, + "grad_norm": 0.7907903790473938, + "learning_rate": 3.888888888888889e-05, + "loss": 0.3183, + "step": 3500 + }, + { + "epoch": 2.507142857142857, + "grad_norm": 1.8523814678192139, + "learning_rate": 3.885185185185186e-05, + "loss": 0.4329, + "step": 3510 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 1.9627383947372437, + "learning_rate": 3.8814814814814814e-05, + "loss": 0.3041, + "step": 3520 + }, + { + "epoch": 2.5214285714285714, + "grad_norm": 0.6192536354064941, + "learning_rate": 3.877777777777778e-05, + "loss": 0.3271, + "step": 3530 + }, + { + "epoch": 2.5285714285714285, + "grad_norm": 1.3901042938232422, + "learning_rate": 3.8740740740740744e-05, + "loss": 0.2562, + "step": 3540 + }, + { + "epoch": 2.5357142857142856, + "grad_norm": 1.991752028465271, + "learning_rate": 3.8703703703703705e-05, + "loss": 0.4283, + "step": 3550 + }, + { + "epoch": 2.5428571428571427, + "grad_norm": 1.219382882118225, + "learning_rate": 3.866666666666667e-05, + "loss": 0.4232, + "step": 3560 + }, + { + "epoch": 2.55, + "grad_norm": 2.72744083404541, + "learning_rate": 3.8629629629629635e-05, + "loss": 0.3482, + "step": 3570 + }, + { + "epoch": 2.557142857142857, + "grad_norm": 1.6782621145248413, + "learning_rate": 3.85925925925926e-05, + "loss": 0.3302, + "step": 3580 + }, + { + "epoch": 2.564285714285714, + "grad_norm": 1.0238265991210938, + "learning_rate": 3.855555555555556e-05, + "loss": 0.458, + "step": 3590 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 2.212013006210327, + "learning_rate": 3.851851851851852e-05, + "loss": 0.4127, + "step": 3600 + }, + { + "epoch": 2.5785714285714287, + "grad_norm": 1.5761399269104004, + "learning_rate": 3.848148148148149e-05, + "loss": 0.402, + "step": 3610 + }, + { + "epoch": 2.585714285714286, + "grad_norm": 1.2036465406417847, + "learning_rate": 3.844444444444444e-05, + "loss": 0.5576, + "step": 3620 + }, + { + "epoch": 2.592857142857143, + "grad_norm": 1.8674002885818481, + "learning_rate": 3.840740740740741e-05, + "loss": 0.364, + "step": 3630 + }, + { + "epoch": 2.6, + "grad_norm": 1.466834545135498, + "learning_rate": 3.837037037037037e-05, + "loss": 0.3523, + "step": 3640 + }, + { + "epoch": 2.607142857142857, + "grad_norm": 1.57899010181427, + "learning_rate": 3.8333333333333334e-05, + "loss": 0.4046, + "step": 3650 + }, + { + "epoch": 2.6142857142857143, + "grad_norm": 0.9730345010757446, + "learning_rate": 3.8296296296296296e-05, + "loss": 0.3132, + "step": 3660 + }, + { + "epoch": 2.6214285714285714, + "grad_norm": 1.3017544746398926, + "learning_rate": 3.8259259259259264e-05, + "loss": 0.3023, + "step": 3670 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 1.6368205547332764, + "learning_rate": 3.8222222222222226e-05, + "loss": 0.4363, + "step": 3680 + }, + { + "epoch": 2.6357142857142857, + "grad_norm": 1.2852121591567993, + "learning_rate": 3.818518518518519e-05, + "loss": 0.2896, + "step": 3690 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 3.6991841793060303, + "learning_rate": 3.814814814814815e-05, + "loss": 0.353, + "step": 3700 + }, + { + "epoch": 2.65, + "grad_norm": 2.70285701751709, + "learning_rate": 3.811111111111112e-05, + "loss": 0.4217, + "step": 3710 + }, + { + "epoch": 2.657142857142857, + "grad_norm": 1.140811800956726, + "learning_rate": 3.807407407407408e-05, + "loss": 0.3253, + "step": 3720 + }, + { + "epoch": 2.664285714285714, + "grad_norm": 1.2905789613723755, + "learning_rate": 3.803703703703704e-05, + "loss": 0.3051, + "step": 3730 + }, + { + "epoch": 2.6714285714285713, + "grad_norm": 1.4326887130737305, + "learning_rate": 3.8e-05, + "loss": 0.3999, + "step": 3740 + }, + { + "epoch": 2.678571428571429, + "grad_norm": 1.1789475679397583, + "learning_rate": 3.7962962962962964e-05, + "loss": 0.4631, + "step": 3750 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 2.0444328784942627, + "learning_rate": 3.7925925925925925e-05, + "loss": 0.4449, + "step": 3760 + }, + { + "epoch": 2.692857142857143, + "grad_norm": 2.1025991439819336, + "learning_rate": 3.7888888888888894e-05, + "loss": 0.3513, + "step": 3770 + }, + { + "epoch": 2.7, + "grad_norm": 1.8492026329040527, + "learning_rate": 3.7851851851851855e-05, + "loss": 0.4006, + "step": 3780 + }, + { + "epoch": 2.7071428571428573, + "grad_norm": 1.3439162969589233, + "learning_rate": 3.781481481481482e-05, + "loss": 0.2806, + "step": 3790 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 1.4200384616851807, + "learning_rate": 3.777777777777778e-05, + "loss": 0.3759, + "step": 3800 + }, + { + "epoch": 2.7214285714285715, + "grad_norm": 1.9567861557006836, + "learning_rate": 3.774074074074074e-05, + "loss": 0.1772, + "step": 3810 + }, + { + "epoch": 2.7285714285714286, + "grad_norm": 1.3466306924819946, + "learning_rate": 3.770370370370371e-05, + "loss": 0.399, + "step": 3820 + }, + { + "epoch": 2.7357142857142858, + "grad_norm": 1.6046024560928345, + "learning_rate": 3.766666666666667e-05, + "loss": 0.347, + "step": 3830 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 2.190568447113037, + "learning_rate": 3.762962962962963e-05, + "loss": 0.3977, + "step": 3840 + }, + { + "epoch": 2.75, + "grad_norm": 1.7715718746185303, + "learning_rate": 3.759259259259259e-05, + "loss": 0.4385, + "step": 3850 + }, + { + "epoch": 2.757142857142857, + "grad_norm": 3.19500994682312, + "learning_rate": 3.7555555555555554e-05, + "loss": 0.3631, + "step": 3860 + }, + { + "epoch": 2.7642857142857142, + "grad_norm": 2.2222607135772705, + "learning_rate": 3.751851851851852e-05, + "loss": 0.3565, + "step": 3870 + }, + { + "epoch": 2.7714285714285714, + "grad_norm": 1.9959403276443481, + "learning_rate": 3.7481481481481484e-05, + "loss": 0.3629, + "step": 3880 + }, + { + "epoch": 2.7785714285714285, + "grad_norm": 1.3207546472549438, + "learning_rate": 3.7444444444444446e-05, + "loss": 0.2911, + "step": 3890 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 2.0290961265563965, + "learning_rate": 3.740740740740741e-05, + "loss": 0.3072, + "step": 3900 + }, + { + "epoch": 2.7928571428571427, + "grad_norm": 1.3728725910186768, + "learning_rate": 3.737037037037037e-05, + "loss": 0.3858, + "step": 3910 + }, + { + "epoch": 2.8, + "grad_norm": 2.541598320007324, + "learning_rate": 3.733333333333334e-05, + "loss": 0.3487, + "step": 3920 + }, + { + "epoch": 2.807142857142857, + "grad_norm": 2.3327584266662598, + "learning_rate": 3.72962962962963e-05, + "loss": 0.3535, + "step": 3930 + }, + { + "epoch": 2.814285714285714, + "grad_norm": 2.546766757965088, + "learning_rate": 3.725925925925926e-05, + "loss": 0.3462, + "step": 3940 + }, + { + "epoch": 2.821428571428571, + "grad_norm": 2.351959705352783, + "learning_rate": 3.722222222222222e-05, + "loss": 0.2781, + "step": 3950 + }, + { + "epoch": 2.8285714285714287, + "grad_norm": 1.9349900484085083, + "learning_rate": 3.718518518518519e-05, + "loss": 0.2442, + "step": 3960 + }, + { + "epoch": 2.835714285714286, + "grad_norm": 2.2020022869110107, + "learning_rate": 3.714814814814815e-05, + "loss": 0.3396, + "step": 3970 + }, + { + "epoch": 2.842857142857143, + "grad_norm": 1.5161465406417847, + "learning_rate": 3.7111111111111113e-05, + "loss": 0.3722, + "step": 3980 + }, + { + "epoch": 2.85, + "grad_norm": 1.7403453588485718, + "learning_rate": 3.7074074074074075e-05, + "loss": 0.4227, + "step": 3990 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 1.9142546653747559, + "learning_rate": 3.7037037037037037e-05, + "loss": 0.3259, + "step": 4000 + }, + { + "epoch": 2.857142857142857, + "eval_loss": 0.38673722743988037, + "eval_rouge1": 0.8974, + "eval_rouge2": 0.8331, + "eval_rougeL": 0.8942, + "eval_runtime": 122.1383, + "eval_samples_per_second": 11.462, + "eval_steps_per_second": 5.731, + "step": 4000 + }, + { + "epoch": 2.8642857142857143, + "grad_norm": 1.5975255966186523, + "learning_rate": 3.7e-05, + "loss": 0.3732, + "step": 4010 + }, + { + "epoch": 2.8714285714285714, + "grad_norm": 1.4830248355865479, + "learning_rate": 3.6962962962962966e-05, + "loss": 0.5093, + "step": 4020 + }, + { + "epoch": 2.8785714285714286, + "grad_norm": 2.504650354385376, + "learning_rate": 3.692592592592593e-05, + "loss": 0.3302, + "step": 4030 + }, + { + "epoch": 2.8857142857142857, + "grad_norm": 2.349452495574951, + "learning_rate": 3.688888888888889e-05, + "loss": 0.3596, + "step": 4040 + }, + { + "epoch": 2.892857142857143, + "grad_norm": 1.398964762687683, + "learning_rate": 3.685185185185185e-05, + "loss": 0.3494, + "step": 4050 + }, + { + "epoch": 2.9, + "grad_norm": 2.212738513946533, + "learning_rate": 3.681481481481482e-05, + "loss": 0.3691, + "step": 4060 + }, + { + "epoch": 2.907142857142857, + "grad_norm": 2.20845627784729, + "learning_rate": 3.677777777777778e-05, + "loss": 0.2974, + "step": 4070 + }, + { + "epoch": 2.914285714285714, + "grad_norm": 1.2226334810256958, + "learning_rate": 3.674074074074074e-05, + "loss": 0.3173, + "step": 4080 + }, + { + "epoch": 2.9214285714285713, + "grad_norm": 2.2203428745269775, + "learning_rate": 3.6703703703703704e-05, + "loss": 0.4473, + "step": 4090 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 1.487853765487671, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.2653, + "step": 4100 + }, + { + "epoch": 2.935714285714286, + "grad_norm": 1.6347614526748657, + "learning_rate": 3.662962962962963e-05, + "loss": 0.3563, + "step": 4110 + }, + { + "epoch": 2.942857142857143, + "grad_norm": 2.2722184658050537, + "learning_rate": 3.6592592592592596e-05, + "loss": 0.4975, + "step": 4120 + }, + { + "epoch": 2.95, + "grad_norm": 1.747530460357666, + "learning_rate": 3.655555555555556e-05, + "loss": 0.2357, + "step": 4130 + }, + { + "epoch": 2.9571428571428573, + "grad_norm": 1.628596544265747, + "learning_rate": 3.651851851851852e-05, + "loss": 0.3674, + "step": 4140 + }, + { + "epoch": 2.9642857142857144, + "grad_norm": 1.0486435890197754, + "learning_rate": 3.648148148148148e-05, + "loss": 0.3314, + "step": 4150 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 2.523879289627075, + "learning_rate": 3.644444444444445e-05, + "loss": 0.4421, + "step": 4160 + }, + { + "epoch": 2.9785714285714286, + "grad_norm": 1.4641958475112915, + "learning_rate": 3.6407407407407403e-05, + "loss": 0.4135, + "step": 4170 + }, + { + "epoch": 2.9857142857142858, + "grad_norm": 2.672769784927368, + "learning_rate": 3.637037037037037e-05, + "loss": 0.3527, + "step": 4180 + }, + { + "epoch": 2.992857142857143, + "grad_norm": 0.5795308351516724, + "learning_rate": 3.633333333333333e-05, + "loss": 0.2326, + "step": 4190 + }, + { + "epoch": 3.0, + "grad_norm": 1.873579978942871, + "learning_rate": 3.62962962962963e-05, + "loss": 0.3679, + "step": 4200 + }, + { + "epoch": 3.007142857142857, + "grad_norm": 1.7640775442123413, + "learning_rate": 3.6259259259259256e-05, + "loss": 0.4778, + "step": 4210 + }, + { + "epoch": 3.0142857142857142, + "grad_norm": 1.9458075761795044, + "learning_rate": 3.6222222222222225e-05, + "loss": 0.4054, + "step": 4220 + }, + { + "epoch": 3.0214285714285714, + "grad_norm": 1.1568126678466797, + "learning_rate": 3.6185185185185186e-05, + "loss": 0.2249, + "step": 4230 + }, + { + "epoch": 3.0285714285714285, + "grad_norm": 1.3655381202697754, + "learning_rate": 3.614814814814815e-05, + "loss": 0.3993, + "step": 4240 + }, + { + "epoch": 3.0357142857142856, + "grad_norm": 2.0403196811676025, + "learning_rate": 3.611111111111111e-05, + "loss": 0.3366, + "step": 4250 + }, + { + "epoch": 3.0428571428571427, + "grad_norm": 1.9888697862625122, + "learning_rate": 3.607407407407408e-05, + "loss": 0.3033, + "step": 4260 + }, + { + "epoch": 3.05, + "grad_norm": 1.3648616075515747, + "learning_rate": 3.603703703703704e-05, + "loss": 0.2874, + "step": 4270 + }, + { + "epoch": 3.057142857142857, + "grad_norm": 2.602613925933838, + "learning_rate": 3.6e-05, + "loss": 0.4086, + "step": 4280 + }, + { + "epoch": 3.064285714285714, + "grad_norm": 2.5918185710906982, + "learning_rate": 3.596296296296296e-05, + "loss": 0.393, + "step": 4290 + }, + { + "epoch": 3.0714285714285716, + "grad_norm": 1.8195433616638184, + "learning_rate": 3.592592592592593e-05, + "loss": 0.3361, + "step": 4300 + }, + { + "epoch": 3.0785714285714287, + "grad_norm": 1.8855136632919312, + "learning_rate": 3.5888888888888886e-05, + "loss": 0.3205, + "step": 4310 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 2.7412662506103516, + "learning_rate": 3.5851851851851854e-05, + "loss": 0.2659, + "step": 4320 + }, + { + "epoch": 3.092857142857143, + "grad_norm": 1.880436658859253, + "learning_rate": 3.5814814814814815e-05, + "loss": 0.49, + "step": 4330 + }, + { + "epoch": 3.1, + "grad_norm": 1.6828274726867676, + "learning_rate": 3.577777777777778e-05, + "loss": 0.2933, + "step": 4340 + }, + { + "epoch": 3.107142857142857, + "grad_norm": 1.0517287254333496, + "learning_rate": 3.574074074074074e-05, + "loss": 0.3563, + "step": 4350 + }, + { + "epoch": 3.1142857142857143, + "grad_norm": 1.3242154121398926, + "learning_rate": 3.570370370370371e-05, + "loss": 0.3765, + "step": 4360 + }, + { + "epoch": 3.1214285714285714, + "grad_norm": 2.0899312496185303, + "learning_rate": 3.566666666666667e-05, + "loss": 0.3664, + "step": 4370 + }, + { + "epoch": 3.1285714285714286, + "grad_norm": 2.0286014080047607, + "learning_rate": 3.562962962962963e-05, + "loss": 0.2622, + "step": 4380 + }, + { + "epoch": 3.1357142857142857, + "grad_norm": 2.5074400901794434, + "learning_rate": 3.559259259259259e-05, + "loss": 0.321, + "step": 4390 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 1.4080287218093872, + "learning_rate": 3.555555555555556e-05, + "loss": 0.4035, + "step": 4400 + }, + { + "epoch": 3.15, + "grad_norm": 1.923890471458435, + "learning_rate": 3.5518518518518515e-05, + "loss": 0.2775, + "step": 4410 + }, + { + "epoch": 3.157142857142857, + "grad_norm": 0.806591272354126, + "learning_rate": 3.548148148148148e-05, + "loss": 0.3149, + "step": 4420 + }, + { + "epoch": 3.164285714285714, + "grad_norm": 2.197736978530884, + "learning_rate": 3.5444444444444445e-05, + "loss": 0.4368, + "step": 4430 + }, + { + "epoch": 3.1714285714285713, + "grad_norm": 1.6943881511688232, + "learning_rate": 3.540740740740741e-05, + "loss": 0.2793, + "step": 4440 + }, + { + "epoch": 3.1785714285714284, + "grad_norm": 2.5460283756256104, + "learning_rate": 3.537037037037037e-05, + "loss": 0.4057, + "step": 4450 + }, + { + "epoch": 3.185714285714286, + "grad_norm": 1.579908013343811, + "learning_rate": 3.5333333333333336e-05, + "loss": 0.3016, + "step": 4460 + }, + { + "epoch": 3.192857142857143, + "grad_norm": 1.9137247800827026, + "learning_rate": 3.52962962962963e-05, + "loss": 0.3437, + "step": 4470 + }, + { + "epoch": 3.2, + "grad_norm": 2.510328769683838, + "learning_rate": 3.525925925925926e-05, + "loss": 0.585, + "step": 4480 + }, + { + "epoch": 3.2071428571428573, + "grad_norm": 0.9775506854057312, + "learning_rate": 3.522222222222222e-05, + "loss": 0.2651, + "step": 4490 + }, + { + "epoch": 3.2142857142857144, + "grad_norm": 1.7614684104919434, + "learning_rate": 3.518518518518519e-05, + "loss": 0.3089, + "step": 4500 + }, + { + "epoch": 3.2214285714285715, + "grad_norm": 1.9103621244430542, + "learning_rate": 3.514814814814815e-05, + "loss": 0.342, + "step": 4510 + }, + { + "epoch": 3.2285714285714286, + "grad_norm": 1.4587639570236206, + "learning_rate": 3.511111111111111e-05, + "loss": 0.2592, + "step": 4520 + }, + { + "epoch": 3.2357142857142858, + "grad_norm": 1.3419288396835327, + "learning_rate": 3.5074074074074074e-05, + "loss": 0.4185, + "step": 4530 + }, + { + "epoch": 3.242857142857143, + "grad_norm": 1.6199047565460205, + "learning_rate": 3.503703703703704e-05, + "loss": 0.256, + "step": 4540 + }, + { + "epoch": 3.25, + "grad_norm": 1.230350136756897, + "learning_rate": 3.5e-05, + "loss": 0.3304, + "step": 4550 + }, + { + "epoch": 3.257142857142857, + "grad_norm": 3.087888240814209, + "learning_rate": 3.4962962962962965e-05, + "loss": 0.3351, + "step": 4560 + }, + { + "epoch": 3.2642857142857142, + "grad_norm": 1.4498260021209717, + "learning_rate": 3.492592592592593e-05, + "loss": 0.2753, + "step": 4570 + }, + { + "epoch": 3.2714285714285714, + "grad_norm": 1.1032336950302124, + "learning_rate": 3.4888888888888895e-05, + "loss": 0.3709, + "step": 4580 + }, + { + "epoch": 3.2785714285714285, + "grad_norm": 1.5177497863769531, + "learning_rate": 3.485185185185185e-05, + "loss": 0.276, + "step": 4590 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 1.2596136331558228, + "learning_rate": 3.481481481481482e-05, + "loss": 0.3482, + "step": 4600 + }, + { + "epoch": 3.2928571428571427, + "grad_norm": 1.9895663261413574, + "learning_rate": 3.477777777777778e-05, + "loss": 0.3738, + "step": 4610 + }, + { + "epoch": 3.3, + "grad_norm": 1.2930881977081299, + "learning_rate": 3.474074074074074e-05, + "loss": 0.4263, + "step": 4620 + }, + { + "epoch": 3.307142857142857, + "grad_norm": 2.276385545730591, + "learning_rate": 3.47037037037037e-05, + "loss": 0.2267, + "step": 4630 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 0.9766007661819458, + "learning_rate": 3.466666666666667e-05, + "loss": 0.2217, + "step": 4640 + }, + { + "epoch": 3.3214285714285716, + "grad_norm": 1.5184674263000488, + "learning_rate": 3.4629629629629626e-05, + "loss": 0.2788, + "step": 4650 + }, + { + "epoch": 3.3285714285714287, + "grad_norm": 1.5145732164382935, + "learning_rate": 3.4592592592592594e-05, + "loss": 0.3291, + "step": 4660 + }, + { + "epoch": 3.335714285714286, + "grad_norm": 1.4273874759674072, + "learning_rate": 3.4555555555555556e-05, + "loss": 0.2854, + "step": 4670 + }, + { + "epoch": 3.342857142857143, + "grad_norm": 2.783701181411743, + "learning_rate": 3.4518518518518524e-05, + "loss": 0.3518, + "step": 4680 + }, + { + "epoch": 3.35, + "grad_norm": 1.3359688520431519, + "learning_rate": 3.448148148148148e-05, + "loss": 0.2239, + "step": 4690 + }, + { + "epoch": 3.357142857142857, + "grad_norm": 2.246824264526367, + "learning_rate": 3.444444444444445e-05, + "loss": 0.3206, + "step": 4700 + }, + { + "epoch": 3.3642857142857143, + "grad_norm": 1.7839916944503784, + "learning_rate": 3.440740740740741e-05, + "loss": 0.3189, + "step": 4710 + }, + { + "epoch": 3.3714285714285714, + "grad_norm": 1.0196881294250488, + "learning_rate": 3.437037037037037e-05, + "loss": 0.2318, + "step": 4720 + }, + { + "epoch": 3.3785714285714286, + "grad_norm": 2.228317975997925, + "learning_rate": 3.433333333333333e-05, + "loss": 0.4033, + "step": 4730 + }, + { + "epoch": 3.3857142857142857, + "grad_norm": 2.0231473445892334, + "learning_rate": 3.42962962962963e-05, + "loss": 0.3854, + "step": 4740 + }, + { + "epoch": 3.392857142857143, + "grad_norm": 2.074925422668457, + "learning_rate": 3.425925925925926e-05, + "loss": 0.3778, + "step": 4750 + }, + { + "epoch": 3.4, + "grad_norm": 1.2508392333984375, + "learning_rate": 3.4222222222222224e-05, + "loss": 0.3299, + "step": 4760 + }, + { + "epoch": 3.407142857142857, + "grad_norm": 1.0920076370239258, + "learning_rate": 3.4185185185185185e-05, + "loss": 0.3798, + "step": 4770 + }, + { + "epoch": 3.414285714285714, + "grad_norm": 1.8113828897476196, + "learning_rate": 3.4148148148148153e-05, + "loss": 0.2903, + "step": 4780 + }, + { + "epoch": 3.4214285714285713, + "grad_norm": 1.6218737363815308, + "learning_rate": 3.411111111111111e-05, + "loss": 0.2593, + "step": 4790 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 1.0635234117507935, + "learning_rate": 3.4074074074074077e-05, + "loss": 0.4388, + "step": 4800 + }, + { + "epoch": 3.435714285714286, + "grad_norm": 2.585700273513794, + "learning_rate": 3.403703703703704e-05, + "loss": 0.3368, + "step": 4810 + }, + { + "epoch": 3.442857142857143, + "grad_norm": 1.0704694986343384, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.2196, + "step": 4820 + }, + { + "epoch": 3.45, + "grad_norm": 1.3177589178085327, + "learning_rate": 3.396296296296296e-05, + "loss": 0.3104, + "step": 4830 + }, + { + "epoch": 3.4571428571428573, + "grad_norm": 1.834241271018982, + "learning_rate": 3.392592592592593e-05, + "loss": 0.3413, + "step": 4840 + }, + { + "epoch": 3.4642857142857144, + "grad_norm": 1.8859339952468872, + "learning_rate": 3.388888888888889e-05, + "loss": 0.2593, + "step": 4850 + }, + { + "epoch": 3.4714285714285715, + "grad_norm": 1.452728271484375, + "learning_rate": 3.385185185185185e-05, + "loss": 0.3029, + "step": 4860 + }, + { + "epoch": 3.4785714285714286, + "grad_norm": 2.170774221420288, + "learning_rate": 3.3814814814814814e-05, + "loss": 0.3372, + "step": 4870 + }, + { + "epoch": 3.4857142857142858, + "grad_norm": 1.8695834875106812, + "learning_rate": 3.377777777777778e-05, + "loss": 0.3428, + "step": 4880 + }, + { + "epoch": 3.492857142857143, + "grad_norm": 1.74647855758667, + "learning_rate": 3.3740740740740744e-05, + "loss": 0.3351, + "step": 4890 + }, + { + "epoch": 3.5, + "grad_norm": 2.3349127769470215, + "learning_rate": 3.3703703703703706e-05, + "loss": 0.2733, + "step": 4900 + }, + { + "epoch": 3.507142857142857, + "grad_norm": 2.73463773727417, + "learning_rate": 3.366666666666667e-05, + "loss": 0.2979, + "step": 4910 + }, + { + "epoch": 3.5142857142857142, + "grad_norm": 1.3546210527420044, + "learning_rate": 3.3629629629629636e-05, + "loss": 0.3521, + "step": 4920 + }, + { + "epoch": 3.5214285714285714, + "grad_norm": 1.617336630821228, + "learning_rate": 3.359259259259259e-05, + "loss": 0.2758, + "step": 4930 + }, + { + "epoch": 3.5285714285714285, + "grad_norm": 2.998967409133911, + "learning_rate": 3.355555555555556e-05, + "loss": 0.4193, + "step": 4940 + }, + { + "epoch": 3.5357142857142856, + "grad_norm": 1.8004390001296997, + "learning_rate": 3.351851851851852e-05, + "loss": 0.3936, + "step": 4950 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 1.4228971004486084, + "learning_rate": 3.348148148148148e-05, + "loss": 0.3563, + "step": 4960 + }, + { + "epoch": 3.55, + "grad_norm": 1.5617480278015137, + "learning_rate": 3.3444444444444443e-05, + "loss": 0.2492, + "step": 4970 + }, + { + "epoch": 3.557142857142857, + "grad_norm": 1.3880919218063354, + "learning_rate": 3.340740740740741e-05, + "loss": 0.1791, + "step": 4980 + }, + { + "epoch": 3.564285714285714, + "grad_norm": 2.3505630493164062, + "learning_rate": 3.337037037037037e-05, + "loss": 0.4009, + "step": 4990 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.9086794853210449, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.2826, + "step": 5000 + }, + { + "epoch": 3.571428571428571, + "eval_loss": 0.3789908289909363, + "eval_rouge1": 0.8999, + "eval_rouge2": 0.8372, + "eval_rougeL": 0.8969, + "eval_runtime": 122.23, + "eval_samples_per_second": 11.454, + "eval_steps_per_second": 5.727, + "step": 5000 + }, + { + "epoch": 3.5785714285714287, + "grad_norm": 1.0208678245544434, + "learning_rate": 3.3296296296296296e-05, + "loss": 0.295, + "step": 5010 + }, + { + "epoch": 3.585714285714286, + "grad_norm": 3.03141713142395, + "learning_rate": 3.3259259259259265e-05, + "loss": 0.3813, + "step": 5020 + }, + { + "epoch": 3.592857142857143, + "grad_norm": 1.7845333814620972, + "learning_rate": 3.322222222222222e-05, + "loss": 0.2526, + "step": 5030 + }, + { + "epoch": 3.6, + "grad_norm": 4.314096450805664, + "learning_rate": 3.318518518518519e-05, + "loss": 0.3498, + "step": 5040 + }, + { + "epoch": 3.607142857142857, + "grad_norm": 1.5270274877548218, + "learning_rate": 3.314814814814815e-05, + "loss": 0.3204, + "step": 5050 + }, + { + "epoch": 3.6142857142857143, + "grad_norm": 2.036738157272339, + "learning_rate": 3.311111111111112e-05, + "loss": 0.3416, + "step": 5060 + }, + { + "epoch": 3.6214285714285714, + "grad_norm": 2.2504570484161377, + "learning_rate": 3.307407407407407e-05, + "loss": 0.3781, + "step": 5070 + }, + { + "epoch": 3.6285714285714286, + "grad_norm": 1.749518632888794, + "learning_rate": 3.303703703703704e-05, + "loss": 0.2299, + "step": 5080 + }, + { + "epoch": 3.6357142857142857, + "grad_norm": 2.1878907680511475, + "learning_rate": 3.3e-05, + "loss": 0.3692, + "step": 5090 + }, + { + "epoch": 3.642857142857143, + "grad_norm": 1.829394817352295, + "learning_rate": 3.2962962962962964e-05, + "loss": 0.3095, + "step": 5100 + }, + { + "epoch": 3.65, + "grad_norm": 2.5994794368743896, + "learning_rate": 3.2925925925925926e-05, + "loss": 0.431, + "step": 5110 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 1.2319742441177368, + "learning_rate": 3.2888888888888894e-05, + "loss": 0.336, + "step": 5120 + }, + { + "epoch": 3.664285714285714, + "grad_norm": 2.169063091278076, + "learning_rate": 3.2851851851851856e-05, + "loss": 0.293, + "step": 5130 + }, + { + "epoch": 3.6714285714285713, + "grad_norm": 1.7120137214660645, + "learning_rate": 3.281481481481482e-05, + "loss": 0.3439, + "step": 5140 + }, + { + "epoch": 3.678571428571429, + "grad_norm": 1.5415689945220947, + "learning_rate": 3.277777777777778e-05, + "loss": 0.3912, + "step": 5150 + }, + { + "epoch": 3.685714285714286, + "grad_norm": 2.2880282402038574, + "learning_rate": 3.274074074074075e-05, + "loss": 0.2352, + "step": 5160 + }, + { + "epoch": 3.692857142857143, + "grad_norm": 1.7133980989456177, + "learning_rate": 3.27037037037037e-05, + "loss": 0.5397, + "step": 5170 + }, + { + "epoch": 3.7, + "grad_norm": 1.9661128520965576, + "learning_rate": 3.266666666666667e-05, + "loss": 0.4496, + "step": 5180 + }, + { + "epoch": 3.7071428571428573, + "grad_norm": 1.444551944732666, + "learning_rate": 3.262962962962963e-05, + "loss": 0.3201, + "step": 5190 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 1.7919954061508179, + "learning_rate": 3.25925925925926e-05, + "loss": 0.3721, + "step": 5200 + }, + { + "epoch": 3.7214285714285715, + "grad_norm": 2.4862735271453857, + "learning_rate": 3.2555555555555555e-05, + "loss": 0.2511, + "step": 5210 + }, + { + "epoch": 3.7285714285714286, + "grad_norm": 1.0694047212600708, + "learning_rate": 3.251851851851852e-05, + "loss": 0.1418, + "step": 5220 + }, + { + "epoch": 3.7357142857142858, + "grad_norm": 2.4438931941986084, + "learning_rate": 3.2481481481481485e-05, + "loss": 0.2473, + "step": 5230 + }, + { + "epoch": 3.742857142857143, + "grad_norm": 1.9673523902893066, + "learning_rate": 3.2444444444444446e-05, + "loss": 0.3251, + "step": 5240 + }, + { + "epoch": 3.75, + "grad_norm": 2.5299620628356934, + "learning_rate": 3.240740740740741e-05, + "loss": 0.3862, + "step": 5250 + }, + { + "epoch": 3.757142857142857, + "grad_norm": 1.1709238290786743, + "learning_rate": 3.2370370370370376e-05, + "loss": 0.3156, + "step": 5260 + }, + { + "epoch": 3.7642857142857142, + "grad_norm": 1.4275505542755127, + "learning_rate": 3.233333333333333e-05, + "loss": 0.3091, + "step": 5270 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 1.5278127193450928, + "learning_rate": 3.22962962962963e-05, + "loss": 0.3768, + "step": 5280 + }, + { + "epoch": 3.7785714285714285, + "grad_norm": 2.870471239089966, + "learning_rate": 3.225925925925926e-05, + "loss": 0.4264, + "step": 5290 + }, + { + "epoch": 3.7857142857142856, + "grad_norm": 1.4797722101211548, + "learning_rate": 3.222222222222223e-05, + "loss": 0.3598, + "step": 5300 + }, + { + "epoch": 3.7928571428571427, + "grad_norm": 1.6350576877593994, + "learning_rate": 3.2185185185185184e-05, + "loss": 0.2125, + "step": 5310 + }, + { + "epoch": 3.8, + "grad_norm": 1.8790502548217773, + "learning_rate": 3.214814814814815e-05, + "loss": 0.2698, + "step": 5320 + }, + { + "epoch": 3.807142857142857, + "grad_norm": 1.3930083513259888, + "learning_rate": 3.2111111111111114e-05, + "loss": 0.3867, + "step": 5330 + }, + { + "epoch": 3.814285714285714, + "grad_norm": 1.7605199813842773, + "learning_rate": 3.2074074074074075e-05, + "loss": 0.3594, + "step": 5340 + }, + { + "epoch": 3.821428571428571, + "grad_norm": 2.3873794078826904, + "learning_rate": 3.203703703703704e-05, + "loss": 0.372, + "step": 5350 + }, + { + "epoch": 3.8285714285714287, + "grad_norm": 3.087186098098755, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.3964, + "step": 5360 + }, + { + "epoch": 3.835714285714286, + "grad_norm": 1.6758490800857544, + "learning_rate": 3.196296296296297e-05, + "loss": 0.3274, + "step": 5370 + }, + { + "epoch": 3.842857142857143, + "grad_norm": 1.184205412864685, + "learning_rate": 3.192592592592593e-05, + "loss": 0.277, + "step": 5380 + }, + { + "epoch": 3.85, + "grad_norm": 2.1282460689544678, + "learning_rate": 3.188888888888889e-05, + "loss": 0.3283, + "step": 5390 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 1.9244283437728882, + "learning_rate": 3.185185185185185e-05, + "loss": 0.2732, + "step": 5400 + }, + { + "epoch": 3.8642857142857143, + "grad_norm": 1.2328709363937378, + "learning_rate": 3.181481481481481e-05, + "loss": 0.2968, + "step": 5410 + }, + { + "epoch": 3.8714285714285714, + "grad_norm": 2.5490071773529053, + "learning_rate": 3.177777777777778e-05, + "loss": 0.3258, + "step": 5420 + }, + { + "epoch": 3.8785714285714286, + "grad_norm": 1.7774560451507568, + "learning_rate": 3.174074074074074e-05, + "loss": 0.3274, + "step": 5430 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.9900962710380554, + "learning_rate": 3.1703703703703705e-05, + "loss": 0.3361, + "step": 5440 + }, + { + "epoch": 3.892857142857143, + "grad_norm": 1.2809844017028809, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.3684, + "step": 5450 + }, + { + "epoch": 3.9, + "grad_norm": 2.2611334323883057, + "learning_rate": 3.1629629629629634e-05, + "loss": 0.326, + "step": 5460 + }, + { + "epoch": 3.907142857142857, + "grad_norm": 2.49057936668396, + "learning_rate": 3.1592592592592596e-05, + "loss": 0.412, + "step": 5470 + }, + { + "epoch": 3.914285714285714, + "grad_norm": 1.6978118419647217, + "learning_rate": 3.155555555555556e-05, + "loss": 0.2177, + "step": 5480 + }, + { + "epoch": 3.9214285714285713, + "grad_norm": 1.847128987312317, + "learning_rate": 3.151851851851852e-05, + "loss": 0.3419, + "step": 5490 + }, + { + "epoch": 3.928571428571429, + "grad_norm": 1.6806657314300537, + "learning_rate": 3.148148148148148e-05, + "loss": 0.1479, + "step": 5500 + }, + { + "epoch": 3.935714285714286, + "grad_norm": 2.144227981567383, + "learning_rate": 3.144444444444445e-05, + "loss": 0.3098, + "step": 5510 + }, + { + "epoch": 3.942857142857143, + "grad_norm": 1.2945857048034668, + "learning_rate": 3.140740740740741e-05, + "loss": 0.269, + "step": 5520 + }, + { + "epoch": 3.95, + "grad_norm": 1.8362900018692017, + "learning_rate": 3.137037037037037e-05, + "loss": 0.3065, + "step": 5530 + }, + { + "epoch": 3.9571428571428573, + "grad_norm": 1.9124987125396729, + "learning_rate": 3.1333333333333334e-05, + "loss": 0.2593, + "step": 5540 + }, + { + "epoch": 3.9642857142857144, + "grad_norm": 1.726523995399475, + "learning_rate": 3.1296296296296295e-05, + "loss": 0.3112, + "step": 5550 + }, + { + "epoch": 3.9714285714285715, + "grad_norm": 1.5914565324783325, + "learning_rate": 3.1259259259259264e-05, + "loss": 0.263, + "step": 5560 + }, + { + "epoch": 3.9785714285714286, + "grad_norm": 1.3533891439437866, + "learning_rate": 3.1222222222222225e-05, + "loss": 0.3852, + "step": 5570 + }, + { + "epoch": 3.9857142857142858, + "grad_norm": 2.1844253540039062, + "learning_rate": 3.118518518518519e-05, + "loss": 0.3761, + "step": 5580 + }, + { + "epoch": 3.992857142857143, + "grad_norm": 2.494920492172241, + "learning_rate": 3.114814814814815e-05, + "loss": 0.3882, + "step": 5590 + }, + { + "epoch": 4.0, + "grad_norm": 0.9914864897727966, + "learning_rate": 3.111111111111111e-05, + "loss": 0.3518, + "step": 5600 + }, + { + "epoch": 4.007142857142857, + "grad_norm": 1.6416865587234497, + "learning_rate": 3.107407407407408e-05, + "loss": 0.2688, + "step": 5610 + }, + { + "epoch": 4.014285714285714, + "grad_norm": 1.934449315071106, + "learning_rate": 3.103703703703704e-05, + "loss": 0.2385, + "step": 5620 + }, + { + "epoch": 4.021428571428571, + "grad_norm": 1.7663776874542236, + "learning_rate": 3.1e-05, + "loss": 0.3147, + "step": 5630 + }, + { + "epoch": 4.0285714285714285, + "grad_norm": 1.8457096815109253, + "learning_rate": 3.096296296296296e-05, + "loss": 0.2922, + "step": 5640 + }, + { + "epoch": 4.035714285714286, + "grad_norm": 1.133711338043213, + "learning_rate": 3.0925925925925924e-05, + "loss": 0.2291, + "step": 5650 + }, + { + "epoch": 4.042857142857143, + "grad_norm": 1.794723629951477, + "learning_rate": 3.088888888888889e-05, + "loss": 0.3204, + "step": 5660 + }, + { + "epoch": 4.05, + "grad_norm": 1.966180443763733, + "learning_rate": 3.0851851851851854e-05, + "loss": 0.2757, + "step": 5670 + }, + { + "epoch": 4.057142857142857, + "grad_norm": 0.789313018321991, + "learning_rate": 3.0814814814814816e-05, + "loss": 0.3106, + "step": 5680 + }, + { + "epoch": 4.064285714285714, + "grad_norm": 1.4390606880187988, + "learning_rate": 3.077777777777778e-05, + "loss": 0.192, + "step": 5690 + }, + { + "epoch": 4.071428571428571, + "grad_norm": 1.8229310512542725, + "learning_rate": 3.074074074074074e-05, + "loss": 0.3802, + "step": 5700 + }, + { + "epoch": 4.078571428571428, + "grad_norm": 1.3065968751907349, + "learning_rate": 3.070370370370371e-05, + "loss": 0.2891, + "step": 5710 + }, + { + "epoch": 4.085714285714285, + "grad_norm": 1.5169206857681274, + "learning_rate": 3.066666666666667e-05, + "loss": 0.2818, + "step": 5720 + }, + { + "epoch": 4.0928571428571425, + "grad_norm": 1.8811321258544922, + "learning_rate": 3.062962962962963e-05, + "loss": 0.1845, + "step": 5730 + }, + { + "epoch": 4.1, + "grad_norm": 2.2235770225524902, + "learning_rate": 3.059259259259259e-05, + "loss": 0.3671, + "step": 5740 + }, + { + "epoch": 4.107142857142857, + "grad_norm": 1.5675430297851562, + "learning_rate": 3.055555555555556e-05, + "loss": 0.3588, + "step": 5750 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 1.3254741430282593, + "learning_rate": 3.0518518518518515e-05, + "loss": 0.3641, + "step": 5760 + }, + { + "epoch": 4.121428571428571, + "grad_norm": 2.601593017578125, + "learning_rate": 3.0481481481481484e-05, + "loss": 0.2704, + "step": 5770 + }, + { + "epoch": 4.128571428571428, + "grad_norm": 2.3631677627563477, + "learning_rate": 3.044444444444445e-05, + "loss": 0.2528, + "step": 5780 + }, + { + "epoch": 4.135714285714286, + "grad_norm": 1.4800968170166016, + "learning_rate": 3.0407407407407407e-05, + "loss": 0.263, + "step": 5790 + }, + { + "epoch": 4.142857142857143, + "grad_norm": 1.6989574432373047, + "learning_rate": 3.037037037037037e-05, + "loss": 0.2465, + "step": 5800 + }, + { + "epoch": 4.15, + "grad_norm": 1.595765471458435, + "learning_rate": 3.0333333333333337e-05, + "loss": 0.3223, + "step": 5810 + }, + { + "epoch": 4.1571428571428575, + "grad_norm": 1.8895677328109741, + "learning_rate": 3.02962962962963e-05, + "loss": 0.3181, + "step": 5820 + }, + { + "epoch": 4.164285714285715, + "grad_norm": 1.147406816482544, + "learning_rate": 3.025925925925926e-05, + "loss": 0.2275, + "step": 5830 + }, + { + "epoch": 4.171428571428572, + "grad_norm": 3.310147523880005, + "learning_rate": 3.0222222222222225e-05, + "loss": 0.3615, + "step": 5840 + }, + { + "epoch": 4.178571428571429, + "grad_norm": 1.6138179302215576, + "learning_rate": 3.018518518518519e-05, + "loss": 0.3492, + "step": 5850 + }, + { + "epoch": 4.185714285714286, + "grad_norm": 1.9912358522415161, + "learning_rate": 3.0148148148148148e-05, + "loss": 0.3358, + "step": 5860 + }, + { + "epoch": 4.192857142857143, + "grad_norm": 2.2521820068359375, + "learning_rate": 3.0111111111111113e-05, + "loss": 0.2773, + "step": 5870 + }, + { + "epoch": 4.2, + "grad_norm": 1.804829478263855, + "learning_rate": 3.0074074074074078e-05, + "loss": 0.3052, + "step": 5880 + }, + { + "epoch": 4.207142857142857, + "grad_norm": 1.0897246599197388, + "learning_rate": 3.0037037037037036e-05, + "loss": 0.3822, + "step": 5890 + }, + { + "epoch": 4.214285714285714, + "grad_norm": 1.337428331375122, + "learning_rate": 3e-05, + "loss": 0.3091, + "step": 5900 + }, + { + "epoch": 4.2214285714285715, + "grad_norm": 1.1409244537353516, + "learning_rate": 2.9962962962962966e-05, + "loss": 0.2002, + "step": 5910 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.9190034866333008, + "learning_rate": 2.992592592592593e-05, + "loss": 0.3029, + "step": 5920 + }, + { + "epoch": 4.235714285714286, + "grad_norm": 1.7410012483596802, + "learning_rate": 2.988888888888889e-05, + "loss": 0.2361, + "step": 5930 + }, + { + "epoch": 4.242857142857143, + "grad_norm": 2.308295965194702, + "learning_rate": 2.9851851851851854e-05, + "loss": 0.3654, + "step": 5940 + }, + { + "epoch": 4.25, + "grad_norm": 1.299177646636963, + "learning_rate": 2.981481481481482e-05, + "loss": 0.2346, + "step": 5950 + }, + { + "epoch": 4.257142857142857, + "grad_norm": 1.0352667570114136, + "learning_rate": 2.9777777777777777e-05, + "loss": 0.2331, + "step": 5960 + }, + { + "epoch": 4.264285714285714, + "grad_norm": 1.0682189464569092, + "learning_rate": 2.9740740740740742e-05, + "loss": 0.2456, + "step": 5970 + }, + { + "epoch": 4.271428571428571, + "grad_norm": 1.536718487739563, + "learning_rate": 2.9703703703703707e-05, + "loss": 0.1908, + "step": 5980 + }, + { + "epoch": 4.2785714285714285, + "grad_norm": 2.0448334217071533, + "learning_rate": 2.9666666666666672e-05, + "loss": 0.3399, + "step": 5990 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 2.205901622772217, + "learning_rate": 2.962962962962963e-05, + "loss": 0.1913, + "step": 6000 + }, + { + "epoch": 4.285714285714286, + "eval_loss": 0.36299219727516174, + "eval_rouge1": 0.9025, + "eval_rouge2": 0.8402, + "eval_rougeL": 0.8994, + "eval_runtime": 122.2765, + "eval_samples_per_second": 11.449, + "eval_steps_per_second": 5.725, + "step": 6000 + }, + { + "epoch": 4.292857142857143, + "grad_norm": 1.455069661140442, + "learning_rate": 2.9592592592592595e-05, + "loss": 0.2236, + "step": 6010 + }, + { + "epoch": 4.3, + "grad_norm": 1.6218276023864746, + "learning_rate": 2.955555555555556e-05, + "loss": 0.2166, + "step": 6020 + }, + { + "epoch": 4.307142857142857, + "grad_norm": 1.4643278121948242, + "learning_rate": 2.9518518518518518e-05, + "loss": 0.2543, + "step": 6030 + }, + { + "epoch": 4.314285714285714, + "grad_norm": 1.9875061511993408, + "learning_rate": 2.9481481481481483e-05, + "loss": 0.275, + "step": 6040 + }, + { + "epoch": 4.321428571428571, + "grad_norm": 2.003077268600464, + "learning_rate": 2.9444444444444448e-05, + "loss": 0.3431, + "step": 6050 + }, + { + "epoch": 4.328571428571428, + "grad_norm": 1.332705020904541, + "learning_rate": 2.9407407407407413e-05, + "loss": 0.2546, + "step": 6060 + }, + { + "epoch": 4.335714285714285, + "grad_norm": 1.9161280393600464, + "learning_rate": 2.937037037037037e-05, + "loss": 0.2909, + "step": 6070 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 1.509238839149475, + "learning_rate": 2.9333333333333336e-05, + "loss": 0.253, + "step": 6080 + }, + { + "epoch": 4.35, + "grad_norm": 2.238847255706787, + "learning_rate": 2.92962962962963e-05, + "loss": 0.2717, + "step": 6090 + }, + { + "epoch": 4.357142857142857, + "grad_norm": 1.9578133821487427, + "learning_rate": 2.925925925925926e-05, + "loss": 0.3407, + "step": 6100 + }, + { + "epoch": 4.364285714285714, + "grad_norm": 1.805828332901001, + "learning_rate": 2.9222222222222224e-05, + "loss": 0.1811, + "step": 6110 + }, + { + "epoch": 4.371428571428572, + "grad_norm": 2.9014134407043457, + "learning_rate": 2.918518518518519e-05, + "loss": 0.3934, + "step": 6120 + }, + { + "epoch": 4.378571428571428, + "grad_norm": 1.9857615232467651, + "learning_rate": 2.914814814814815e-05, + "loss": 0.2026, + "step": 6130 + }, + { + "epoch": 4.385714285714286, + "grad_norm": 2.3884503841400146, + "learning_rate": 2.9111111111111112e-05, + "loss": 0.2787, + "step": 6140 + }, + { + "epoch": 4.392857142857143, + "grad_norm": 2.298215866088867, + "learning_rate": 2.9074074074074077e-05, + "loss": 0.2765, + "step": 6150 + }, + { + "epoch": 4.4, + "grad_norm": 2.1733076572418213, + "learning_rate": 2.9037037037037042e-05, + "loss": 0.3975, + "step": 6160 + }, + { + "epoch": 4.4071428571428575, + "grad_norm": 3.3003320693969727, + "learning_rate": 2.9e-05, + "loss": 0.4152, + "step": 6170 + }, + { + "epoch": 4.414285714285715, + "grad_norm": 1.5066970586776733, + "learning_rate": 2.8962962962962965e-05, + "loss": 0.345, + "step": 6180 + }, + { + "epoch": 4.421428571428572, + "grad_norm": 2.134096145629883, + "learning_rate": 2.892592592592593e-05, + "loss": 0.3154, + "step": 6190 + }, + { + "epoch": 4.428571428571429, + "grad_norm": 1.8306220769882202, + "learning_rate": 2.8888888888888888e-05, + "loss": 0.2908, + "step": 6200 + }, + { + "epoch": 4.435714285714286, + "grad_norm": 1.4300037622451782, + "learning_rate": 2.8851851851851853e-05, + "loss": 0.342, + "step": 6210 + }, + { + "epoch": 4.442857142857143, + "grad_norm": 1.6552793979644775, + "learning_rate": 2.8814814814814818e-05, + "loss": 0.2856, + "step": 6220 + }, + { + "epoch": 4.45, + "grad_norm": 2.188889265060425, + "learning_rate": 2.877777777777778e-05, + "loss": 0.25, + "step": 6230 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 1.3003034591674805, + "learning_rate": 2.874074074074074e-05, + "loss": 0.2995, + "step": 6240 + }, + { + "epoch": 4.464285714285714, + "grad_norm": 1.834549903869629, + "learning_rate": 2.8703703703703706e-05, + "loss": 0.3726, + "step": 6250 + }, + { + "epoch": 4.4714285714285715, + "grad_norm": 1.9426199197769165, + "learning_rate": 2.8666666666666668e-05, + "loss": 0.2142, + "step": 6260 + }, + { + "epoch": 4.478571428571429, + "grad_norm": 1.5088646411895752, + "learning_rate": 2.862962962962963e-05, + "loss": 0.3584, + "step": 6270 + }, + { + "epoch": 4.485714285714286, + "grad_norm": 1.9997400045394897, + "learning_rate": 2.8592592592592594e-05, + "loss": 0.2402, + "step": 6280 + }, + { + "epoch": 4.492857142857143, + "grad_norm": 1.3831549882888794, + "learning_rate": 2.855555555555556e-05, + "loss": 0.312, + "step": 6290 + }, + { + "epoch": 4.5, + "grad_norm": 2.013425588607788, + "learning_rate": 2.851851851851852e-05, + "loss": 0.2728, + "step": 6300 + }, + { + "epoch": 4.507142857142857, + "grad_norm": 1.1200778484344482, + "learning_rate": 2.8481481481481482e-05, + "loss": 0.3909, + "step": 6310 + }, + { + "epoch": 4.514285714285714, + "grad_norm": 0.8029781579971313, + "learning_rate": 2.8444444444444447e-05, + "loss": 0.3491, + "step": 6320 + }, + { + "epoch": 4.521428571428571, + "grad_norm": 1.4999722242355347, + "learning_rate": 2.840740740740741e-05, + "loss": 0.2583, + "step": 6330 + }, + { + "epoch": 4.5285714285714285, + "grad_norm": 1.8954156637191772, + "learning_rate": 2.837037037037037e-05, + "loss": 0.3971, + "step": 6340 + }, + { + "epoch": 4.535714285714286, + "grad_norm": 1.5697578191757202, + "learning_rate": 2.8333333333333335e-05, + "loss": 0.3222, + "step": 6350 + }, + { + "epoch": 4.542857142857143, + "grad_norm": 0.9937646389007568, + "learning_rate": 2.8296296296296297e-05, + "loss": 0.3673, + "step": 6360 + }, + { + "epoch": 4.55, + "grad_norm": 1.935511589050293, + "learning_rate": 2.8259259259259262e-05, + "loss": 0.2385, + "step": 6370 + }, + { + "epoch": 4.557142857142857, + "grad_norm": 1.8132340908050537, + "learning_rate": 2.8222222222222223e-05, + "loss": 0.226, + "step": 6380 + }, + { + "epoch": 4.564285714285714, + "grad_norm": 0.8551497459411621, + "learning_rate": 2.8185185185185185e-05, + "loss": 0.3874, + "step": 6390 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 2.0115785598754883, + "learning_rate": 2.814814814814815e-05, + "loss": 0.2328, + "step": 6400 + }, + { + "epoch": 4.578571428571428, + "grad_norm": 1.0582072734832764, + "learning_rate": 2.811111111111111e-05, + "loss": 0.3523, + "step": 6410 + }, + { + "epoch": 4.585714285714285, + "grad_norm": 1.3484958410263062, + "learning_rate": 2.8074074074074076e-05, + "loss": 0.2867, + "step": 6420 + }, + { + "epoch": 4.5928571428571425, + "grad_norm": 1.4483561515808105, + "learning_rate": 2.8037037037037038e-05, + "loss": 0.2623, + "step": 6430 + }, + { + "epoch": 4.6, + "grad_norm": 2.2348268032073975, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.3953, + "step": 6440 + }, + { + "epoch": 4.607142857142857, + "grad_norm": 2.654326915740967, + "learning_rate": 2.7962962962962965e-05, + "loss": 0.3516, + "step": 6450 + }, + { + "epoch": 4.614285714285714, + "grad_norm": 0.8564252257347107, + "learning_rate": 2.7925925925925926e-05, + "loss": 0.2497, + "step": 6460 + }, + { + "epoch": 4.621428571428572, + "grad_norm": 2.7823233604431152, + "learning_rate": 2.788888888888889e-05, + "loss": 0.3975, + "step": 6470 + }, + { + "epoch": 4.628571428571428, + "grad_norm": 1.0915263891220093, + "learning_rate": 2.7851851851851853e-05, + "loss": 0.2574, + "step": 6480 + }, + { + "epoch": 4.635714285714286, + "grad_norm": 1.0459774732589722, + "learning_rate": 2.7814814814814814e-05, + "loss": 0.3426, + "step": 6490 + }, + { + "epoch": 4.642857142857143, + "grad_norm": 3.1720130443573, + "learning_rate": 2.777777777777778e-05, + "loss": 0.3155, + "step": 6500 + }, + { + "epoch": 4.65, + "grad_norm": 1.499185562133789, + "learning_rate": 2.774074074074074e-05, + "loss": 0.4515, + "step": 6510 + }, + { + "epoch": 4.6571428571428575, + "grad_norm": 2.4211909770965576, + "learning_rate": 2.7703703703703706e-05, + "loss": 0.2963, + "step": 6520 + }, + { + "epoch": 4.664285714285715, + "grad_norm": 2.167006492614746, + "learning_rate": 2.7666666666666667e-05, + "loss": 0.2625, + "step": 6530 + }, + { + "epoch": 4.671428571428572, + "grad_norm": 1.8955094814300537, + "learning_rate": 2.7629629629629632e-05, + "loss": 0.3374, + "step": 6540 + }, + { + "epoch": 4.678571428571429, + "grad_norm": 0.9967934489250183, + "learning_rate": 2.7592592592592594e-05, + "loss": 0.1611, + "step": 6550 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 1.007778525352478, + "learning_rate": 2.7555555555555555e-05, + "loss": 0.2516, + "step": 6560 + }, + { + "epoch": 4.692857142857143, + "grad_norm": 2.9705958366394043, + "learning_rate": 2.751851851851852e-05, + "loss": 0.3893, + "step": 6570 + }, + { + "epoch": 4.7, + "grad_norm": 2.689723491668701, + "learning_rate": 2.7481481481481482e-05, + "loss": 0.2404, + "step": 6580 + }, + { + "epoch": 4.707142857142857, + "grad_norm": 2.095930337905884, + "learning_rate": 2.7444444444444443e-05, + "loss": 0.3239, + "step": 6590 + }, + { + "epoch": 4.714285714285714, + "grad_norm": 1.9235697984695435, + "learning_rate": 2.7407407407407408e-05, + "loss": 0.2779, + "step": 6600 + }, + { + "epoch": 4.7214285714285715, + "grad_norm": 3.329378843307495, + "learning_rate": 2.7370370370370373e-05, + "loss": 0.2791, + "step": 6610 + }, + { + "epoch": 4.728571428571429, + "grad_norm": 1.9044978618621826, + "learning_rate": 2.733333333333333e-05, + "loss": 0.3757, + "step": 6620 + }, + { + "epoch": 4.735714285714286, + "grad_norm": 2.207752227783203, + "learning_rate": 2.7296296296296296e-05, + "loss": 0.3391, + "step": 6630 + }, + { + "epoch": 4.742857142857143, + "grad_norm": 2.0488827228546143, + "learning_rate": 2.725925925925926e-05, + "loss": 0.396, + "step": 6640 + }, + { + "epoch": 4.75, + "grad_norm": 2.425340414047241, + "learning_rate": 2.7222222222222223e-05, + "loss": 0.2871, + "step": 6650 + }, + { + "epoch": 4.757142857142857, + "grad_norm": 1.9408286809921265, + "learning_rate": 2.7185185185185184e-05, + "loss": 0.3144, + "step": 6660 + }, + { + "epoch": 4.764285714285714, + "grad_norm": 1.864397406578064, + "learning_rate": 2.714814814814815e-05, + "loss": 0.2685, + "step": 6670 + }, + { + "epoch": 4.771428571428571, + "grad_norm": 1.1838607788085938, + "learning_rate": 2.7111111111111114e-05, + "loss": 0.2751, + "step": 6680 + }, + { + "epoch": 4.7785714285714285, + "grad_norm": 2.26408052444458, + "learning_rate": 2.7074074074074072e-05, + "loss": 0.3158, + "step": 6690 + }, + { + "epoch": 4.785714285714286, + "grad_norm": 2.007145404815674, + "learning_rate": 2.7037037037037037e-05, + "loss": 0.1969, + "step": 6700 + }, + { + "epoch": 4.792857142857143, + "grad_norm": 2.5209295749664307, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.3022, + "step": 6710 + }, + { + "epoch": 4.8, + "grad_norm": 2.3263044357299805, + "learning_rate": 2.696296296296296e-05, + "loss": 0.3799, + "step": 6720 + }, + { + "epoch": 4.807142857142857, + "grad_norm": 1.3880634307861328, + "learning_rate": 2.6925925925925925e-05, + "loss": 0.2829, + "step": 6730 + }, + { + "epoch": 4.814285714285714, + "grad_norm": 2.0264179706573486, + "learning_rate": 2.688888888888889e-05, + "loss": 0.2754, + "step": 6740 + }, + { + "epoch": 4.821428571428571, + "grad_norm": 1.6165140867233276, + "learning_rate": 2.6851851851851855e-05, + "loss": 0.3171, + "step": 6750 + }, + { + "epoch": 4.828571428571428, + "grad_norm": 1.6405526399612427, + "learning_rate": 2.6814814814814814e-05, + "loss": 0.4082, + "step": 6760 + }, + { + "epoch": 4.835714285714285, + "grad_norm": 1.6864060163497925, + "learning_rate": 2.677777777777778e-05, + "loss": 0.2026, + "step": 6770 + }, + { + "epoch": 4.8428571428571425, + "grad_norm": 1.4906965494155884, + "learning_rate": 2.6740740740740743e-05, + "loss": 0.2582, + "step": 6780 + }, + { + "epoch": 4.85, + "grad_norm": 1.2227530479431152, + "learning_rate": 2.67037037037037e-05, + "loss": 0.185, + "step": 6790 + }, + { + "epoch": 4.857142857142857, + "grad_norm": 1.2606697082519531, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.2651, + "step": 6800 + }, + { + "epoch": 4.864285714285714, + "grad_norm": 2.3722660541534424, + "learning_rate": 2.662962962962963e-05, + "loss": 0.2746, + "step": 6810 + }, + { + "epoch": 4.871428571428572, + "grad_norm": 1.8622608184814453, + "learning_rate": 2.659259259259259e-05, + "loss": 0.3473, + "step": 6820 + }, + { + "epoch": 4.878571428571428, + "grad_norm": 1.3814878463745117, + "learning_rate": 2.6555555555555555e-05, + "loss": 0.2706, + "step": 6830 + }, + { + "epoch": 4.885714285714286, + "grad_norm": 2.013650894165039, + "learning_rate": 2.651851851851852e-05, + "loss": 0.2802, + "step": 6840 + }, + { + "epoch": 4.892857142857143, + "grad_norm": 1.467282772064209, + "learning_rate": 2.6481481481481485e-05, + "loss": 0.3158, + "step": 6850 + }, + { + "epoch": 4.9, + "grad_norm": 1.3019797801971436, + "learning_rate": 2.6444444444444443e-05, + "loss": 0.2012, + "step": 6860 + }, + { + "epoch": 4.9071428571428575, + "grad_norm": 1.1120600700378418, + "learning_rate": 2.6407407407407408e-05, + "loss": 0.1385, + "step": 6870 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 1.470406413078308, + "learning_rate": 2.6370370370370373e-05, + "loss": 0.3014, + "step": 6880 + }, + { + "epoch": 4.921428571428572, + "grad_norm": 2.237767457962036, + "learning_rate": 2.633333333333333e-05, + "loss": 0.2677, + "step": 6890 + }, + { + "epoch": 4.928571428571429, + "grad_norm": 1.3994693756103516, + "learning_rate": 2.6296296296296296e-05, + "loss": 0.4261, + "step": 6900 + }, + { + "epoch": 4.935714285714286, + "grad_norm": 2.21905517578125, + "learning_rate": 2.625925925925926e-05, + "loss": 0.3701, + "step": 6910 + }, + { + "epoch": 4.942857142857143, + "grad_norm": 2.8682186603546143, + "learning_rate": 2.6222222222222226e-05, + "loss": 0.4047, + "step": 6920 + }, + { + "epoch": 4.95, + "grad_norm": 1.9691041707992554, + "learning_rate": 2.6185185185185184e-05, + "loss": 0.2735, + "step": 6930 + }, + { + "epoch": 4.957142857142857, + "grad_norm": 1.7553354501724243, + "learning_rate": 2.614814814814815e-05, + "loss": 0.2381, + "step": 6940 + }, + { + "epoch": 4.964285714285714, + "grad_norm": 1.7930738925933838, + "learning_rate": 2.6111111111111114e-05, + "loss": 0.2838, + "step": 6950 + }, + { + "epoch": 4.9714285714285715, + "grad_norm": 2.4153687953948975, + "learning_rate": 2.6074074074074072e-05, + "loss": 0.4002, + "step": 6960 + }, + { + "epoch": 4.978571428571429, + "grad_norm": 1.392898678779602, + "learning_rate": 2.6037037037037037e-05, + "loss": 0.248, + "step": 6970 + }, + { + "epoch": 4.985714285714286, + "grad_norm": 1.7113401889801025, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.2871, + "step": 6980 + }, + { + "epoch": 4.992857142857143, + "grad_norm": 2.4877359867095947, + "learning_rate": 2.5962962962962967e-05, + "loss": 0.2443, + "step": 6990 + }, + { + "epoch": 5.0, + "grad_norm": 1.7225149869918823, + "learning_rate": 2.5925925925925925e-05, + "loss": 0.186, + "step": 7000 + }, + { + "epoch": 5.0, + "eval_loss": 0.3584790527820587, + "eval_rouge1": 0.9047, + "eval_rouge2": 0.8434, + "eval_rougeL": 0.9018, + "eval_runtime": 122.2903, + "eval_samples_per_second": 11.448, + "eval_steps_per_second": 5.724, + "step": 7000 + }, + { + "epoch": 5.007142857142857, + "grad_norm": 2.1430020332336426, + "learning_rate": 2.588888888888889e-05, + "loss": 0.3477, + "step": 7010 + }, + { + "epoch": 5.014285714285714, + "grad_norm": 0.958677351474762, + "learning_rate": 2.5851851851851855e-05, + "loss": 0.2474, + "step": 7020 + }, + { + "epoch": 5.021428571428571, + "grad_norm": 2.315269947052002, + "learning_rate": 2.5814814814814813e-05, + "loss": 0.2786, + "step": 7030 + }, + { + "epoch": 5.0285714285714285, + "grad_norm": 1.3595519065856934, + "learning_rate": 2.5777777777777778e-05, + "loss": 0.2286, + "step": 7040 + }, + { + "epoch": 5.035714285714286, + "grad_norm": 1.44675874710083, + "learning_rate": 2.5740740740740743e-05, + "loss": 0.2679, + "step": 7050 + }, + { + "epoch": 5.042857142857143, + "grad_norm": 1.754285454750061, + "learning_rate": 2.5703703703703708e-05, + "loss": 0.196, + "step": 7060 + }, + { + "epoch": 5.05, + "grad_norm": 2.9333369731903076, + "learning_rate": 2.5666666666666666e-05, + "loss": 0.1694, + "step": 7070 + }, + { + "epoch": 5.057142857142857, + "grad_norm": 2.6653859615325928, + "learning_rate": 2.562962962962963e-05, + "loss": 0.2642, + "step": 7080 + }, + { + "epoch": 5.064285714285714, + "grad_norm": 1.8362854719161987, + "learning_rate": 2.5592592592592596e-05, + "loss": 0.3614, + "step": 7090 + }, + { + "epoch": 5.071428571428571, + "grad_norm": 1.427701473236084, + "learning_rate": 2.5555555555555554e-05, + "loss": 0.2351, + "step": 7100 + }, + { + "epoch": 5.078571428571428, + "grad_norm": 2.3684027194976807, + "learning_rate": 2.551851851851852e-05, + "loss": 0.2803, + "step": 7110 + }, + { + "epoch": 5.085714285714285, + "grad_norm": 1.5823931694030762, + "learning_rate": 2.5481481481481484e-05, + "loss": 0.2749, + "step": 7120 + }, + { + "epoch": 5.0928571428571425, + "grad_norm": 1.6682019233703613, + "learning_rate": 2.5444444444444442e-05, + "loss": 0.3219, + "step": 7130 + }, + { + "epoch": 5.1, + "grad_norm": 1.7803760766983032, + "learning_rate": 2.5407407407407407e-05, + "loss": 0.2553, + "step": 7140 + }, + { + "epoch": 5.107142857142857, + "grad_norm": 1.945063591003418, + "learning_rate": 2.5370370370370372e-05, + "loss": 0.1739, + "step": 7150 + }, + { + "epoch": 5.114285714285714, + "grad_norm": 1.308371663093567, + "learning_rate": 2.5333333333333337e-05, + "loss": 0.2605, + "step": 7160 + }, + { + "epoch": 5.121428571428571, + "grad_norm": 1.906160593032837, + "learning_rate": 2.5296296296296295e-05, + "loss": 0.2071, + "step": 7170 + }, + { + "epoch": 5.128571428571428, + "grad_norm": 1.6239346265792847, + "learning_rate": 2.525925925925926e-05, + "loss": 0.2054, + "step": 7180 + }, + { + "epoch": 5.135714285714286, + "grad_norm": 1.6175967454910278, + "learning_rate": 2.5222222222222225e-05, + "loss": 0.2266, + "step": 7190 + }, + { + "epoch": 5.142857142857143, + "grad_norm": 1.938736915588379, + "learning_rate": 2.5185185185185183e-05, + "loss": 0.2932, + "step": 7200 + }, + { + "epoch": 5.15, + "grad_norm": 1.7323144674301147, + "learning_rate": 2.5148148148148148e-05, + "loss": 0.2762, + "step": 7210 + }, + { + "epoch": 5.1571428571428575, + "grad_norm": 1.859667181968689, + "learning_rate": 2.5111111111111113e-05, + "loss": 0.3213, + "step": 7220 + }, + { + "epoch": 5.164285714285715, + "grad_norm": 1.22067391872406, + "learning_rate": 2.5074074074074078e-05, + "loss": 0.2246, + "step": 7230 + }, + { + "epoch": 5.171428571428572, + "grad_norm": 0.9384840726852417, + "learning_rate": 2.5037037037037036e-05, + "loss": 0.3364, + "step": 7240 + }, + { + "epoch": 5.178571428571429, + "grad_norm": 1.4494845867156982, + "learning_rate": 2.5e-05, + "loss": 0.332, + "step": 7250 + }, + { + "epoch": 5.185714285714286, + "grad_norm": 2.3436357975006104, + "learning_rate": 2.4962962962962963e-05, + "loss": 0.1456, + "step": 7260 + }, + { + "epoch": 5.192857142857143, + "grad_norm": 1.0446144342422485, + "learning_rate": 2.4925925925925928e-05, + "loss": 0.1995, + "step": 7270 + }, + { + "epoch": 5.2, + "grad_norm": 2.325575113296509, + "learning_rate": 2.488888888888889e-05, + "loss": 0.3068, + "step": 7280 + }, + { + "epoch": 5.207142857142857, + "grad_norm": 2.100825309753418, + "learning_rate": 2.4851851851851854e-05, + "loss": 0.2659, + "step": 7290 + }, + { + "epoch": 5.214285714285714, + "grad_norm": 2.6580276489257812, + "learning_rate": 2.4814814814814816e-05, + "loss": 0.2872, + "step": 7300 + }, + { + "epoch": 5.2214285714285715, + "grad_norm": 2.505577564239502, + "learning_rate": 2.477777777777778e-05, + "loss": 0.2574, + "step": 7310 + }, + { + "epoch": 5.228571428571429, + "grad_norm": 1.4997559785842896, + "learning_rate": 2.4740740740740742e-05, + "loss": 0.2192, + "step": 7320 + }, + { + "epoch": 5.235714285714286, + "grad_norm": 1.9084120988845825, + "learning_rate": 2.4703703703703704e-05, + "loss": 0.2836, + "step": 7330 + }, + { + "epoch": 5.242857142857143, + "grad_norm": 1.1388484239578247, + "learning_rate": 2.466666666666667e-05, + "loss": 0.2426, + "step": 7340 + }, + { + "epoch": 5.25, + "grad_norm": 1.0559568405151367, + "learning_rate": 2.462962962962963e-05, + "loss": 0.344, + "step": 7350 + }, + { + "epoch": 5.257142857142857, + "grad_norm": 1.4024419784545898, + "learning_rate": 2.4592592592592595e-05, + "loss": 0.2121, + "step": 7360 + }, + { + "epoch": 5.264285714285714, + "grad_norm": 1.4338841438293457, + "learning_rate": 2.4555555555555557e-05, + "loss": 0.3329, + "step": 7370 + }, + { + "epoch": 5.271428571428571, + "grad_norm": 1.4188106060028076, + "learning_rate": 2.451851851851852e-05, + "loss": 0.2479, + "step": 7380 + }, + { + "epoch": 5.2785714285714285, + "grad_norm": 1.4320842027664185, + "learning_rate": 2.4481481481481483e-05, + "loss": 0.156, + "step": 7390 + }, + { + "epoch": 5.285714285714286, + "grad_norm": 3.022641181945801, + "learning_rate": 2.4444444444444445e-05, + "loss": 0.1962, + "step": 7400 + }, + { + "epoch": 5.292857142857143, + "grad_norm": 2.3267366886138916, + "learning_rate": 2.440740740740741e-05, + "loss": 0.2713, + "step": 7410 + }, + { + "epoch": 5.3, + "grad_norm": 2.685345411300659, + "learning_rate": 2.437037037037037e-05, + "loss": 0.3345, + "step": 7420 + }, + { + "epoch": 5.307142857142857, + "grad_norm": 0.9320240020751953, + "learning_rate": 2.4333333333333336e-05, + "loss": 0.3758, + "step": 7430 + }, + { + "epoch": 5.314285714285714, + "grad_norm": 1.8067562580108643, + "learning_rate": 2.4296296296296298e-05, + "loss": 0.2958, + "step": 7440 + }, + { + "epoch": 5.321428571428571, + "grad_norm": 1.5514296293258667, + "learning_rate": 2.425925925925926e-05, + "loss": 0.3268, + "step": 7450 + }, + { + "epoch": 5.328571428571428, + "grad_norm": 1.684311032295227, + "learning_rate": 2.4222222222222224e-05, + "loss": 0.2947, + "step": 7460 + }, + { + "epoch": 5.335714285714285, + "grad_norm": 2.0809545516967773, + "learning_rate": 2.4185185185185186e-05, + "loss": 0.2928, + "step": 7470 + }, + { + "epoch": 5.3428571428571425, + "grad_norm": 2.5362987518310547, + "learning_rate": 2.414814814814815e-05, + "loss": 0.1962, + "step": 7480 + }, + { + "epoch": 5.35, + "grad_norm": 0.636965274810791, + "learning_rate": 2.4111111111111113e-05, + "loss": 0.1694, + "step": 7490 + }, + { + "epoch": 5.357142857142857, + "grad_norm": 2.1662261486053467, + "learning_rate": 2.4074074074074074e-05, + "loss": 0.3111, + "step": 7500 + }, + { + "epoch": 5.364285714285714, + "grad_norm": 1.749324083328247, + "learning_rate": 2.403703703703704e-05, + "loss": 0.2521, + "step": 7510 + }, + { + "epoch": 5.371428571428572, + "grad_norm": 2.3572323322296143, + "learning_rate": 2.4e-05, + "loss": 0.1527, + "step": 7520 + }, + { + "epoch": 5.378571428571428, + "grad_norm": 1.274588942527771, + "learning_rate": 2.3962962962962966e-05, + "loss": 0.2757, + "step": 7530 + }, + { + "epoch": 5.385714285714286, + "grad_norm": 1.2197136878967285, + "learning_rate": 2.3925925925925927e-05, + "loss": 0.2288, + "step": 7540 + }, + { + "epoch": 5.392857142857143, + "grad_norm": 1.6061832904815674, + "learning_rate": 2.3888888888888892e-05, + "loss": 0.3292, + "step": 7550 + }, + { + "epoch": 5.4, + "grad_norm": 1.8271028995513916, + "learning_rate": 2.3851851851851854e-05, + "loss": 0.2392, + "step": 7560 + }, + { + "epoch": 5.4071428571428575, + "grad_norm": 1.8294018507003784, + "learning_rate": 2.3814814814814815e-05, + "loss": 0.2554, + "step": 7570 + }, + { + "epoch": 5.414285714285715, + "grad_norm": 1.253556728363037, + "learning_rate": 2.377777777777778e-05, + "loss": 0.2008, + "step": 7580 + }, + { + "epoch": 5.421428571428572, + "grad_norm": 1.1980758905410767, + "learning_rate": 2.3740740740740742e-05, + "loss": 0.265, + "step": 7590 + }, + { + "epoch": 5.428571428571429, + "grad_norm": 1.5337406396865845, + "learning_rate": 2.3703703703703707e-05, + "loss": 0.4126, + "step": 7600 + }, + { + "epoch": 5.435714285714286, + "grad_norm": 2.981381893157959, + "learning_rate": 2.3666666666666668e-05, + "loss": 0.3554, + "step": 7610 + }, + { + "epoch": 5.442857142857143, + "grad_norm": 1.927241325378418, + "learning_rate": 2.3629629629629633e-05, + "loss": 0.3148, + "step": 7620 + }, + { + "epoch": 5.45, + "grad_norm": 1.0788408517837524, + "learning_rate": 2.3592592592592595e-05, + "loss": 0.2421, + "step": 7630 + }, + { + "epoch": 5.457142857142857, + "grad_norm": 1.250436782836914, + "learning_rate": 2.3555555555555556e-05, + "loss": 0.2797, + "step": 7640 + }, + { + "epoch": 5.464285714285714, + "grad_norm": 1.2195000648498535, + "learning_rate": 2.351851851851852e-05, + "loss": 0.1702, + "step": 7650 + }, + { + "epoch": 5.4714285714285715, + "grad_norm": 1.773098349571228, + "learning_rate": 2.3481481481481483e-05, + "loss": 0.2383, + "step": 7660 + }, + { + "epoch": 5.478571428571429, + "grad_norm": 1.540499210357666, + "learning_rate": 2.3444444444444448e-05, + "loss": 0.2741, + "step": 7670 + }, + { + "epoch": 5.485714285714286, + "grad_norm": 1.3515613079071045, + "learning_rate": 2.340740740740741e-05, + "loss": 0.4365, + "step": 7680 + }, + { + "epoch": 5.492857142857143, + "grad_norm": 1.5094635486602783, + "learning_rate": 2.337037037037037e-05, + "loss": 0.2777, + "step": 7690 + }, + { + "epoch": 5.5, + "grad_norm": 1.123542070388794, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.3406, + "step": 7700 + }, + { + "epoch": 5.507142857142857, + "grad_norm": 1.0701942443847656, + "learning_rate": 2.3296296296296297e-05, + "loss": 0.2499, + "step": 7710 + }, + { + "epoch": 5.514285714285714, + "grad_norm": 1.270992636680603, + "learning_rate": 2.3259259259259262e-05, + "loss": 0.2044, + "step": 7720 + }, + { + "epoch": 5.521428571428571, + "grad_norm": 1.5586347579956055, + "learning_rate": 2.3222222222222224e-05, + "loss": 0.2573, + "step": 7730 + }, + { + "epoch": 5.5285714285714285, + "grad_norm": 0.9162809252738953, + "learning_rate": 2.318518518518519e-05, + "loss": 0.2245, + "step": 7740 + }, + { + "epoch": 5.535714285714286, + "grad_norm": 1.7767843008041382, + "learning_rate": 2.314814814814815e-05, + "loss": 0.276, + "step": 7750 + }, + { + "epoch": 5.542857142857143, + "grad_norm": 2.538541316986084, + "learning_rate": 2.3111111111111112e-05, + "loss": 0.3448, + "step": 7760 + }, + { + "epoch": 5.55, + "grad_norm": 1.5738705396652222, + "learning_rate": 2.3074074074074077e-05, + "loss": 0.3023, + "step": 7770 + }, + { + "epoch": 5.557142857142857, + "grad_norm": 0.9919751286506653, + "learning_rate": 2.303703703703704e-05, + "loss": 0.2979, + "step": 7780 + }, + { + "epoch": 5.564285714285714, + "grad_norm": 1.079817771911621, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.3141, + "step": 7790 + }, + { + "epoch": 5.571428571428571, + "grad_norm": 1.709007978439331, + "learning_rate": 2.2962962962962965e-05, + "loss": 0.2452, + "step": 7800 + }, + { + "epoch": 5.578571428571428, + "grad_norm": 0.9877552390098572, + "learning_rate": 2.2925925925925927e-05, + "loss": 0.2796, + "step": 7810 + }, + { + "epoch": 5.585714285714285, + "grad_norm": 1.9676953554153442, + "learning_rate": 2.288888888888889e-05, + "loss": 0.2314, + "step": 7820 + }, + { + "epoch": 5.5928571428571425, + "grad_norm": 1.778275966644287, + "learning_rate": 2.2851851851851853e-05, + "loss": 0.3033, + "step": 7830 + }, + { + "epoch": 5.6, + "grad_norm": 0.9746494889259338, + "learning_rate": 2.2814814814814818e-05, + "loss": 0.2459, + "step": 7840 + }, + { + "epoch": 5.607142857142857, + "grad_norm": 2.0238702297210693, + "learning_rate": 2.277777777777778e-05, + "loss": 0.2494, + "step": 7850 + }, + { + "epoch": 5.614285714285714, + "grad_norm": 1.2345530986785889, + "learning_rate": 2.2740740740740744e-05, + "loss": 0.4614, + "step": 7860 + }, + { + "epoch": 5.621428571428572, + "grad_norm": 0.9835256338119507, + "learning_rate": 2.2703703703703706e-05, + "loss": 0.3519, + "step": 7870 + }, + { + "epoch": 5.628571428571428, + "grad_norm": 1.9753897190093994, + "learning_rate": 2.2666666666666668e-05, + "loss": 0.2895, + "step": 7880 + }, + { + "epoch": 5.635714285714286, + "grad_norm": 1.7247217893600464, + "learning_rate": 2.2629629629629633e-05, + "loss": 0.1994, + "step": 7890 + }, + { + "epoch": 5.642857142857143, + "grad_norm": 1.8406201601028442, + "learning_rate": 2.2592592592592594e-05, + "loss": 0.1872, + "step": 7900 + }, + { + "epoch": 5.65, + "grad_norm": 1.4785393476486206, + "learning_rate": 2.255555555555556e-05, + "loss": 0.2811, + "step": 7910 + }, + { + "epoch": 5.6571428571428575, + "grad_norm": 2.23652982711792, + "learning_rate": 2.251851851851852e-05, + "loss": 0.3071, + "step": 7920 + }, + { + "epoch": 5.664285714285715, + "grad_norm": 1.9096837043762207, + "learning_rate": 2.2481481481481486e-05, + "loss": 0.2115, + "step": 7930 + }, + { + "epoch": 5.671428571428572, + "grad_norm": 2.0808775424957275, + "learning_rate": 2.2444444444444447e-05, + "loss": 0.3923, + "step": 7940 + }, + { + "epoch": 5.678571428571429, + "grad_norm": 1.5935535430908203, + "learning_rate": 2.240740740740741e-05, + "loss": 0.3461, + "step": 7950 + }, + { + "epoch": 5.685714285714286, + "grad_norm": 1.1959024667739868, + "learning_rate": 2.2370370370370374e-05, + "loss": 0.2016, + "step": 7960 + }, + { + "epoch": 5.692857142857143, + "grad_norm": 1.0776904821395874, + "learning_rate": 2.2333333333333335e-05, + "loss": 0.3476, + "step": 7970 + }, + { + "epoch": 5.7, + "grad_norm": 1.884531855583191, + "learning_rate": 2.2296296296296297e-05, + "loss": 0.2861, + "step": 7980 + }, + { + "epoch": 5.707142857142857, + "grad_norm": 1.2476330995559692, + "learning_rate": 2.2259259259259262e-05, + "loss": 0.2152, + "step": 7990 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 2.106348752975464, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.3022, + "step": 8000 + }, + { + "epoch": 5.714285714285714, + "eval_loss": 0.3492221236228943, + "eval_rouge1": 0.9062, + "eval_rouge2": 0.8456, + "eval_rougeL": 0.9033, + "eval_runtime": 122.1433, + "eval_samples_per_second": 11.462, + "eval_steps_per_second": 5.731, + "step": 8000 + }, + { + "epoch": 5.7214285714285715, + "grad_norm": 1.9454623460769653, + "learning_rate": 2.2185185185185188e-05, + "loss": 0.232, + "step": 8010 + }, + { + "epoch": 5.728571428571429, + "grad_norm": 1.9178905487060547, + "learning_rate": 2.214814814814815e-05, + "loss": 0.2278, + "step": 8020 + }, + { + "epoch": 5.735714285714286, + "grad_norm": 1.6279345750808716, + "learning_rate": 2.211111111111111e-05, + "loss": 0.2423, + "step": 8030 + }, + { + "epoch": 5.742857142857143, + "grad_norm": 2.7422447204589844, + "learning_rate": 2.2074074074074076e-05, + "loss": 0.3129, + "step": 8040 + }, + { + "epoch": 5.75, + "grad_norm": 1.7606775760650635, + "learning_rate": 2.2037037037037038e-05, + "loss": 0.217, + "step": 8050 + }, + { + "epoch": 5.757142857142857, + "grad_norm": 2.970276355743408, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3246, + "step": 8060 + }, + { + "epoch": 5.764285714285714, + "grad_norm": 1.6729111671447754, + "learning_rate": 2.1962962962962964e-05, + "loss": 0.224, + "step": 8070 + }, + { + "epoch": 5.771428571428571, + "grad_norm": 2.103708267211914, + "learning_rate": 2.1925925925925926e-05, + "loss": 0.2256, + "step": 8080 + }, + { + "epoch": 5.7785714285714285, + "grad_norm": 1.7059235572814941, + "learning_rate": 2.188888888888889e-05, + "loss": 0.2986, + "step": 8090 + }, + { + "epoch": 5.785714285714286, + "grad_norm": 1.6239415407180786, + "learning_rate": 2.1851851851851852e-05, + "loss": 0.3007, + "step": 8100 + }, + { + "epoch": 5.792857142857143, + "grad_norm": 1.5316799879074097, + "learning_rate": 2.1814814814814817e-05, + "loss": 0.2295, + "step": 8110 + }, + { + "epoch": 5.8, + "grad_norm": 0.9283231496810913, + "learning_rate": 2.177777777777778e-05, + "loss": 0.1624, + "step": 8120 + }, + { + "epoch": 5.807142857142857, + "grad_norm": 1.4230540990829468, + "learning_rate": 2.174074074074074e-05, + "loss": 0.1686, + "step": 8130 + }, + { + "epoch": 5.814285714285714, + "grad_norm": 1.8694360256195068, + "learning_rate": 2.1703703703703705e-05, + "loss": 0.3416, + "step": 8140 + }, + { + "epoch": 5.821428571428571, + "grad_norm": 2.144221782684326, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.2471, + "step": 8150 + }, + { + "epoch": 5.828571428571428, + "grad_norm": 2.5672965049743652, + "learning_rate": 2.162962962962963e-05, + "loss": 0.3354, + "step": 8160 + }, + { + "epoch": 5.835714285714285, + "grad_norm": 1.093578577041626, + "learning_rate": 2.1592592592592594e-05, + "loss": 0.262, + "step": 8170 + }, + { + "epoch": 5.8428571428571425, + "grad_norm": 0.7076272368431091, + "learning_rate": 2.1555555555555555e-05, + "loss": 0.254, + "step": 8180 + }, + { + "epoch": 5.85, + "grad_norm": 2.2301125526428223, + "learning_rate": 2.151851851851852e-05, + "loss": 0.1906, + "step": 8190 + }, + { + "epoch": 5.857142857142857, + "grad_norm": 1.704037070274353, + "learning_rate": 2.148148148148148e-05, + "loss": 0.2802, + "step": 8200 + }, + { + "epoch": 5.864285714285714, + "grad_norm": 1.4877769947052002, + "learning_rate": 2.1444444444444443e-05, + "loss": 0.3327, + "step": 8210 + }, + { + "epoch": 5.871428571428572, + "grad_norm": 1.436059594154358, + "learning_rate": 2.1407407407407408e-05, + "loss": 0.271, + "step": 8220 + }, + { + "epoch": 5.878571428571428, + "grad_norm": 1.357176661491394, + "learning_rate": 2.137037037037037e-05, + "loss": 0.2481, + "step": 8230 + }, + { + "epoch": 5.885714285714286, + "grad_norm": 1.846593976020813, + "learning_rate": 2.1333333333333335e-05, + "loss": 0.2641, + "step": 8240 + }, + { + "epoch": 5.892857142857143, + "grad_norm": 2.4631927013397217, + "learning_rate": 2.1296296296296296e-05, + "loss": 0.2832, + "step": 8250 + }, + { + "epoch": 5.9, + "grad_norm": 1.8715349435806274, + "learning_rate": 2.1259259259259258e-05, + "loss": 0.4157, + "step": 8260 + }, + { + "epoch": 5.9071428571428575, + "grad_norm": 2.3173437118530273, + "learning_rate": 2.1222222222222223e-05, + "loss": 0.353, + "step": 8270 + }, + { + "epoch": 5.914285714285715, + "grad_norm": 2.049422025680542, + "learning_rate": 2.1185185185185184e-05, + "loss": 0.2613, + "step": 8280 + }, + { + "epoch": 5.921428571428572, + "grad_norm": 1.281841516494751, + "learning_rate": 2.114814814814815e-05, + "loss": 0.2287, + "step": 8290 + }, + { + "epoch": 5.928571428571429, + "grad_norm": 1.007407546043396, + "learning_rate": 2.111111111111111e-05, + "loss": 0.2139, + "step": 8300 + }, + { + "epoch": 5.935714285714286, + "grad_norm": 1.8036701679229736, + "learning_rate": 2.1074074074074072e-05, + "loss": 0.2511, + "step": 8310 + }, + { + "epoch": 5.942857142857143, + "grad_norm": 0.9559861421585083, + "learning_rate": 2.1037037037037037e-05, + "loss": 0.3371, + "step": 8320 + }, + { + "epoch": 5.95, + "grad_norm": 2.136070489883423, + "learning_rate": 2.1e-05, + "loss": 0.2321, + "step": 8330 + }, + { + "epoch": 5.957142857142857, + "grad_norm": 1.2442055940628052, + "learning_rate": 2.0962962962962964e-05, + "loss": 0.1819, + "step": 8340 + }, + { + "epoch": 5.964285714285714, + "grad_norm": 2.0479979515075684, + "learning_rate": 2.0925925925925925e-05, + "loss": 0.3796, + "step": 8350 + }, + { + "epoch": 5.9714285714285715, + "grad_norm": 1.6974670886993408, + "learning_rate": 2.088888888888889e-05, + "loss": 0.1947, + "step": 8360 + }, + { + "epoch": 5.978571428571429, + "grad_norm": 2.1099231243133545, + "learning_rate": 2.0851851851851852e-05, + "loss": 0.1847, + "step": 8370 + }, + { + "epoch": 5.985714285714286, + "grad_norm": 1.9181057214736938, + "learning_rate": 2.0814814814814813e-05, + "loss": 0.3513, + "step": 8380 + }, + { + "epoch": 5.992857142857143, + "grad_norm": 1.0576838254928589, + "learning_rate": 2.077777777777778e-05, + "loss": 0.2663, + "step": 8390 + }, + { + "epoch": 6.0, + "grad_norm": 1.1283502578735352, + "learning_rate": 2.074074074074074e-05, + "loss": 0.2872, + "step": 8400 + }, + { + "epoch": 6.007142857142857, + "grad_norm": 0.7001394629478455, + "learning_rate": 2.0703703703703705e-05, + "loss": 0.277, + "step": 8410 + }, + { + "epoch": 6.014285714285714, + "grad_norm": 1.6374051570892334, + "learning_rate": 2.0666666666666666e-05, + "loss": 0.1849, + "step": 8420 + }, + { + "epoch": 6.021428571428571, + "grad_norm": 1.674914836883545, + "learning_rate": 2.0629629629629628e-05, + "loss": 0.1756, + "step": 8430 + }, + { + "epoch": 6.0285714285714285, + "grad_norm": 2.592038154602051, + "learning_rate": 2.0592592592592593e-05, + "loss": 0.3725, + "step": 8440 + }, + { + "epoch": 6.035714285714286, + "grad_norm": 2.942992925643921, + "learning_rate": 2.0555555555555555e-05, + "loss": 0.2529, + "step": 8450 + }, + { + "epoch": 6.042857142857143, + "grad_norm": 1.7580475807189941, + "learning_rate": 2.051851851851852e-05, + "loss": 0.1549, + "step": 8460 + }, + { + "epoch": 6.05, + "grad_norm": 1.9032413959503174, + "learning_rate": 2.048148148148148e-05, + "loss": 0.2529, + "step": 8470 + }, + { + "epoch": 6.057142857142857, + "grad_norm": 1.7678323984146118, + "learning_rate": 2.0444444444444446e-05, + "loss": 0.1935, + "step": 8480 + }, + { + "epoch": 6.064285714285714, + "grad_norm": 1.7014952898025513, + "learning_rate": 2.0407407407407408e-05, + "loss": 0.1965, + "step": 8490 + }, + { + "epoch": 6.071428571428571, + "grad_norm": 2.053157091140747, + "learning_rate": 2.037037037037037e-05, + "loss": 0.2045, + "step": 8500 + }, + { + "epoch": 6.078571428571428, + "grad_norm": 2.448059320449829, + "learning_rate": 2.0333333333333334e-05, + "loss": 0.2275, + "step": 8510 + }, + { + "epoch": 6.085714285714285, + "grad_norm": 1.3505144119262695, + "learning_rate": 2.0296296296296296e-05, + "loss": 0.192, + "step": 8520 + }, + { + "epoch": 6.0928571428571425, + "grad_norm": 1.0717148780822754, + "learning_rate": 2.025925925925926e-05, + "loss": 0.3017, + "step": 8530 + }, + { + "epoch": 6.1, + "grad_norm": 2.872880220413208, + "learning_rate": 2.0222222222222222e-05, + "loss": 0.2583, + "step": 8540 + }, + { + "epoch": 6.107142857142857, + "grad_norm": 1.559588074684143, + "learning_rate": 2.0185185185185187e-05, + "loss": 0.1557, + "step": 8550 + }, + { + "epoch": 6.114285714285714, + "grad_norm": 1.4375160932540894, + "learning_rate": 2.014814814814815e-05, + "loss": 0.1165, + "step": 8560 + }, + { + "epoch": 6.121428571428571, + "grad_norm": 1.1922268867492676, + "learning_rate": 2.011111111111111e-05, + "loss": 0.1995, + "step": 8570 + }, + { + "epoch": 6.128571428571428, + "grad_norm": 2.267056465148926, + "learning_rate": 2.0074074074074075e-05, + "loss": 0.2176, + "step": 8580 + }, + { + "epoch": 6.135714285714286, + "grad_norm": 1.5485496520996094, + "learning_rate": 2.0037037037037037e-05, + "loss": 0.206, + "step": 8590 + }, + { + "epoch": 6.142857142857143, + "grad_norm": 1.9538283348083496, + "learning_rate": 2e-05, + "loss": 0.3173, + "step": 8600 + }, + { + "epoch": 6.15, + "grad_norm": 2.8216044902801514, + "learning_rate": 1.9962962962962963e-05, + "loss": 0.3077, + "step": 8610 + }, + { + "epoch": 6.1571428571428575, + "grad_norm": 2.5293240547180176, + "learning_rate": 1.9925925925925925e-05, + "loss": 0.2829, + "step": 8620 + }, + { + "epoch": 6.164285714285715, + "grad_norm": 1.7947183847427368, + "learning_rate": 1.988888888888889e-05, + "loss": 0.3212, + "step": 8630 + }, + { + "epoch": 6.171428571428572, + "grad_norm": 1.541588544845581, + "learning_rate": 1.985185185185185e-05, + "loss": 0.1985, + "step": 8640 + }, + { + "epoch": 6.178571428571429, + "grad_norm": 1.286007046699524, + "learning_rate": 1.9814814814814816e-05, + "loss": 0.279, + "step": 8650 + }, + { + "epoch": 6.185714285714286, + "grad_norm": 1.8692234754562378, + "learning_rate": 1.9777777777777778e-05, + "loss": 0.303, + "step": 8660 + }, + { + "epoch": 6.192857142857143, + "grad_norm": 1.7906513214111328, + "learning_rate": 1.9740740740740743e-05, + "loss": 0.218, + "step": 8670 + }, + { + "epoch": 6.2, + "grad_norm": 2.0737709999084473, + "learning_rate": 1.9703703703703704e-05, + "loss": 0.1559, + "step": 8680 + }, + { + "epoch": 6.207142857142857, + "grad_norm": 1.8082749843597412, + "learning_rate": 1.9666666666666666e-05, + "loss": 0.2713, + "step": 8690 + }, + { + "epoch": 6.214285714285714, + "grad_norm": 1.8988617658615112, + "learning_rate": 1.962962962962963e-05, + "loss": 0.2362, + "step": 8700 + }, + { + "epoch": 6.2214285714285715, + "grad_norm": 0.8727281093597412, + "learning_rate": 1.9592592592592592e-05, + "loss": 0.2571, + "step": 8710 + }, + { + "epoch": 6.228571428571429, + "grad_norm": 1.0203776359558105, + "learning_rate": 1.9555555555555557e-05, + "loss": 0.2884, + "step": 8720 + }, + { + "epoch": 6.235714285714286, + "grad_norm": 1.5776811838150024, + "learning_rate": 1.951851851851852e-05, + "loss": 0.3115, + "step": 8730 + }, + { + "epoch": 6.242857142857143, + "grad_norm": 2.1000545024871826, + "learning_rate": 1.948148148148148e-05, + "loss": 0.2936, + "step": 8740 + }, + { + "epoch": 6.25, + "grad_norm": 1.991640329360962, + "learning_rate": 1.9444444444444445e-05, + "loss": 0.2214, + "step": 8750 + }, + { + "epoch": 6.257142857142857, + "grad_norm": 3.8238637447357178, + "learning_rate": 1.9407407407407407e-05, + "loss": 0.2738, + "step": 8760 + }, + { + "epoch": 6.264285714285714, + "grad_norm": 2.818711042404175, + "learning_rate": 1.9370370370370372e-05, + "loss": 0.3107, + "step": 8770 + }, + { + "epoch": 6.271428571428571, + "grad_norm": 1.4565989971160889, + "learning_rate": 1.9333333333333333e-05, + "loss": 0.1351, + "step": 8780 + }, + { + "epoch": 6.2785714285714285, + "grad_norm": 1.6833415031433105, + "learning_rate": 1.92962962962963e-05, + "loss": 0.3359, + "step": 8790 + }, + { + "epoch": 6.285714285714286, + "grad_norm": 3.662572145462036, + "learning_rate": 1.925925925925926e-05, + "loss": 0.2338, + "step": 8800 + }, + { + "epoch": 6.292857142857143, + "grad_norm": 1.9166165590286255, + "learning_rate": 1.922222222222222e-05, + "loss": 0.1908, + "step": 8810 + }, + { + "epoch": 6.3, + "grad_norm": 2.665553331375122, + "learning_rate": 1.9185185185185186e-05, + "loss": 0.246, + "step": 8820 + }, + { + "epoch": 6.307142857142857, + "grad_norm": 1.601194143295288, + "learning_rate": 1.9148148148148148e-05, + "loss": 0.2392, + "step": 8830 + }, + { + "epoch": 6.314285714285714, + "grad_norm": 1.7382382154464722, + "learning_rate": 1.9111111111111113e-05, + "loss": 0.2919, + "step": 8840 + }, + { + "epoch": 6.321428571428571, + "grad_norm": 1.0822237730026245, + "learning_rate": 1.9074074074074075e-05, + "loss": 0.1179, + "step": 8850 + }, + { + "epoch": 6.328571428571428, + "grad_norm": 1.9691376686096191, + "learning_rate": 1.903703703703704e-05, + "loss": 0.3934, + "step": 8860 + }, + { + "epoch": 6.335714285714285, + "grad_norm": 0.8395001292228699, + "learning_rate": 1.9e-05, + "loss": 0.2004, + "step": 8870 + }, + { + "epoch": 6.3428571428571425, + "grad_norm": 1.6967720985412598, + "learning_rate": 1.8962962962962963e-05, + "loss": 0.204, + "step": 8880 + }, + { + "epoch": 6.35, + "grad_norm": 1.2601035833358765, + "learning_rate": 1.8925925925925928e-05, + "loss": 0.2769, + "step": 8890 + }, + { + "epoch": 6.357142857142857, + "grad_norm": 1.560940146446228, + "learning_rate": 1.888888888888889e-05, + "loss": 0.1409, + "step": 8900 + }, + { + "epoch": 6.364285714285714, + "grad_norm": 1.645814061164856, + "learning_rate": 1.8851851851851854e-05, + "loss": 0.1914, + "step": 8910 + }, + { + "epoch": 6.371428571428572, + "grad_norm": 1.4886109828948975, + "learning_rate": 1.8814814814814816e-05, + "loss": 0.3517, + "step": 8920 + }, + { + "epoch": 6.378571428571428, + "grad_norm": 1.2002378702163696, + "learning_rate": 1.8777777777777777e-05, + "loss": 0.2346, + "step": 8930 + }, + { + "epoch": 6.385714285714286, + "grad_norm": 2.4492478370666504, + "learning_rate": 1.8740740740740742e-05, + "loss": 0.2104, + "step": 8940 + }, + { + "epoch": 6.392857142857143, + "grad_norm": 2.315610408782959, + "learning_rate": 1.8703703703703704e-05, + "loss": 0.2321, + "step": 8950 + }, + { + "epoch": 6.4, + "grad_norm": 2.116260528564453, + "learning_rate": 1.866666666666667e-05, + "loss": 0.2092, + "step": 8960 + }, + { + "epoch": 6.4071428571428575, + "grad_norm": 1.7362505197525024, + "learning_rate": 1.862962962962963e-05, + "loss": 0.2598, + "step": 8970 + }, + { + "epoch": 6.414285714285715, + "grad_norm": 2.1754469871520996, + "learning_rate": 1.8592592592592595e-05, + "loss": 0.3035, + "step": 8980 + }, + { + "epoch": 6.421428571428572, + "grad_norm": 1.448285698890686, + "learning_rate": 1.8555555555555557e-05, + "loss": 0.227, + "step": 8990 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 1.888242483139038, + "learning_rate": 1.8518518518518518e-05, + "loss": 0.1618, + "step": 9000 + }, + { + "epoch": 6.428571428571429, + "eval_loss": 0.3434120714664459, + "eval_rouge1": 0.908, + "eval_rouge2": 0.8486, + "eval_rougeL": 0.9052, + "eval_runtime": 122.2937, + "eval_samples_per_second": 11.448, + "eval_steps_per_second": 5.724, + "step": 9000 + }, + { + "epoch": 6.435714285714286, + "grad_norm": 2.5552051067352295, + "learning_rate": 1.8481481481481483e-05, + "loss": 0.4376, + "step": 9010 + }, + { + "epoch": 6.442857142857143, + "grad_norm": 2.0973517894744873, + "learning_rate": 1.8444444444444445e-05, + "loss": 0.2163, + "step": 9020 + }, + { + "epoch": 6.45, + "grad_norm": 1.3774244785308838, + "learning_rate": 1.840740740740741e-05, + "loss": 0.14, + "step": 9030 + }, + { + "epoch": 6.457142857142857, + "grad_norm": 0.8735131025314331, + "learning_rate": 1.837037037037037e-05, + "loss": 0.1848, + "step": 9040 + }, + { + "epoch": 6.464285714285714, + "grad_norm": 1.5088914632797241, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.2889, + "step": 9050 + }, + { + "epoch": 6.4714285714285715, + "grad_norm": 1.0688769817352295, + "learning_rate": 1.8296296296296298e-05, + "loss": 0.1895, + "step": 9060 + }, + { + "epoch": 6.478571428571429, + "grad_norm": 1.43760085105896, + "learning_rate": 1.825925925925926e-05, + "loss": 0.2997, + "step": 9070 + }, + { + "epoch": 6.485714285714286, + "grad_norm": 1.1168969869613647, + "learning_rate": 1.8222222222222224e-05, + "loss": 0.3544, + "step": 9080 + }, + { + "epoch": 6.492857142857143, + "grad_norm": 1.7139670848846436, + "learning_rate": 1.8185185185185186e-05, + "loss": 0.2108, + "step": 9090 + }, + { + "epoch": 6.5, + "grad_norm": 1.2584503889083862, + "learning_rate": 1.814814814814815e-05, + "loss": 0.2791, + "step": 9100 + }, + { + "epoch": 6.507142857142857, + "grad_norm": 1.4440019130706787, + "learning_rate": 1.8111111111111112e-05, + "loss": 0.3745, + "step": 9110 + }, + { + "epoch": 6.514285714285714, + "grad_norm": 2.3828232288360596, + "learning_rate": 1.8074074074074074e-05, + "loss": 0.2159, + "step": 9120 + }, + { + "epoch": 6.521428571428571, + "grad_norm": 2.6553053855895996, + "learning_rate": 1.803703703703704e-05, + "loss": 0.3051, + "step": 9130 + }, + { + "epoch": 6.5285714285714285, + "grad_norm": 2.0669426918029785, + "learning_rate": 1.8e-05, + "loss": 0.2166, + "step": 9140 + }, + { + "epoch": 6.535714285714286, + "grad_norm": 1.4676064252853394, + "learning_rate": 1.7962962962962965e-05, + "loss": 0.2393, + "step": 9150 + }, + { + "epoch": 6.542857142857143, + "grad_norm": 1.5158963203430176, + "learning_rate": 1.7925925925925927e-05, + "loss": 0.2821, + "step": 9160 + }, + { + "epoch": 6.55, + "grad_norm": 1.438550591468811, + "learning_rate": 1.788888888888889e-05, + "loss": 0.229, + "step": 9170 + }, + { + "epoch": 6.557142857142857, + "grad_norm": 2.2161788940429688, + "learning_rate": 1.7851851851851853e-05, + "loss": 0.3705, + "step": 9180 + }, + { + "epoch": 6.564285714285714, + "grad_norm": 1.472321629524231, + "learning_rate": 1.7814814814814815e-05, + "loss": 0.3977, + "step": 9190 + }, + { + "epoch": 6.571428571428571, + "grad_norm": 1.957033395767212, + "learning_rate": 1.777777777777778e-05, + "loss": 0.2431, + "step": 9200 + }, + { + "epoch": 6.578571428571428, + "grad_norm": 3.070905923843384, + "learning_rate": 1.774074074074074e-05, + "loss": 0.2676, + "step": 9210 + }, + { + "epoch": 6.585714285714285, + "grad_norm": 2.240701198577881, + "learning_rate": 1.7703703703703706e-05, + "loss": 0.2346, + "step": 9220 + }, + { + "epoch": 6.5928571428571425, + "grad_norm": 1.2726478576660156, + "learning_rate": 1.7666666666666668e-05, + "loss": 0.2624, + "step": 9230 + }, + { + "epoch": 6.6, + "grad_norm": 2.543856382369995, + "learning_rate": 1.762962962962963e-05, + "loss": 0.3137, + "step": 9240 + }, + { + "epoch": 6.607142857142857, + "grad_norm": 2.1688966751098633, + "learning_rate": 1.7592592592592595e-05, + "loss": 0.3366, + "step": 9250 + }, + { + "epoch": 6.614285714285714, + "grad_norm": 1.9013522863388062, + "learning_rate": 1.7555555555555556e-05, + "loss": 0.1759, + "step": 9260 + }, + { + "epoch": 6.621428571428572, + "grad_norm": 2.7567338943481445, + "learning_rate": 1.751851851851852e-05, + "loss": 0.2615, + "step": 9270 + }, + { + "epoch": 6.628571428571428, + "grad_norm": 2.530351161956787, + "learning_rate": 1.7481481481481483e-05, + "loss": 0.363, + "step": 9280 + }, + { + "epoch": 6.635714285714286, + "grad_norm": 3.0051562786102295, + "learning_rate": 1.7444444444444448e-05, + "loss": 0.2155, + "step": 9290 + }, + { + "epoch": 6.642857142857143, + "grad_norm": 1.6199374198913574, + "learning_rate": 1.740740740740741e-05, + "loss": 0.1943, + "step": 9300 + }, + { + "epoch": 6.65, + "grad_norm": 2.2254199981689453, + "learning_rate": 1.737037037037037e-05, + "loss": 0.2086, + "step": 9310 + }, + { + "epoch": 6.6571428571428575, + "grad_norm": 1.4565106630325317, + "learning_rate": 1.7333333333333336e-05, + "loss": 0.2113, + "step": 9320 + }, + { + "epoch": 6.664285714285715, + "grad_norm": 1.8667312860488892, + "learning_rate": 1.7296296296296297e-05, + "loss": 0.1719, + "step": 9330 + }, + { + "epoch": 6.671428571428572, + "grad_norm": 2.0462963581085205, + "learning_rate": 1.7259259259259262e-05, + "loss": 0.2307, + "step": 9340 + }, + { + "epoch": 6.678571428571429, + "grad_norm": 1.5114613771438599, + "learning_rate": 1.7222222222222224e-05, + "loss": 0.2629, + "step": 9350 + }, + { + "epoch": 6.685714285714286, + "grad_norm": 1.8743935823440552, + "learning_rate": 1.7185185185185185e-05, + "loss": 0.2656, + "step": 9360 + }, + { + "epoch": 6.692857142857143, + "grad_norm": 1.6508034467697144, + "learning_rate": 1.714814814814815e-05, + "loss": 0.2971, + "step": 9370 + }, + { + "epoch": 6.7, + "grad_norm": 1.4109563827514648, + "learning_rate": 1.7111111111111112e-05, + "loss": 0.3155, + "step": 9380 + }, + { + "epoch": 6.707142857142857, + "grad_norm": 1.9742975234985352, + "learning_rate": 1.7074074074074077e-05, + "loss": 0.2858, + "step": 9390 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.8593278527259827, + "learning_rate": 1.7037037037037038e-05, + "loss": 0.2484, + "step": 9400 + }, + { + "epoch": 6.7214285714285715, + "grad_norm": 1.8331007957458496, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.2763, + "step": 9410 + }, + { + "epoch": 6.728571428571429, + "grad_norm": 2.0606274604797363, + "learning_rate": 1.6962962962962965e-05, + "loss": 0.2016, + "step": 9420 + }, + { + "epoch": 6.735714285714286, + "grad_norm": 1.244935154914856, + "learning_rate": 1.6925925925925926e-05, + "loss": 0.2161, + "step": 9430 + }, + { + "epoch": 6.742857142857143, + "grad_norm": 2.0855889320373535, + "learning_rate": 1.688888888888889e-05, + "loss": 0.1961, + "step": 9440 + }, + { + "epoch": 6.75, + "grad_norm": 2.203310012817383, + "learning_rate": 1.6851851851851853e-05, + "loss": 0.1886, + "step": 9450 + }, + { + "epoch": 6.757142857142857, + "grad_norm": 2.1254501342773438, + "learning_rate": 1.6814814814814818e-05, + "loss": 0.2824, + "step": 9460 + }, + { + "epoch": 6.764285714285714, + "grad_norm": 1.498728632926941, + "learning_rate": 1.677777777777778e-05, + "loss": 0.2848, + "step": 9470 + }, + { + "epoch": 6.771428571428571, + "grad_norm": 2.6205763816833496, + "learning_rate": 1.674074074074074e-05, + "loss": 0.2728, + "step": 9480 + }, + { + "epoch": 6.7785714285714285, + "grad_norm": 1.6262216567993164, + "learning_rate": 1.6703703703703706e-05, + "loss": 0.4216, + "step": 9490 + }, + { + "epoch": 6.785714285714286, + "grad_norm": 3.074489116668701, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.2084, + "step": 9500 + }, + { + "epoch": 6.792857142857143, + "grad_norm": 1.8158230781555176, + "learning_rate": 1.6629629629629632e-05, + "loss": 0.1794, + "step": 9510 + }, + { + "epoch": 6.8, + "grad_norm": 2.069397449493408, + "learning_rate": 1.6592592592592594e-05, + "loss": 0.2363, + "step": 9520 + }, + { + "epoch": 6.807142857142857, + "grad_norm": 1.8637501001358032, + "learning_rate": 1.655555555555556e-05, + "loss": 0.2203, + "step": 9530 + }, + { + "epoch": 6.814285714285714, + "grad_norm": 2.043314218521118, + "learning_rate": 1.651851851851852e-05, + "loss": 0.2267, + "step": 9540 + }, + { + "epoch": 6.821428571428571, + "grad_norm": 2.8327081203460693, + "learning_rate": 1.6481481481481482e-05, + "loss": 0.2793, + "step": 9550 + }, + { + "epoch": 6.828571428571428, + "grad_norm": 2.3297407627105713, + "learning_rate": 1.6444444444444447e-05, + "loss": 0.2349, + "step": 9560 + }, + { + "epoch": 6.835714285714285, + "grad_norm": 0.9220748543739319, + "learning_rate": 1.640740740740741e-05, + "loss": 0.1966, + "step": 9570 + }, + { + "epoch": 6.8428571428571425, + "grad_norm": 1.5935183763504028, + "learning_rate": 1.6370370370370374e-05, + "loss": 0.3217, + "step": 9580 + }, + { + "epoch": 6.85, + "grad_norm": 0.9305605292320251, + "learning_rate": 1.6333333333333335e-05, + "loss": 0.1446, + "step": 9590 + }, + { + "epoch": 6.857142857142857, + "grad_norm": 2.0719094276428223, + "learning_rate": 1.62962962962963e-05, + "loss": 0.2195, + "step": 9600 + }, + { + "epoch": 6.864285714285714, + "grad_norm": 1.9230345487594604, + "learning_rate": 1.625925925925926e-05, + "loss": 0.2031, + "step": 9610 + }, + { + "epoch": 6.871428571428572, + "grad_norm": 1.7897018194198608, + "learning_rate": 1.6222222222222223e-05, + "loss": 0.1728, + "step": 9620 + }, + { + "epoch": 6.878571428571428, + "grad_norm": 2.4588770866394043, + "learning_rate": 1.6185185185185188e-05, + "loss": 0.3253, + "step": 9630 + }, + { + "epoch": 6.885714285714286, + "grad_norm": 1.2495237588882446, + "learning_rate": 1.614814814814815e-05, + "loss": 0.3539, + "step": 9640 + }, + { + "epoch": 6.892857142857143, + "grad_norm": 3.161078453063965, + "learning_rate": 1.6111111111111115e-05, + "loss": 0.3598, + "step": 9650 + }, + { + "epoch": 6.9, + "grad_norm": 1.9474009275436401, + "learning_rate": 1.6074074074074076e-05, + "loss": 0.1385, + "step": 9660 + }, + { + "epoch": 6.9071428571428575, + "grad_norm": 1.9687261581420898, + "learning_rate": 1.6037037037037038e-05, + "loss": 0.2375, + "step": 9670 + }, + { + "epoch": 6.914285714285715, + "grad_norm": 1.87405264377594, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.3374, + "step": 9680 + }, + { + "epoch": 6.921428571428572, + "grad_norm": 1.1928725242614746, + "learning_rate": 1.5962962962962964e-05, + "loss": 0.197, + "step": 9690 + }, + { + "epoch": 6.928571428571429, + "grad_norm": 1.6434850692749023, + "learning_rate": 1.5925925925925926e-05, + "loss": 0.2757, + "step": 9700 + }, + { + "epoch": 6.935714285714286, + "grad_norm": 1.1469305753707886, + "learning_rate": 1.588888888888889e-05, + "loss": 0.1793, + "step": 9710 + }, + { + "epoch": 6.942857142857143, + "grad_norm": 2.436051368713379, + "learning_rate": 1.5851851851851852e-05, + "loss": 0.3178, + "step": 9720 + }, + { + "epoch": 6.95, + "grad_norm": 1.8960529565811157, + "learning_rate": 1.5814814814814817e-05, + "loss": 0.2182, + "step": 9730 + }, + { + "epoch": 6.957142857142857, + "grad_norm": 1.755922794342041, + "learning_rate": 1.577777777777778e-05, + "loss": 0.3239, + "step": 9740 + }, + { + "epoch": 6.964285714285714, + "grad_norm": 2.202697515487671, + "learning_rate": 1.574074074074074e-05, + "loss": 0.2116, + "step": 9750 + }, + { + "epoch": 6.9714285714285715, + "grad_norm": 1.4491599798202515, + "learning_rate": 1.5703703703703705e-05, + "loss": 0.2329, + "step": 9760 + }, + { + "epoch": 6.978571428571429, + "grad_norm": 9.212343215942383, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.2573, + "step": 9770 + }, + { + "epoch": 6.985714285714286, + "grad_norm": 1.2211856842041016, + "learning_rate": 1.5629629629629632e-05, + "loss": 0.2737, + "step": 9780 + }, + { + "epoch": 6.992857142857143, + "grad_norm": 1.59877347946167, + "learning_rate": 1.5592592592592593e-05, + "loss": 0.284, + "step": 9790 + }, + { + "epoch": 7.0, + "grad_norm": 2.295945882797241, + "learning_rate": 1.5555555555555555e-05, + "loss": 0.2076, + "step": 9800 + }, + { + "epoch": 7.007142857142857, + "grad_norm": 1.4388489723205566, + "learning_rate": 1.551851851851852e-05, + "loss": 0.2225, + "step": 9810 + }, + { + "epoch": 7.014285714285714, + "grad_norm": 1.9146931171417236, + "learning_rate": 1.548148148148148e-05, + "loss": 0.2917, + "step": 9820 + }, + { + "epoch": 7.021428571428571, + "grad_norm": 1.0212804079055786, + "learning_rate": 1.5444444444444446e-05, + "loss": 0.1537, + "step": 9830 + }, + { + "epoch": 7.0285714285714285, + "grad_norm": 2.146648645401001, + "learning_rate": 1.5407407407407408e-05, + "loss": 0.178, + "step": 9840 + }, + { + "epoch": 7.035714285714286, + "grad_norm": 2.4515628814697266, + "learning_rate": 1.537037037037037e-05, + "loss": 0.3043, + "step": 9850 + }, + { + "epoch": 7.042857142857143, + "grad_norm": 1.6906862258911133, + "learning_rate": 1.5333333333333334e-05, + "loss": 0.2787, + "step": 9860 + }, + { + "epoch": 7.05, + "grad_norm": 2.2019400596618652, + "learning_rate": 1.5296296296296296e-05, + "loss": 0.3236, + "step": 9870 + }, + { + "epoch": 7.057142857142857, + "grad_norm": 1.3307303190231323, + "learning_rate": 1.5259259259259258e-05, + "loss": 0.1875, + "step": 9880 + }, + { + "epoch": 7.064285714285714, + "grad_norm": 1.7358342409133911, + "learning_rate": 1.5222222222222224e-05, + "loss": 0.2149, + "step": 9890 + }, + { + "epoch": 7.071428571428571, + "grad_norm": 2.0298547744750977, + "learning_rate": 1.5185185185185186e-05, + "loss": 0.1876, + "step": 9900 + }, + { + "epoch": 7.078571428571428, + "grad_norm": 2.375779151916504, + "learning_rate": 1.514814814814815e-05, + "loss": 0.2289, + "step": 9910 + }, + { + "epoch": 7.085714285714285, + "grad_norm": 1.856911540031433, + "learning_rate": 1.5111111111111112e-05, + "loss": 0.2029, + "step": 9920 + }, + { + "epoch": 7.0928571428571425, + "grad_norm": 1.1523020267486572, + "learning_rate": 1.5074074074074074e-05, + "loss": 0.1753, + "step": 9930 + }, + { + "epoch": 7.1, + "grad_norm": 1.4677330255508423, + "learning_rate": 1.5037037037037039e-05, + "loss": 0.2256, + "step": 9940 + }, + { + "epoch": 7.107142857142857, + "grad_norm": 1.0742135047912598, + "learning_rate": 1.5e-05, + "loss": 0.3844, + "step": 9950 + }, + { + "epoch": 7.114285714285714, + "grad_norm": 1.4122258424758911, + "learning_rate": 1.4962962962962965e-05, + "loss": 0.1498, + "step": 9960 + }, + { + "epoch": 7.121428571428571, + "grad_norm": 1.9363057613372803, + "learning_rate": 1.4925925925925927e-05, + "loss": 0.2721, + "step": 9970 + }, + { + "epoch": 7.128571428571428, + "grad_norm": 0.7882018685340881, + "learning_rate": 1.4888888888888888e-05, + "loss": 0.2363, + "step": 9980 + }, + { + "epoch": 7.135714285714286, + "grad_norm": 1.8561784029006958, + "learning_rate": 1.4851851851851853e-05, + "loss": 0.2875, + "step": 9990 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 1.9597991704940796, + "learning_rate": 1.4814814814814815e-05, + "loss": 0.1984, + "step": 10000 + }, + { + "epoch": 7.142857142857143, + "eval_loss": 0.34164437651634216, + "eval_rouge1": 0.9089, + "eval_rouge2": 0.8501, + "eval_rougeL": 0.9063, + "eval_runtime": 122.2974, + "eval_samples_per_second": 11.448, + "eval_steps_per_second": 5.724, + "step": 10000 + }, + { + "epoch": 7.15, + "grad_norm": 2.1984336376190186, + "learning_rate": 1.477777777777778e-05, + "loss": 0.2409, + "step": 10010 + }, + { + "epoch": 7.1571428571428575, + "grad_norm": 1.1259089708328247, + "learning_rate": 1.4740740740740741e-05, + "loss": 0.1386, + "step": 10020 + }, + { + "epoch": 7.164285714285715, + "grad_norm": 2.6497113704681396, + "learning_rate": 1.4703703703703706e-05, + "loss": 0.2999, + "step": 10030 + }, + { + "epoch": 7.171428571428572, + "grad_norm": 2.7574968338012695, + "learning_rate": 1.4666666666666668e-05, + "loss": 0.2064, + "step": 10040 + }, + { + "epoch": 7.178571428571429, + "grad_norm": 2.4020519256591797, + "learning_rate": 1.462962962962963e-05, + "loss": 0.2539, + "step": 10050 + }, + { + "epoch": 7.185714285714286, + "grad_norm": 1.8728407621383667, + "learning_rate": 1.4592592592592594e-05, + "loss": 0.2264, + "step": 10060 + }, + { + "epoch": 7.192857142857143, + "grad_norm": 3.187389612197876, + "learning_rate": 1.4555555555555556e-05, + "loss": 0.2175, + "step": 10070 + }, + { + "epoch": 7.2, + "grad_norm": 1.6992945671081543, + "learning_rate": 1.4518518518518521e-05, + "loss": 0.1752, + "step": 10080 + }, + { + "epoch": 7.207142857142857, + "grad_norm": 1.2373261451721191, + "learning_rate": 1.4481481481481483e-05, + "loss": 0.1712, + "step": 10090 + }, + { + "epoch": 7.214285714285714, + "grad_norm": 1.3986244201660156, + "learning_rate": 1.4444444444444444e-05, + "loss": 0.1727, + "step": 10100 + }, + { + "epoch": 7.2214285714285715, + "grad_norm": 1.5018147230148315, + "learning_rate": 1.4407407407407409e-05, + "loss": 0.2309, + "step": 10110 + }, + { + "epoch": 7.228571428571429, + "grad_norm": 1.8186851739883423, + "learning_rate": 1.437037037037037e-05, + "loss": 0.2741, + "step": 10120 + }, + { + "epoch": 7.235714285714286, + "grad_norm": 2.8224360942840576, + "learning_rate": 1.4333333333333334e-05, + "loss": 0.2244, + "step": 10130 + }, + { + "epoch": 7.242857142857143, + "grad_norm": 1.900585412979126, + "learning_rate": 1.4296296296296297e-05, + "loss": 0.2527, + "step": 10140 + }, + { + "epoch": 7.25, + "grad_norm": 2.4210896492004395, + "learning_rate": 1.425925925925926e-05, + "loss": 0.3832, + "step": 10150 + }, + { + "epoch": 7.257142857142857, + "grad_norm": 1.2783209085464478, + "learning_rate": 1.4222222222222224e-05, + "loss": 0.1756, + "step": 10160 + }, + { + "epoch": 7.264285714285714, + "grad_norm": 1.4387212991714478, + "learning_rate": 1.4185185185185185e-05, + "loss": 0.2567, + "step": 10170 + }, + { + "epoch": 7.271428571428571, + "grad_norm": 2.861311435699463, + "learning_rate": 1.4148148148148148e-05, + "loss": 0.3683, + "step": 10180 + }, + { + "epoch": 7.2785714285714285, + "grad_norm": 0.8701191544532776, + "learning_rate": 1.4111111111111112e-05, + "loss": 0.2229, + "step": 10190 + }, + { + "epoch": 7.285714285714286, + "grad_norm": 2.103231430053711, + "learning_rate": 1.4074074074074075e-05, + "loss": 0.1652, + "step": 10200 + }, + { + "epoch": 7.292857142857143, + "grad_norm": 3.0958895683288574, + "learning_rate": 1.4037037037037038e-05, + "loss": 0.3048, + "step": 10210 + }, + { + "epoch": 7.3, + "grad_norm": 1.0370267629623413, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.243, + "step": 10220 + }, + { + "epoch": 7.307142857142857, + "grad_norm": 1.334799885749817, + "learning_rate": 1.3962962962962963e-05, + "loss": 0.2242, + "step": 10230 + }, + { + "epoch": 7.314285714285714, + "grad_norm": 1.797135829925537, + "learning_rate": 1.3925925925925926e-05, + "loss": 0.1916, + "step": 10240 + }, + { + "epoch": 7.321428571428571, + "grad_norm": 0.7588611841201782, + "learning_rate": 1.388888888888889e-05, + "loss": 0.2548, + "step": 10250 + }, + { + "epoch": 7.328571428571428, + "grad_norm": 1.9136377573013306, + "learning_rate": 1.3851851851851853e-05, + "loss": 0.2373, + "step": 10260 + }, + { + "epoch": 7.335714285714285, + "grad_norm": 1.326635479927063, + "learning_rate": 1.3814814814814816e-05, + "loss": 0.2597, + "step": 10270 + }, + { + "epoch": 7.3428571428571425, + "grad_norm": 2.400609016418457, + "learning_rate": 1.3777777777777778e-05, + "loss": 0.2648, + "step": 10280 + }, + { + "epoch": 7.35, + "grad_norm": 0.9898678064346313, + "learning_rate": 1.3740740740740741e-05, + "loss": 0.2304, + "step": 10290 + }, + { + "epoch": 7.357142857142857, + "grad_norm": 0.7826656103134155, + "learning_rate": 1.3703703703703704e-05, + "loss": 0.1209, + "step": 10300 + }, + { + "epoch": 7.364285714285714, + "grad_norm": 1.083044409751892, + "learning_rate": 1.3666666666666666e-05, + "loss": 0.2026, + "step": 10310 + }, + { + "epoch": 7.371428571428572, + "grad_norm": 1.283219814300537, + "learning_rate": 1.362962962962963e-05, + "loss": 0.2018, + "step": 10320 + }, + { + "epoch": 7.378571428571428, + "grad_norm": 1.9941823482513428, + "learning_rate": 1.3592592592592592e-05, + "loss": 0.2469, + "step": 10330 + }, + { + "epoch": 7.385714285714286, + "grad_norm": 2.553957462310791, + "learning_rate": 1.3555555555555557e-05, + "loss": 0.2661, + "step": 10340 + }, + { + "epoch": 7.392857142857143, + "grad_norm": 1.657182216644287, + "learning_rate": 1.3518518518518519e-05, + "loss": 0.2762, + "step": 10350 + }, + { + "epoch": 7.4, + "grad_norm": 1.6704496145248413, + "learning_rate": 1.348148148148148e-05, + "loss": 0.222, + "step": 10360 + }, + { + "epoch": 7.4071428571428575, + "grad_norm": 1.338329792022705, + "learning_rate": 1.3444444444444445e-05, + "loss": 0.2658, + "step": 10370 + }, + { + "epoch": 7.414285714285715, + "grad_norm": 1.9741250276565552, + "learning_rate": 1.3407407407407407e-05, + "loss": 0.2596, + "step": 10380 + }, + { + "epoch": 7.421428571428572, + "grad_norm": 2.523958444595337, + "learning_rate": 1.3370370370370372e-05, + "loss": 0.1553, + "step": 10390 + }, + { + "epoch": 7.428571428571429, + "grad_norm": 2.260690450668335, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.2942, + "step": 10400 + }, + { + "epoch": 7.435714285714286, + "grad_norm": 2.441620349884033, + "learning_rate": 1.3296296296296295e-05, + "loss": 0.2451, + "step": 10410 + }, + { + "epoch": 7.442857142857143, + "grad_norm": 0.7054124474525452, + "learning_rate": 1.325925925925926e-05, + "loss": 0.1862, + "step": 10420 + }, + { + "epoch": 7.45, + "grad_norm": 1.6281330585479736, + "learning_rate": 1.3222222222222221e-05, + "loss": 0.1714, + "step": 10430 + }, + { + "epoch": 7.457142857142857, + "grad_norm": 1.738685965538025, + "learning_rate": 1.3185185185185186e-05, + "loss": 0.2052, + "step": 10440 + }, + { + "epoch": 7.464285714285714, + "grad_norm": 1.9982494115829468, + "learning_rate": 1.3148148148148148e-05, + "loss": 0.2964, + "step": 10450 + }, + { + "epoch": 7.4714285714285715, + "grad_norm": 1.0081127882003784, + "learning_rate": 1.3111111111111113e-05, + "loss": 0.1956, + "step": 10460 + }, + { + "epoch": 7.478571428571429, + "grad_norm": 1.5927938222885132, + "learning_rate": 1.3074074074074074e-05, + "loss": 0.236, + "step": 10470 + }, + { + "epoch": 7.485714285714286, + "grad_norm": 1.7959505319595337, + "learning_rate": 1.3037037037037036e-05, + "loss": 0.2462, + "step": 10480 + }, + { + "epoch": 7.492857142857143, + "grad_norm": 0.9230768084526062, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.177, + "step": 10490 + }, + { + "epoch": 7.5, + "grad_norm": 1.8714969158172607, + "learning_rate": 1.2962962962962962e-05, + "loss": 0.2224, + "step": 10500 + }, + { + "epoch": 7.507142857142857, + "grad_norm": 2.809420585632324, + "learning_rate": 1.2925925925925927e-05, + "loss": 0.2214, + "step": 10510 + }, + { + "epoch": 7.514285714285714, + "grad_norm": 2.2183682918548584, + "learning_rate": 1.2888888888888889e-05, + "loss": 0.2528, + "step": 10520 + }, + { + "epoch": 7.521428571428571, + "grad_norm": 2.108675479888916, + "learning_rate": 1.2851851851851854e-05, + "loss": 0.2593, + "step": 10530 + }, + { + "epoch": 7.5285714285714285, + "grad_norm": 1.9557310342788696, + "learning_rate": 1.2814814814814815e-05, + "loss": 0.1875, + "step": 10540 + }, + { + "epoch": 7.535714285714286, + "grad_norm": 2.3365464210510254, + "learning_rate": 1.2777777777777777e-05, + "loss": 0.2232, + "step": 10550 + }, + { + "epoch": 7.542857142857143, + "grad_norm": 1.4713023900985718, + "learning_rate": 1.2740740740740742e-05, + "loss": 0.2592, + "step": 10560 + }, + { + "epoch": 7.55, + "grad_norm": 2.5241403579711914, + "learning_rate": 1.2703703703703704e-05, + "loss": 0.2632, + "step": 10570 + }, + { + "epoch": 7.557142857142857, + "grad_norm": 2.726618528366089, + "learning_rate": 1.2666666666666668e-05, + "loss": 0.2068, + "step": 10580 + }, + { + "epoch": 7.564285714285714, + "grad_norm": 1.2947627305984497, + "learning_rate": 1.262962962962963e-05, + "loss": 0.1846, + "step": 10590 + }, + { + "epoch": 7.571428571428571, + "grad_norm": 1.4739402532577515, + "learning_rate": 1.2592592592592592e-05, + "loss": 0.1445, + "step": 10600 + }, + { + "epoch": 7.578571428571428, + "grad_norm": 1.7607239484786987, + "learning_rate": 1.2555555555555557e-05, + "loss": 0.2087, + "step": 10610 + }, + { + "epoch": 7.585714285714285, + "grad_norm": 1.510556697845459, + "learning_rate": 1.2518518518518518e-05, + "loss": 0.2356, + "step": 10620 + }, + { + "epoch": 7.5928571428571425, + "grad_norm": 1.4189872741699219, + "learning_rate": 1.2481481481481481e-05, + "loss": 0.2343, + "step": 10630 + }, + { + "epoch": 7.6, + "grad_norm": 3.209477424621582, + "learning_rate": 1.2444444444444445e-05, + "loss": 0.2131, + "step": 10640 + }, + { + "epoch": 7.607142857142857, + "grad_norm": 2.026301145553589, + "learning_rate": 1.2407407407407408e-05, + "loss": 0.3643, + "step": 10650 + }, + { + "epoch": 7.614285714285714, + "grad_norm": 2.355459451675415, + "learning_rate": 1.2370370370370371e-05, + "loss": 0.2382, + "step": 10660 + }, + { + "epoch": 7.621428571428572, + "grad_norm": 1.6867364645004272, + "learning_rate": 1.2333333333333334e-05, + "loss": 0.1886, + "step": 10670 + }, + { + "epoch": 7.628571428571428, + "grad_norm": 1.863373041152954, + "learning_rate": 1.2296296296296298e-05, + "loss": 0.2842, + "step": 10680 + }, + { + "epoch": 7.635714285714286, + "grad_norm": 1.4037106037139893, + "learning_rate": 1.225925925925926e-05, + "loss": 0.297, + "step": 10690 + }, + { + "epoch": 7.642857142857143, + "grad_norm": 1.2220287322998047, + "learning_rate": 1.2222222222222222e-05, + "loss": 0.2126, + "step": 10700 + }, + { + "epoch": 7.65, + "grad_norm": 1.796430230140686, + "learning_rate": 1.2185185185185186e-05, + "loss": 0.2494, + "step": 10710 + }, + { + "epoch": 7.6571428571428575, + "grad_norm": 2.7764432430267334, + "learning_rate": 1.2148148148148149e-05, + "loss": 0.3102, + "step": 10720 + }, + { + "epoch": 7.664285714285715, + "grad_norm": 2.2875261306762695, + "learning_rate": 1.2111111111111112e-05, + "loss": 0.2171, + "step": 10730 + }, + { + "epoch": 7.671428571428572, + "grad_norm": 1.3017419576644897, + "learning_rate": 1.2074074074074075e-05, + "loss": 0.2095, + "step": 10740 + }, + { + "epoch": 7.678571428571429, + "grad_norm": 1.5423152446746826, + "learning_rate": 1.2037037037037037e-05, + "loss": 0.3183, + "step": 10750 + }, + { + "epoch": 7.685714285714286, + "grad_norm": 2.0346460342407227, + "learning_rate": 1.2e-05, + "loss": 0.1669, + "step": 10760 + }, + { + "epoch": 7.692857142857143, + "grad_norm": 1.178389549255371, + "learning_rate": 1.1962962962962964e-05, + "loss": 0.3195, + "step": 10770 + }, + { + "epoch": 7.7, + "grad_norm": 1.3902812004089355, + "learning_rate": 1.1925925925925927e-05, + "loss": 0.2166, + "step": 10780 + }, + { + "epoch": 7.707142857142857, + "grad_norm": 2.8894922733306885, + "learning_rate": 1.188888888888889e-05, + "loss": 0.2216, + "step": 10790 + }, + { + "epoch": 7.714285714285714, + "grad_norm": 2.77864670753479, + "learning_rate": 1.1851851851851853e-05, + "loss": 0.224, + "step": 10800 + }, + { + "epoch": 7.7214285714285715, + "grad_norm": 1.370814323425293, + "learning_rate": 1.1814814814814817e-05, + "loss": 0.2277, + "step": 10810 + }, + { + "epoch": 7.728571428571429, + "grad_norm": 1.3543068170547485, + "learning_rate": 1.1777777777777778e-05, + "loss": 0.19, + "step": 10820 + }, + { + "epoch": 7.735714285714286, + "grad_norm": 2.4707486629486084, + "learning_rate": 1.1740740740740741e-05, + "loss": 0.1951, + "step": 10830 + }, + { + "epoch": 7.742857142857143, + "grad_norm": 2.284876823425293, + "learning_rate": 1.1703703703703705e-05, + "loss": 0.2206, + "step": 10840 + }, + { + "epoch": 7.75, + "grad_norm": 1.1018098592758179, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.1386, + "step": 10850 + }, + { + "epoch": 7.757142857142857, + "grad_norm": 1.9555597305297852, + "learning_rate": 1.1629629629629631e-05, + "loss": 0.1645, + "step": 10860 + }, + { + "epoch": 7.764285714285714, + "grad_norm": 1.1327388286590576, + "learning_rate": 1.1592592592592594e-05, + "loss": 0.1654, + "step": 10870 + }, + { + "epoch": 7.771428571428571, + "grad_norm": 1.6210479736328125, + "learning_rate": 1.1555555555555556e-05, + "loss": 0.1937, + "step": 10880 + }, + { + "epoch": 7.7785714285714285, + "grad_norm": 1.7142146825790405, + "learning_rate": 1.151851851851852e-05, + "loss": 0.1722, + "step": 10890 + }, + { + "epoch": 7.785714285714286, + "grad_norm": 3.529614210128784, + "learning_rate": 1.1481481481481482e-05, + "loss": 0.2414, + "step": 10900 + }, + { + "epoch": 7.792857142857143, + "grad_norm": 1.6331572532653809, + "learning_rate": 1.1444444444444446e-05, + "loss": 0.1844, + "step": 10910 + }, + { + "epoch": 7.8, + "grad_norm": 2.7431063652038574, + "learning_rate": 1.1407407407407409e-05, + "loss": 0.2745, + "step": 10920 + }, + { + "epoch": 7.807142857142857, + "grad_norm": 2.882291316986084, + "learning_rate": 1.1370370370370372e-05, + "loss": 0.2353, + "step": 10930 + }, + { + "epoch": 7.814285714285714, + "grad_norm": 2.3573696613311768, + "learning_rate": 1.1333333333333334e-05, + "loss": 0.2047, + "step": 10940 + }, + { + "epoch": 7.821428571428571, + "grad_norm": 1.370251178741455, + "learning_rate": 1.1296296296296297e-05, + "loss": 0.1855, + "step": 10950 + }, + { + "epoch": 7.828571428571428, + "grad_norm": 1.1445660591125488, + "learning_rate": 1.125925925925926e-05, + "loss": 0.2316, + "step": 10960 + }, + { + "epoch": 7.835714285714285, + "grad_norm": 2.094175100326538, + "learning_rate": 1.1222222222222224e-05, + "loss": 0.477, + "step": 10970 + }, + { + "epoch": 7.8428571428571425, + "grad_norm": 2.1148130893707275, + "learning_rate": 1.1185185185185187e-05, + "loss": 0.2581, + "step": 10980 + }, + { + "epoch": 7.85, + "grad_norm": 1.89934241771698, + "learning_rate": 1.1148148148148148e-05, + "loss": 0.2908, + "step": 10990 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 1.6785616874694824, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.2222, + "step": 11000 + }, + { + "epoch": 7.857142857142857, + "eval_loss": 0.34114697575569153, + "eval_rouge1": 0.91, + "eval_rouge2": 0.8517, + "eval_rougeL": 0.9073, + "eval_runtime": 122.2351, + "eval_samples_per_second": 11.453, + "eval_steps_per_second": 5.727, + "step": 11000 + }, + { + "epoch": 7.864285714285714, + "grad_norm": 2.335857629776001, + "learning_rate": 1.1074074074074075e-05, + "loss": 0.262, + "step": 11010 + }, + { + "epoch": 7.871428571428572, + "grad_norm": 1.9699100255966187, + "learning_rate": 1.1037037037037038e-05, + "loss": 0.2056, + "step": 11020 + }, + { + "epoch": 7.878571428571428, + "grad_norm": 0.8576107025146484, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.1929, + "step": 11030 + }, + { + "epoch": 7.885714285714286, + "grad_norm": 0.7365075945854187, + "learning_rate": 1.0962962962962963e-05, + "loss": 0.2494, + "step": 11040 + }, + { + "epoch": 7.892857142857143, + "grad_norm": 2.7551143169403076, + "learning_rate": 1.0925925925925926e-05, + "loss": 0.3239, + "step": 11050 + }, + { + "epoch": 7.9, + "grad_norm": 1.1980743408203125, + "learning_rate": 1.088888888888889e-05, + "loss": 0.1836, + "step": 11060 + }, + { + "epoch": 7.9071428571428575, + "grad_norm": 1.525614619255066, + "learning_rate": 1.0851851851851853e-05, + "loss": 0.3089, + "step": 11070 + }, + { + "epoch": 7.914285714285715, + "grad_norm": 2.0727596282958984, + "learning_rate": 1.0814814814814814e-05, + "loss": 0.1867, + "step": 11080 + }, + { + "epoch": 7.921428571428572, + "grad_norm": 1.242550015449524, + "learning_rate": 1.0777777777777778e-05, + "loss": 0.222, + "step": 11090 + }, + { + "epoch": 7.928571428571429, + "grad_norm": 1.3312640190124512, + "learning_rate": 1.074074074074074e-05, + "loss": 0.3334, + "step": 11100 + }, + { + "epoch": 7.935714285714286, + "grad_norm": 1.4483474493026733, + "learning_rate": 1.0703703703703704e-05, + "loss": 0.2966, + "step": 11110 + }, + { + "epoch": 7.942857142857143, + "grad_norm": 1.5403432846069336, + "learning_rate": 1.0666666666666667e-05, + "loss": 0.2591, + "step": 11120 + }, + { + "epoch": 7.95, + "grad_norm": 1.6620547771453857, + "learning_rate": 1.0629629629629629e-05, + "loss": 0.2216, + "step": 11130 + }, + { + "epoch": 7.957142857142857, + "grad_norm": 1.2060527801513672, + "learning_rate": 1.0592592592592592e-05, + "loss": 0.2403, + "step": 11140 + }, + { + "epoch": 7.964285714285714, + "grad_norm": 1.4476624727249146, + "learning_rate": 1.0555555555555555e-05, + "loss": 0.2739, + "step": 11150 + }, + { + "epoch": 7.9714285714285715, + "grad_norm": 2.2425661087036133, + "learning_rate": 1.0518518518518519e-05, + "loss": 0.2981, + "step": 11160 + }, + { + "epoch": 7.978571428571429, + "grad_norm": 2.2100632190704346, + "learning_rate": 1.0481481481481482e-05, + "loss": 0.2178, + "step": 11170 + }, + { + "epoch": 7.985714285714286, + "grad_norm": 1.080759882926941, + "learning_rate": 1.0444444444444445e-05, + "loss": 0.2947, + "step": 11180 + }, + { + "epoch": 7.992857142857143, + "grad_norm": 2.9972569942474365, + "learning_rate": 1.0407407407407407e-05, + "loss": 0.203, + "step": 11190 + }, + { + "epoch": 8.0, + "grad_norm": 1.7628710269927979, + "learning_rate": 1.037037037037037e-05, + "loss": 0.308, + "step": 11200 + }, + { + "epoch": 8.007142857142858, + "grad_norm": 1.569732666015625, + "learning_rate": 1.0333333333333333e-05, + "loss": 0.2003, + "step": 11210 + }, + { + "epoch": 8.014285714285714, + "grad_norm": 0.8212767839431763, + "learning_rate": 1.0296296296296296e-05, + "loss": 0.1957, + "step": 11220 + }, + { + "epoch": 8.021428571428572, + "grad_norm": 1.7055811882019043, + "learning_rate": 1.025925925925926e-05, + "loss": 0.2274, + "step": 11230 + }, + { + "epoch": 8.028571428571428, + "grad_norm": 1.6140356063842773, + "learning_rate": 1.0222222222222223e-05, + "loss": 0.2182, + "step": 11240 + }, + { + "epoch": 8.035714285714286, + "grad_norm": 1.6154979467391968, + "learning_rate": 1.0185185185185185e-05, + "loss": 0.1217, + "step": 11250 + }, + { + "epoch": 8.042857142857143, + "grad_norm": 2.4283053874969482, + "learning_rate": 1.0148148148148148e-05, + "loss": 0.2687, + "step": 11260 + }, + { + "epoch": 8.05, + "grad_norm": 2.201909065246582, + "learning_rate": 1.0111111111111111e-05, + "loss": 0.2836, + "step": 11270 + }, + { + "epoch": 8.057142857142857, + "grad_norm": 1.902273178100586, + "learning_rate": 1.0074074074074074e-05, + "loss": 0.2076, + "step": 11280 + }, + { + "epoch": 8.064285714285715, + "grad_norm": 1.7851389646530151, + "learning_rate": 1.0037037037037038e-05, + "loss": 0.2045, + "step": 11290 + }, + { + "epoch": 8.071428571428571, + "grad_norm": 1.1988000869750977, + "learning_rate": 1e-05, + "loss": 0.1832, + "step": 11300 + }, + { + "epoch": 8.07857142857143, + "grad_norm": 0.6530731320381165, + "learning_rate": 9.962962962962962e-06, + "loss": 0.1788, + "step": 11310 + }, + { + "epoch": 8.085714285714285, + "grad_norm": 1.5712918043136597, + "learning_rate": 9.925925925925926e-06, + "loss": 0.1601, + "step": 11320 + }, + { + "epoch": 8.092857142857143, + "grad_norm": 1.3664653301239014, + "learning_rate": 9.888888888888889e-06, + "loss": 0.2745, + "step": 11330 + }, + { + "epoch": 8.1, + "grad_norm": 1.084404706954956, + "learning_rate": 9.851851851851852e-06, + "loss": 0.2595, + "step": 11340 + }, + { + "epoch": 8.107142857142858, + "grad_norm": 1.9523823261260986, + "learning_rate": 9.814814814814815e-06, + "loss": 0.2105, + "step": 11350 + }, + { + "epoch": 8.114285714285714, + "grad_norm": 1.2386913299560547, + "learning_rate": 9.777777777777779e-06, + "loss": 0.1199, + "step": 11360 + }, + { + "epoch": 8.121428571428572, + "grad_norm": 1.6026146411895752, + "learning_rate": 9.74074074074074e-06, + "loss": 0.2419, + "step": 11370 + }, + { + "epoch": 8.128571428571428, + "grad_norm": 1.3624472618103027, + "learning_rate": 9.703703703703703e-06, + "loss": 0.2094, + "step": 11380 + }, + { + "epoch": 8.135714285714286, + "grad_norm": 1.7777026891708374, + "learning_rate": 9.666666666666667e-06, + "loss": 0.2033, + "step": 11390 + }, + { + "epoch": 8.142857142857142, + "grad_norm": 1.5583858489990234, + "learning_rate": 9.62962962962963e-06, + "loss": 0.2229, + "step": 11400 + }, + { + "epoch": 8.15, + "grad_norm": 0.7537804841995239, + "learning_rate": 9.592592592592593e-06, + "loss": 0.2201, + "step": 11410 + }, + { + "epoch": 8.157142857142857, + "grad_norm": 1.3313623666763306, + "learning_rate": 9.555555555555556e-06, + "loss": 0.1802, + "step": 11420 + }, + { + "epoch": 8.164285714285715, + "grad_norm": 2.136382818222046, + "learning_rate": 9.51851851851852e-06, + "loss": 0.1686, + "step": 11430 + }, + { + "epoch": 8.17142857142857, + "grad_norm": 1.4156885147094727, + "learning_rate": 9.481481481481481e-06, + "loss": 0.2291, + "step": 11440 + }, + { + "epoch": 8.178571428571429, + "grad_norm": 1.5296056270599365, + "learning_rate": 9.444444444444445e-06, + "loss": 0.2476, + "step": 11450 + }, + { + "epoch": 8.185714285714285, + "grad_norm": 1.5632902383804321, + "learning_rate": 9.407407407407408e-06, + "loss": 0.2304, + "step": 11460 + }, + { + "epoch": 8.192857142857143, + "grad_norm": 0.9542272686958313, + "learning_rate": 9.370370370370371e-06, + "loss": 0.1693, + "step": 11470 + }, + { + "epoch": 8.2, + "grad_norm": 3.356255292892456, + "learning_rate": 9.333333333333334e-06, + "loss": 0.3996, + "step": 11480 + }, + { + "epoch": 8.207142857142857, + "grad_norm": 1.6759045124053955, + "learning_rate": 9.296296296296298e-06, + "loss": 0.2134, + "step": 11490 + }, + { + "epoch": 8.214285714285714, + "grad_norm": 1.2791472673416138, + "learning_rate": 9.259259259259259e-06, + "loss": 0.248, + "step": 11500 + }, + { + "epoch": 8.221428571428572, + "grad_norm": 1.07367742061615, + "learning_rate": 9.222222222222222e-06, + "loss": 0.1731, + "step": 11510 + }, + { + "epoch": 8.228571428571428, + "grad_norm": 0.5693773627281189, + "learning_rate": 9.185185185185186e-06, + "loss": 0.2758, + "step": 11520 + }, + { + "epoch": 8.235714285714286, + "grad_norm": 1.7820035219192505, + "learning_rate": 9.148148148148149e-06, + "loss": 0.2147, + "step": 11530 + }, + { + "epoch": 8.242857142857142, + "grad_norm": 1.6593891382217407, + "learning_rate": 9.111111111111112e-06, + "loss": 0.2481, + "step": 11540 + }, + { + "epoch": 8.25, + "grad_norm": 1.4112298488616943, + "learning_rate": 9.074074074074075e-06, + "loss": 0.3307, + "step": 11550 + }, + { + "epoch": 8.257142857142856, + "grad_norm": 1.0225239992141724, + "learning_rate": 9.037037037037037e-06, + "loss": 0.1092, + "step": 11560 + }, + { + "epoch": 8.264285714285714, + "grad_norm": 1.4713934659957886, + "learning_rate": 9e-06, + "loss": 0.2332, + "step": 11570 + }, + { + "epoch": 8.271428571428572, + "grad_norm": 1.6922743320465088, + "learning_rate": 8.962962962962963e-06, + "loss": 0.2106, + "step": 11580 + }, + { + "epoch": 8.278571428571428, + "grad_norm": 2.4201695919036865, + "learning_rate": 8.925925925925927e-06, + "loss": 0.1746, + "step": 11590 + }, + { + "epoch": 8.285714285714286, + "grad_norm": 2.3649351596832275, + "learning_rate": 8.88888888888889e-06, + "loss": 0.2204, + "step": 11600 + }, + { + "epoch": 8.292857142857143, + "grad_norm": 1.0867241621017456, + "learning_rate": 8.851851851851853e-06, + "loss": 0.2246, + "step": 11610 + }, + { + "epoch": 8.3, + "grad_norm": 1.1103533506393433, + "learning_rate": 8.814814814814815e-06, + "loss": 0.2952, + "step": 11620 + }, + { + "epoch": 8.307142857142857, + "grad_norm": 1.9086233377456665, + "learning_rate": 8.777777777777778e-06, + "loss": 0.1627, + "step": 11630 + }, + { + "epoch": 8.314285714285715, + "grad_norm": 1.5733546018600464, + "learning_rate": 8.740740740740741e-06, + "loss": 0.2493, + "step": 11640 + }, + { + "epoch": 8.321428571428571, + "grad_norm": 1.514758586883545, + "learning_rate": 8.703703703703705e-06, + "loss": 0.1708, + "step": 11650 + }, + { + "epoch": 8.32857142857143, + "grad_norm": 4.691562175750732, + "learning_rate": 8.666666666666668e-06, + "loss": 0.2622, + "step": 11660 + }, + { + "epoch": 8.335714285714285, + "grad_norm": 1.0987350940704346, + "learning_rate": 8.629629629629631e-06, + "loss": 0.1291, + "step": 11670 + }, + { + "epoch": 8.342857142857143, + "grad_norm": 1.4016727209091187, + "learning_rate": 8.592592592592593e-06, + "loss": 0.2075, + "step": 11680 + }, + { + "epoch": 8.35, + "grad_norm": 0.69717937707901, + "learning_rate": 8.555555555555556e-06, + "loss": 0.1609, + "step": 11690 + }, + { + "epoch": 8.357142857142858, + "grad_norm": 2.023461103439331, + "learning_rate": 8.518518518518519e-06, + "loss": 0.235, + "step": 11700 + }, + { + "epoch": 8.364285714285714, + "grad_norm": 2.0078303813934326, + "learning_rate": 8.481481481481482e-06, + "loss": 0.2081, + "step": 11710 + }, + { + "epoch": 8.371428571428572, + "grad_norm": 1.4724724292755127, + "learning_rate": 8.444444444444446e-06, + "loss": 0.2148, + "step": 11720 + }, + { + "epoch": 8.378571428571428, + "grad_norm": 2.9564125537872314, + "learning_rate": 8.407407407407409e-06, + "loss": 0.2295, + "step": 11730 + }, + { + "epoch": 8.385714285714286, + "grad_norm": 1.4059520959854126, + "learning_rate": 8.37037037037037e-06, + "loss": 0.1398, + "step": 11740 + }, + { + "epoch": 8.392857142857142, + "grad_norm": 2.894953966140747, + "learning_rate": 8.333333333333334e-06, + "loss": 0.2699, + "step": 11750 + }, + { + "epoch": 8.4, + "grad_norm": 2.5017454624176025, + "learning_rate": 8.296296296296297e-06, + "loss": 0.2322, + "step": 11760 + }, + { + "epoch": 8.407142857142857, + "grad_norm": 0.9503372311592102, + "learning_rate": 8.25925925925926e-06, + "loss": 0.1356, + "step": 11770 + }, + { + "epoch": 8.414285714285715, + "grad_norm": 1.63711416721344, + "learning_rate": 8.222222222222223e-06, + "loss": 0.1929, + "step": 11780 + }, + { + "epoch": 8.42142857142857, + "grad_norm": 0.5972274541854858, + "learning_rate": 8.185185185185187e-06, + "loss": 0.2533, + "step": 11790 + }, + { + "epoch": 8.428571428571429, + "grad_norm": 1.0893709659576416, + "learning_rate": 8.14814814814815e-06, + "loss": 0.2089, + "step": 11800 + }, + { + "epoch": 8.435714285714285, + "grad_norm": 1.5523369312286377, + "learning_rate": 8.111111111111112e-06, + "loss": 0.1589, + "step": 11810 + }, + { + "epoch": 8.442857142857143, + "grad_norm": 1.5510472059249878, + "learning_rate": 8.074074074074075e-06, + "loss": 0.2162, + "step": 11820 + }, + { + "epoch": 8.45, + "grad_norm": 2.1272058486938477, + "learning_rate": 8.037037037037038e-06, + "loss": 0.2172, + "step": 11830 + }, + { + "epoch": 8.457142857142857, + "grad_norm": 2.2862300872802734, + "learning_rate": 8.000000000000001e-06, + "loss": 0.2341, + "step": 11840 + }, + { + "epoch": 8.464285714285714, + "grad_norm": 1.126405954360962, + "learning_rate": 7.962962962962963e-06, + "loss": 0.1474, + "step": 11850 + }, + { + "epoch": 8.471428571428572, + "grad_norm": 2.000462293624878, + "learning_rate": 7.925925925925926e-06, + "loss": 0.2948, + "step": 11860 + }, + { + "epoch": 8.478571428571428, + "grad_norm": 1.4239530563354492, + "learning_rate": 7.88888888888889e-06, + "loss": 0.2267, + "step": 11870 + }, + { + "epoch": 8.485714285714286, + "grad_norm": 2.479355812072754, + "learning_rate": 7.851851851851853e-06, + "loss": 0.1857, + "step": 11880 + }, + { + "epoch": 8.492857142857144, + "grad_norm": 1.9741036891937256, + "learning_rate": 7.814814814814816e-06, + "loss": 0.2049, + "step": 11890 + }, + { + "epoch": 8.5, + "grad_norm": 1.996099591255188, + "learning_rate": 7.777777777777777e-06, + "loss": 0.2404, + "step": 11900 + }, + { + "epoch": 8.507142857142856, + "grad_norm": 1.185811161994934, + "learning_rate": 7.74074074074074e-06, + "loss": 0.1476, + "step": 11910 + }, + { + "epoch": 8.514285714285714, + "grad_norm": 1.7766746282577515, + "learning_rate": 7.703703703703704e-06, + "loss": 0.1483, + "step": 11920 + }, + { + "epoch": 8.521428571428572, + "grad_norm": 1.0609338283538818, + "learning_rate": 7.666666666666667e-06, + "loss": 0.3414, + "step": 11930 + }, + { + "epoch": 8.528571428571428, + "grad_norm": 1.352981448173523, + "learning_rate": 7.629629629629629e-06, + "loss": 0.2453, + "step": 11940 + }, + { + "epoch": 8.535714285714286, + "grad_norm": 2.01450252532959, + "learning_rate": 7.592592592592593e-06, + "loss": 0.0983, + "step": 11950 + }, + { + "epoch": 8.542857142857143, + "grad_norm": 0.9895955920219421, + "learning_rate": 7.555555555555556e-06, + "loss": 0.2821, + "step": 11960 + }, + { + "epoch": 8.55, + "grad_norm": 2.1095712184906006, + "learning_rate": 7.518518518518519e-06, + "loss": 0.2147, + "step": 11970 + }, + { + "epoch": 8.557142857142857, + "grad_norm": 1.3148187398910522, + "learning_rate": 7.481481481481483e-06, + "loss": 0.1834, + "step": 11980 + }, + { + "epoch": 8.564285714285715, + "grad_norm": 2.1209769248962402, + "learning_rate": 7.444444444444444e-06, + "loss": 0.2369, + "step": 11990 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 2.559124708175659, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.3642, + "step": 12000 + }, + { + "epoch": 8.571428571428571, + "eval_loss": 0.3329981863498688, + "eval_rouge1": 0.9114, + "eval_rouge2": 0.8538, + "eval_rougeL": 0.9087, + "eval_runtime": 122.2883, + "eval_samples_per_second": 11.448, + "eval_steps_per_second": 5.724, + "step": 12000 + }, + { + "epoch": 8.57857142857143, + "grad_norm": 2.582113027572632, + "learning_rate": 7.370370370370371e-06, + "loss": 0.3589, + "step": 12010 + }, + { + "epoch": 8.585714285714285, + "grad_norm": 1.1606924533843994, + "learning_rate": 7.333333333333334e-06, + "loss": 0.2036, + "step": 12020 + }, + { + "epoch": 8.592857142857143, + "grad_norm": 1.990123987197876, + "learning_rate": 7.296296296296297e-06, + "loss": 0.2347, + "step": 12030 + }, + { + "epoch": 8.6, + "grad_norm": 1.3593547344207764, + "learning_rate": 7.2592592592592605e-06, + "loss": 0.1475, + "step": 12040 + }, + { + "epoch": 8.607142857142858, + "grad_norm": 0.7821537256240845, + "learning_rate": 7.222222222222222e-06, + "loss": 0.2889, + "step": 12050 + }, + { + "epoch": 8.614285714285714, + "grad_norm": 2.2297816276550293, + "learning_rate": 7.185185185185185e-06, + "loss": 0.2495, + "step": 12060 + }, + { + "epoch": 8.621428571428572, + "grad_norm": 1.9694111347198486, + "learning_rate": 7.1481481481481486e-06, + "loss": 0.2182, + "step": 12070 + }, + { + "epoch": 8.628571428571428, + "grad_norm": 1.3611793518066406, + "learning_rate": 7.111111111111112e-06, + "loss": 0.2116, + "step": 12080 + }, + { + "epoch": 8.635714285714286, + "grad_norm": 0.9108260869979858, + "learning_rate": 7.074074074074074e-06, + "loss": 0.2229, + "step": 12090 + }, + { + "epoch": 8.642857142857142, + "grad_norm": 2.577470302581787, + "learning_rate": 7.0370370370370375e-06, + "loss": 0.3132, + "step": 12100 + }, + { + "epoch": 8.65, + "grad_norm": 0.837846040725708, + "learning_rate": 7.000000000000001e-06, + "loss": 0.1791, + "step": 12110 + }, + { + "epoch": 8.657142857142857, + "grad_norm": 2.7253611087799072, + "learning_rate": 6.962962962962963e-06, + "loss": 0.1998, + "step": 12120 + }, + { + "epoch": 8.664285714285715, + "grad_norm": 1.6284639835357666, + "learning_rate": 6.925925925925926e-06, + "loss": 0.2917, + "step": 12130 + }, + { + "epoch": 8.67142857142857, + "grad_norm": 1.0358765125274658, + "learning_rate": 6.888888888888889e-06, + "loss": 0.2949, + "step": 12140 + }, + { + "epoch": 8.678571428571429, + "grad_norm": 1.9100443124771118, + "learning_rate": 6.851851851851852e-06, + "loss": 0.1639, + "step": 12150 + }, + { + "epoch": 8.685714285714285, + "grad_norm": 1.4746047258377075, + "learning_rate": 6.814814814814815e-06, + "loss": 0.1798, + "step": 12160 + }, + { + "epoch": 8.692857142857143, + "grad_norm": 1.1177834272384644, + "learning_rate": 6.777777777777779e-06, + "loss": 0.2687, + "step": 12170 + }, + { + "epoch": 8.7, + "grad_norm": 0.8141186833381653, + "learning_rate": 6.74074074074074e-06, + "loss": 0.1627, + "step": 12180 + }, + { + "epoch": 8.707142857142857, + "grad_norm": 1.4017144441604614, + "learning_rate": 6.703703703703703e-06, + "loss": 0.1938, + "step": 12190 + }, + { + "epoch": 8.714285714285714, + "grad_norm": 0.7198919653892517, + "learning_rate": 6.666666666666667e-06, + "loss": 0.222, + "step": 12200 + }, + { + "epoch": 8.721428571428572, + "grad_norm": 1.4436593055725098, + "learning_rate": 6.62962962962963e-06, + "loss": 0.2821, + "step": 12210 + }, + { + "epoch": 8.728571428571428, + "grad_norm": 1.8222748041152954, + "learning_rate": 6.592592592592593e-06, + "loss": 0.3652, + "step": 12220 + }, + { + "epoch": 8.735714285714286, + "grad_norm": 1.033396601676941, + "learning_rate": 6.555555555555556e-06, + "loss": 0.1775, + "step": 12230 + }, + { + "epoch": 8.742857142857144, + "grad_norm": 1.3893368244171143, + "learning_rate": 6.518518518518518e-06, + "loss": 0.3292, + "step": 12240 + }, + { + "epoch": 8.75, + "grad_norm": 1.7294946908950806, + "learning_rate": 6.481481481481481e-06, + "loss": 0.3136, + "step": 12250 + }, + { + "epoch": 8.757142857142856, + "grad_norm": 1.5931612253189087, + "learning_rate": 6.4444444444444445e-06, + "loss": 0.2498, + "step": 12260 + }, + { + "epoch": 8.764285714285714, + "grad_norm": 1.6048915386199951, + "learning_rate": 6.407407407407408e-06, + "loss": 0.2996, + "step": 12270 + }, + { + "epoch": 8.771428571428572, + "grad_norm": 0.9885932803153992, + "learning_rate": 6.370370370370371e-06, + "loss": 0.1647, + "step": 12280 + }, + { + "epoch": 8.778571428571428, + "grad_norm": 4.228821277618408, + "learning_rate": 6.333333333333334e-06, + "loss": 0.2533, + "step": 12290 + }, + { + "epoch": 8.785714285714286, + "grad_norm": 2.150721788406372, + "learning_rate": 6.296296296296296e-06, + "loss": 0.2922, + "step": 12300 + }, + { + "epoch": 8.792857142857143, + "grad_norm": 2.0247929096221924, + "learning_rate": 6.259259259259259e-06, + "loss": 0.1613, + "step": 12310 + }, + { + "epoch": 8.8, + "grad_norm": 0.5003380179405212, + "learning_rate": 6.222222222222222e-06, + "loss": 0.1459, + "step": 12320 + }, + { + "epoch": 8.807142857142857, + "grad_norm": 0.9374644756317139, + "learning_rate": 6.1851851851851856e-06, + "loss": 0.2384, + "step": 12330 + }, + { + "epoch": 8.814285714285715, + "grad_norm": 1.5784926414489746, + "learning_rate": 6.148148148148149e-06, + "loss": 0.2458, + "step": 12340 + }, + { + "epoch": 8.821428571428571, + "grad_norm": 1.6696372032165527, + "learning_rate": 6.111111111111111e-06, + "loss": 0.2787, + "step": 12350 + }, + { + "epoch": 8.82857142857143, + "grad_norm": 1.2524793148040771, + "learning_rate": 6.0740740740740745e-06, + "loss": 0.2445, + "step": 12360 + }, + { + "epoch": 8.835714285714285, + "grad_norm": 2.176713466644287, + "learning_rate": 6.037037037037038e-06, + "loss": 0.1896, + "step": 12370 + }, + { + "epoch": 8.842857142857143, + "grad_norm": 1.964752197265625, + "learning_rate": 6e-06, + "loss": 0.261, + "step": 12380 + }, + { + "epoch": 8.85, + "grad_norm": 2.006638526916504, + "learning_rate": 5.962962962962963e-06, + "loss": 0.2415, + "step": 12390 + }, + { + "epoch": 8.857142857142858, + "grad_norm": 1.9313900470733643, + "learning_rate": 5.925925925925927e-06, + "loss": 0.3772, + "step": 12400 + }, + { + "epoch": 8.864285714285714, + "grad_norm": 2.112170934677124, + "learning_rate": 5.888888888888889e-06, + "loss": 0.2396, + "step": 12410 + }, + { + "epoch": 8.871428571428572, + "grad_norm": 2.7346787452697754, + "learning_rate": 5.851851851851852e-06, + "loss": 0.325, + "step": 12420 + }, + { + "epoch": 8.878571428571428, + "grad_norm": 3.1768178939819336, + "learning_rate": 5.814814814814816e-06, + "loss": 0.2393, + "step": 12430 + }, + { + "epoch": 8.885714285714286, + "grad_norm": 2.164926767349243, + "learning_rate": 5.777777777777778e-06, + "loss": 0.2444, + "step": 12440 + }, + { + "epoch": 8.892857142857142, + "grad_norm": 0.8274826407432556, + "learning_rate": 5.740740740740741e-06, + "loss": 0.2662, + "step": 12450 + }, + { + "epoch": 8.9, + "grad_norm": 1.024776816368103, + "learning_rate": 5.7037037037037045e-06, + "loss": 0.2059, + "step": 12460 + }, + { + "epoch": 8.907142857142857, + "grad_norm": 1.1089740991592407, + "learning_rate": 5.666666666666667e-06, + "loss": 0.2529, + "step": 12470 + }, + { + "epoch": 8.914285714285715, + "grad_norm": 1.3669365644454956, + "learning_rate": 5.62962962962963e-06, + "loss": 0.2432, + "step": 12480 + }, + { + "epoch": 8.92142857142857, + "grad_norm": 1.9711703062057495, + "learning_rate": 5.592592592592593e-06, + "loss": 0.127, + "step": 12490 + }, + { + "epoch": 8.928571428571429, + "grad_norm": 1.4796557426452637, + "learning_rate": 5.555555555555556e-06, + "loss": 0.2244, + "step": 12500 + }, + { + "epoch": 8.935714285714285, + "grad_norm": 2.029303789138794, + "learning_rate": 5.518518518518519e-06, + "loss": 0.2939, + "step": 12510 + }, + { + "epoch": 8.942857142857143, + "grad_norm": 2.0809414386749268, + "learning_rate": 5.4814814814814815e-06, + "loss": 0.2146, + "step": 12520 + }, + { + "epoch": 8.95, + "grad_norm": 1.2555458545684814, + "learning_rate": 5.444444444444445e-06, + "loss": 0.2116, + "step": 12530 + }, + { + "epoch": 8.957142857142857, + "grad_norm": 2.3402106761932373, + "learning_rate": 5.407407407407407e-06, + "loss": 0.1587, + "step": 12540 + }, + { + "epoch": 8.964285714285714, + "grad_norm": 1.2514257431030273, + "learning_rate": 5.37037037037037e-06, + "loss": 0.1598, + "step": 12550 + }, + { + "epoch": 8.971428571428572, + "grad_norm": 3.1680376529693604, + "learning_rate": 5.333333333333334e-06, + "loss": 0.2925, + "step": 12560 + }, + { + "epoch": 8.978571428571428, + "grad_norm": 2.482211112976074, + "learning_rate": 5.296296296296296e-06, + "loss": 0.2152, + "step": 12570 + }, + { + "epoch": 8.985714285714286, + "grad_norm": 2.3472371101379395, + "learning_rate": 5.259259259259259e-06, + "loss": 0.2079, + "step": 12580 + }, + { + "epoch": 8.992857142857144, + "grad_norm": 2.52826189994812, + "learning_rate": 5.2222222222222226e-06, + "loss": 0.2124, + "step": 12590 + }, + { + "epoch": 9.0, + "grad_norm": 1.8535226583480835, + "learning_rate": 5.185185185185185e-06, + "loss": 0.232, + "step": 12600 + }, + { + "epoch": 9.007142857142858, + "grad_norm": 2.8780601024627686, + "learning_rate": 5.148148148148148e-06, + "loss": 0.3111, + "step": 12610 + }, + { + "epoch": 9.014285714285714, + "grad_norm": 1.5909632444381714, + "learning_rate": 5.1111111111111115e-06, + "loss": 0.2788, + "step": 12620 + }, + { + "epoch": 9.021428571428572, + "grad_norm": 1.7076550722122192, + "learning_rate": 5.074074074074074e-06, + "loss": 0.2575, + "step": 12630 + }, + { + "epoch": 9.028571428571428, + "grad_norm": 2.8672637939453125, + "learning_rate": 5.037037037037037e-06, + "loss": 0.2228, + "step": 12640 + }, + { + "epoch": 9.035714285714286, + "grad_norm": 1.618055820465088, + "learning_rate": 5e-06, + "loss": 0.1157, + "step": 12650 + }, + { + "epoch": 9.042857142857143, + "grad_norm": 1.407122015953064, + "learning_rate": 4.962962962962963e-06, + "loss": 0.2321, + "step": 12660 + }, + { + "epoch": 9.05, + "grad_norm": 1.6875501871109009, + "learning_rate": 4.925925925925926e-06, + "loss": 0.2938, + "step": 12670 + }, + { + "epoch": 9.057142857142857, + "grad_norm": 1.0872751474380493, + "learning_rate": 4.888888888888889e-06, + "loss": 0.181, + "step": 12680 + }, + { + "epoch": 9.064285714285715, + "grad_norm": 1.689308524131775, + "learning_rate": 4.851851851851852e-06, + "loss": 0.1147, + "step": 12690 + }, + { + "epoch": 9.071428571428571, + "grad_norm": 2.5880138874053955, + "learning_rate": 4.814814814814815e-06, + "loss": 0.2757, + "step": 12700 + }, + { + "epoch": 9.07857142857143, + "grad_norm": 2.9612958431243896, + "learning_rate": 4.777777777777778e-06, + "loss": 0.2847, + "step": 12710 + }, + { + "epoch": 9.085714285714285, + "grad_norm": 1.7059327363967896, + "learning_rate": 4.740740740740741e-06, + "loss": 0.195, + "step": 12720 + }, + { + "epoch": 9.092857142857143, + "grad_norm": 1.702331304550171, + "learning_rate": 4.703703703703704e-06, + "loss": 0.2408, + "step": 12730 + }, + { + "epoch": 9.1, + "grad_norm": 1.814587950706482, + "learning_rate": 4.666666666666667e-06, + "loss": 0.3085, + "step": 12740 + }, + { + "epoch": 9.107142857142858, + "grad_norm": 2.863785743713379, + "learning_rate": 4.6296296296296296e-06, + "loss": 0.2667, + "step": 12750 + }, + { + "epoch": 9.114285714285714, + "grad_norm": 1.6600861549377441, + "learning_rate": 4.592592592592593e-06, + "loss": 0.1376, + "step": 12760 + }, + { + "epoch": 9.121428571428572, + "grad_norm": 1.3329426050186157, + "learning_rate": 4.555555555555556e-06, + "loss": 0.2547, + "step": 12770 + }, + { + "epoch": 9.128571428571428, + "grad_norm": 4.032371997833252, + "learning_rate": 4.5185185185185185e-06, + "loss": 0.4649, + "step": 12780 + }, + { + "epoch": 9.135714285714286, + "grad_norm": 2.709066390991211, + "learning_rate": 4.481481481481482e-06, + "loss": 0.2336, + "step": 12790 + }, + { + "epoch": 9.142857142857142, + "grad_norm": 1.063931941986084, + "learning_rate": 4.444444444444445e-06, + "loss": 0.1731, + "step": 12800 + }, + { + "epoch": 9.15, + "grad_norm": 1.3110073804855347, + "learning_rate": 4.407407407407407e-06, + "loss": 0.1381, + "step": 12810 + }, + { + "epoch": 9.157142857142857, + "grad_norm": 1.9574129581451416, + "learning_rate": 4.370370370370371e-06, + "loss": 0.1958, + "step": 12820 + }, + { + "epoch": 9.164285714285715, + "grad_norm": 2.5129504203796387, + "learning_rate": 4.333333333333334e-06, + "loss": 0.2162, + "step": 12830 + }, + { + "epoch": 9.17142857142857, + "grad_norm": 1.842850923538208, + "learning_rate": 4.296296296296296e-06, + "loss": 0.2746, + "step": 12840 + }, + { + "epoch": 9.178571428571429, + "grad_norm": 1.1451313495635986, + "learning_rate": 4.2592592592592596e-06, + "loss": 0.2442, + "step": 12850 + }, + { + "epoch": 9.185714285714285, + "grad_norm": 2.0765175819396973, + "learning_rate": 4.222222222222223e-06, + "loss": 0.1531, + "step": 12860 + }, + { + "epoch": 9.192857142857143, + "grad_norm": 0.9250321984291077, + "learning_rate": 4.185185185185185e-06, + "loss": 0.2876, + "step": 12870 + }, + { + "epoch": 9.2, + "grad_norm": 1.5332380533218384, + "learning_rate": 4.1481481481481485e-06, + "loss": 0.2255, + "step": 12880 + }, + { + "epoch": 9.207142857142857, + "grad_norm": 3.1344316005706787, + "learning_rate": 4.111111111111112e-06, + "loss": 0.2654, + "step": 12890 + }, + { + "epoch": 9.214285714285714, + "grad_norm": 1.4092166423797607, + "learning_rate": 4.074074074074075e-06, + "loss": 0.2358, + "step": 12900 + }, + { + "epoch": 9.221428571428572, + "grad_norm": 1.517716646194458, + "learning_rate": 4.037037037037037e-06, + "loss": 0.2455, + "step": 12910 + }, + { + "epoch": 9.228571428571428, + "grad_norm": 0.7355996966362, + "learning_rate": 4.000000000000001e-06, + "loss": 0.1761, + "step": 12920 + }, + { + "epoch": 9.235714285714286, + "grad_norm": 2.077259063720703, + "learning_rate": 3.962962962962963e-06, + "loss": 0.2791, + "step": 12930 + }, + { + "epoch": 9.242857142857142, + "grad_norm": 1.3175309896469116, + "learning_rate": 3.925925925925926e-06, + "loss": 0.2296, + "step": 12940 + }, + { + "epoch": 9.25, + "grad_norm": 1.0608943700790405, + "learning_rate": 3.888888888888889e-06, + "loss": 0.1609, + "step": 12950 + }, + { + "epoch": 9.257142857142856, + "grad_norm": 2.2581288814544678, + "learning_rate": 3.851851851851852e-06, + "loss": 0.2192, + "step": 12960 + }, + { + "epoch": 9.264285714285714, + "grad_norm": 1.672400951385498, + "learning_rate": 3.814814814814814e-06, + "loss": 0.1548, + "step": 12970 + }, + { + "epoch": 9.271428571428572, + "grad_norm": 0.7743004560470581, + "learning_rate": 3.777777777777778e-06, + "loss": 0.3168, + "step": 12980 + }, + { + "epoch": 9.278571428571428, + "grad_norm": 0.9588621854782104, + "learning_rate": 3.7407407407407413e-06, + "loss": 0.1585, + "step": 12990 + }, + { + "epoch": 9.285714285714286, + "grad_norm": 1.7508875131607056, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.2137, + "step": 13000 + }, + { + "epoch": 9.285714285714286, + "eval_loss": 0.3308734893798828, + "eval_rouge1": 0.9118, + "eval_rouge2": 0.8545, + "eval_rougeL": 0.909, + "eval_runtime": 122.198, + "eval_samples_per_second": 11.457, + "eval_steps_per_second": 5.728, + "step": 13000 + } + ], + "logging_steps": 10, + "max_steps": 14000, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 1000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 2, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.24700797698048e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}