|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"global_step": 59688, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00019832462136442838, |
|
"loss": 70.136, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00019664924272885673, |
|
"loss": 54.3396, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 55.01993179321289, |
|
"eval_runtime": 0.5271, |
|
"eval_samples_per_second": 94.864, |
|
"eval_steps_per_second": 3.795, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00019497386409328508, |
|
"loss": 52.5498, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00019329848545771345, |
|
"loss": 51.633, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 53.25694274902344, |
|
"eval_runtime": 0.5629, |
|
"eval_samples_per_second": 88.83, |
|
"eval_steps_per_second": 3.553, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00019162310682214182, |
|
"loss": 51.0517, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0001899477281865702, |
|
"loss": 50.6422, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 52.43913269042969, |
|
"eval_runtime": 0.506, |
|
"eval_samples_per_second": 98.817, |
|
"eval_steps_per_second": 3.953, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00018827234955099854, |
|
"loss": 50.2641, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0001865969709154269, |
|
"loss": 50.023, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 52.02763748168945, |
|
"eval_runtime": 0.5162, |
|
"eval_samples_per_second": 96.869, |
|
"eval_steps_per_second": 3.875, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.00018492159227985526, |
|
"loss": 49.8153, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 0.00018324621364428363, |
|
"loss": 49.7182, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 51.66868209838867, |
|
"eval_runtime": 0.4836, |
|
"eval_samples_per_second": 103.392, |
|
"eval_steps_per_second": 4.136, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.00018157083500871198, |
|
"loss": 49.5574, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 0.00017989545637314033, |
|
"loss": 49.4063, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 51.36139678955078, |
|
"eval_runtime": 0.487, |
|
"eval_samples_per_second": 102.67, |
|
"eval_steps_per_second": 4.107, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.0001782200777375687, |
|
"loss": 49.3062, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 0.00017654469910199707, |
|
"loss": 49.2278, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 51.16618347167969, |
|
"eval_runtime": 0.4852, |
|
"eval_samples_per_second": 103.045, |
|
"eval_steps_per_second": 4.122, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.00017486932046642542, |
|
"loss": 49.1404, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 0.0001731939418308538, |
|
"loss": 49.0291, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 50.956336975097656, |
|
"eval_runtime": 0.4843, |
|
"eval_samples_per_second": 103.238, |
|
"eval_steps_per_second": 4.13, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 0.00017151856319528214, |
|
"loss": 48.966, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 0.0001698431845597105, |
|
"loss": 48.8771, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 50.9631233215332, |
|
"eval_runtime": 0.4832, |
|
"eval_samples_per_second": 103.477, |
|
"eval_steps_per_second": 4.139, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.00016816780592413886, |
|
"loss": 48.842, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 0.00016649242728856723, |
|
"loss": 48.8414, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 50.825313568115234, |
|
"eval_runtime": 0.484, |
|
"eval_samples_per_second": 103.302, |
|
"eval_steps_per_second": 4.132, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 0.00016481704865299558, |
|
"loss": 48.7524, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 0.00016314167001742395, |
|
"loss": 48.6891, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_loss": 50.6945686340332, |
|
"eval_runtime": 0.4902, |
|
"eval_samples_per_second": 101.994, |
|
"eval_steps_per_second": 4.08, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 0.0001614662913818523, |
|
"loss": 48.6808, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 0.00015979091274628067, |
|
"loss": 48.6235, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 50.51723098754883, |
|
"eval_runtime": 0.4899, |
|
"eval_samples_per_second": 102.054, |
|
"eval_steps_per_second": 4.082, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 0.00015811553411070902, |
|
"loss": 48.5341, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.0001564401554751374, |
|
"loss": 48.5, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 50.48223114013672, |
|
"eval_runtime": 0.4899, |
|
"eval_samples_per_second": 102.056, |
|
"eval_steps_per_second": 4.082, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 0.00015476477683956574, |
|
"loss": 48.4708, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.0001530893982039941, |
|
"loss": 48.4386, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 50.327213287353516, |
|
"eval_runtime": 0.4814, |
|
"eval_samples_per_second": 103.863, |
|
"eval_steps_per_second": 4.155, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 0.00015141401956842248, |
|
"loss": 48.4488, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 0.00014973864093285083, |
|
"loss": 48.3875, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 50.33498001098633, |
|
"eval_runtime": 0.5229, |
|
"eval_samples_per_second": 95.612, |
|
"eval_steps_per_second": 3.824, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 0.00014806326229727917, |
|
"loss": 48.3842, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 0.00014638788366170755, |
|
"loss": 48.3353, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 50.22550964355469, |
|
"eval_runtime": 0.5249, |
|
"eval_samples_per_second": 95.248, |
|
"eval_steps_per_second": 3.81, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 0.00014471250502613592, |
|
"loss": 48.3718, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 0.00014303712639056427, |
|
"loss": 48.3404, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_loss": 50.19435501098633, |
|
"eval_runtime": 0.5549, |
|
"eval_samples_per_second": 90.105, |
|
"eval_steps_per_second": 3.604, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 0.00014136174775499264, |
|
"loss": 48.2976, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 0.000139686369119421, |
|
"loss": 48.2946, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 50.1772346496582, |
|
"eval_runtime": 0.6058, |
|
"eval_samples_per_second": 82.531, |
|
"eval_steps_per_second": 3.301, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 0.00013801099048384936, |
|
"loss": 48.2515, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 0.00013633561184827773, |
|
"loss": 48.2941, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 50.101707458496094, |
|
"eval_runtime": 0.55, |
|
"eval_samples_per_second": 90.916, |
|
"eval_steps_per_second": 3.637, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 0.00013466023321270608, |
|
"loss": 48.1908, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"learning_rate": 0.00013298485457713443, |
|
"loss": 48.1547, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"eval_loss": 50.0978889465332, |
|
"eval_runtime": 0.4898, |
|
"eval_samples_per_second": 102.081, |
|
"eval_steps_per_second": 4.083, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"learning_rate": 0.0001313094759415628, |
|
"loss": 47.9981, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"learning_rate": 0.00012963409730599117, |
|
"loss": 48.0147, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"eval_loss": 50.161685943603516, |
|
"eval_runtime": 0.4871, |
|
"eval_samples_per_second": 102.644, |
|
"eval_steps_per_second": 4.106, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"learning_rate": 0.00012795871867041952, |
|
"loss": 47.9851, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"learning_rate": 0.00012628334003484786, |
|
"loss": 47.936, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"eval_loss": 50.107810974121094, |
|
"eval_runtime": 0.4846, |
|
"eval_samples_per_second": 103.186, |
|
"eval_steps_per_second": 4.127, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"learning_rate": 0.00012460796139927624, |
|
"loss": 48.0228, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"learning_rate": 0.0001229325827637046, |
|
"loss": 47.9642, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_loss": 50.060237884521484, |
|
"eval_runtime": 0.4856, |
|
"eval_samples_per_second": 102.961, |
|
"eval_steps_per_second": 4.118, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 0.00012125720412813297, |
|
"loss": 47.9917, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"learning_rate": 0.00011958182549256132, |
|
"loss": 47.9531, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"eval_loss": 50.01976013183594, |
|
"eval_runtime": 0.483, |
|
"eval_samples_per_second": 103.524, |
|
"eval_steps_per_second": 4.141, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"learning_rate": 0.00011790644685698968, |
|
"loss": 47.9594, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"learning_rate": 0.00011623106822141805, |
|
"loss": 47.9126, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"eval_loss": 50.01350021362305, |
|
"eval_runtime": 0.4811, |
|
"eval_samples_per_second": 103.929, |
|
"eval_steps_per_second": 4.157, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"learning_rate": 0.00011455568958584641, |
|
"loss": 47.9141, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"learning_rate": 0.00011288031095027476, |
|
"loss": 47.9471, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"eval_loss": 49.97874450683594, |
|
"eval_runtime": 0.5117, |
|
"eval_samples_per_second": 97.707, |
|
"eval_steps_per_second": 3.908, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"learning_rate": 0.00011120493231470313, |
|
"loss": 47.9071, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"learning_rate": 0.00010952955367913149, |
|
"loss": 47.9708, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"eval_loss": 49.936649322509766, |
|
"eval_runtime": 0.4804, |
|
"eval_samples_per_second": 104.085, |
|
"eval_steps_per_second": 4.163, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"learning_rate": 0.00010785417504355986, |
|
"loss": 47.9294, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"learning_rate": 0.00010617879640798821, |
|
"loss": 47.8889, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"eval_loss": 49.88063049316406, |
|
"eval_runtime": 0.5134, |
|
"eval_samples_per_second": 97.396, |
|
"eval_steps_per_second": 3.896, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"learning_rate": 0.00010450341777241657, |
|
"loss": 47.9306, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"learning_rate": 0.00010282803913684493, |
|
"loss": 47.909, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"eval_loss": 49.8420524597168, |
|
"eval_runtime": 0.556, |
|
"eval_samples_per_second": 89.932, |
|
"eval_steps_per_second": 3.597, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"learning_rate": 0.0001011526605012733, |
|
"loss": 47.9044, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"learning_rate": 9.947728186570166e-05, |
|
"loss": 47.8723, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"eval_loss": 49.780277252197266, |
|
"eval_runtime": 0.4985, |
|
"eval_samples_per_second": 100.306, |
|
"eval_steps_per_second": 4.012, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"learning_rate": 9.780190323013e-05, |
|
"loss": 47.8765, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"learning_rate": 9.612652459455838e-05, |
|
"loss": 47.865, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"eval_loss": 49.72420883178711, |
|
"eval_runtime": 0.5031, |
|
"eval_samples_per_second": 99.377, |
|
"eval_steps_per_second": 3.975, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"learning_rate": 9.445114595898673e-05, |
|
"loss": 47.8886, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"learning_rate": 9.27757673234151e-05, |
|
"loss": 47.8624, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"eval_loss": 49.74140548706055, |
|
"eval_runtime": 0.5052, |
|
"eval_samples_per_second": 98.962, |
|
"eval_steps_per_second": 3.958, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"learning_rate": 9.110038868784345e-05, |
|
"loss": 47.8765, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"learning_rate": 8.942501005227182e-05, |
|
"loss": 47.8475, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"eval_loss": 49.755550384521484, |
|
"eval_runtime": 0.5482, |
|
"eval_samples_per_second": 91.209, |
|
"eval_steps_per_second": 3.648, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"learning_rate": 8.774963141670018e-05, |
|
"loss": 47.8899, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"learning_rate": 8.607425278112854e-05, |
|
"loss": 47.8309, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"eval_loss": 49.78474044799805, |
|
"eval_runtime": 0.511, |
|
"eval_samples_per_second": 97.843, |
|
"eval_steps_per_second": 3.914, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"learning_rate": 8.439887414555691e-05, |
|
"loss": 47.8192, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"learning_rate": 8.272349550998526e-05, |
|
"loss": 47.8468, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_loss": 49.781124114990234, |
|
"eval_runtime": 0.5263, |
|
"eval_samples_per_second": 95.0, |
|
"eval_steps_per_second": 3.8, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"learning_rate": 8.104811687441363e-05, |
|
"loss": 47.7881, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"learning_rate": 7.937273823884198e-05, |
|
"loss": 47.85, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"eval_loss": 49.724666595458984, |
|
"eval_runtime": 0.5014, |
|
"eval_samples_per_second": 99.722, |
|
"eval_steps_per_second": 3.989, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"learning_rate": 7.769735960327035e-05, |
|
"loss": 47.7805, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"learning_rate": 7.60219809676987e-05, |
|
"loss": 47.7769, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"eval_loss": 49.74330139160156, |
|
"eval_runtime": 0.5536, |
|
"eval_samples_per_second": 90.316, |
|
"eval_steps_per_second": 3.613, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 7.434660233212707e-05, |
|
"loss": 47.8081, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"learning_rate": 7.267122369655543e-05, |
|
"loss": 47.8395, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"eval_loss": 49.658382415771484, |
|
"eval_runtime": 0.5279, |
|
"eval_samples_per_second": 94.711, |
|
"eval_steps_per_second": 3.788, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"learning_rate": 7.099584506098379e-05, |
|
"loss": 47.8198, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"learning_rate": 6.932046642541215e-05, |
|
"loss": 47.7978, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"eval_loss": 49.645263671875, |
|
"eval_runtime": 0.4826, |
|
"eval_samples_per_second": 103.596, |
|
"eval_steps_per_second": 4.144, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 6.764508778984051e-05, |
|
"loss": 47.8224, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"learning_rate": 6.596970915426887e-05, |
|
"loss": 47.7541, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"eval_loss": 49.628944396972656, |
|
"eval_runtime": 0.484, |
|
"eval_samples_per_second": 103.305, |
|
"eval_steps_per_second": 4.132, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"learning_rate": 6.429433051869723e-05, |
|
"loss": 47.6855, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"learning_rate": 6.261895188312559e-05, |
|
"loss": 47.6644, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"eval_loss": 49.626243591308594, |
|
"eval_runtime": 0.5063, |
|
"eval_samples_per_second": 98.747, |
|
"eval_steps_per_second": 3.95, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"learning_rate": 6.0943573247553954e-05, |
|
"loss": 47.6146, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"learning_rate": 5.926819461198231e-05, |
|
"loss": 47.6472, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"eval_loss": 49.61125183105469, |
|
"eval_runtime": 0.483, |
|
"eval_samples_per_second": 103.514, |
|
"eval_steps_per_second": 4.141, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"learning_rate": 5.7592815976410674e-05, |
|
"loss": 47.6434, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"learning_rate": 5.5917437340839026e-05, |
|
"loss": 47.6436, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"eval_loss": 49.59366226196289, |
|
"eval_runtime": 0.4865, |
|
"eval_samples_per_second": 102.777, |
|
"eval_steps_per_second": 4.111, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"learning_rate": 5.424205870526739e-05, |
|
"loss": 47.6443, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"learning_rate": 5.256668006969575e-05, |
|
"loss": 47.6082, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"eval_loss": 49.62874984741211, |
|
"eval_runtime": 0.4801, |
|
"eval_samples_per_second": 104.141, |
|
"eval_steps_per_second": 4.166, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"learning_rate": 5.089130143412412e-05, |
|
"loss": 47.6407, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"learning_rate": 4.921592279855248e-05, |
|
"loss": 47.63, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"eval_loss": 49.61036682128906, |
|
"eval_runtime": 0.4794, |
|
"eval_samples_per_second": 104.297, |
|
"eval_steps_per_second": 4.172, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"learning_rate": 4.754054416298084e-05, |
|
"loss": 47.6592, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 4.58651655274092e-05, |
|
"loss": 47.6281, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"eval_loss": 49.57414245605469, |
|
"eval_runtime": 0.4781, |
|
"eval_samples_per_second": 104.584, |
|
"eval_steps_per_second": 4.183, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"learning_rate": 4.418978689183756e-05, |
|
"loss": 47.6234, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"learning_rate": 4.251440825626592e-05, |
|
"loss": 47.661, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"eval_loss": 49.538631439208984, |
|
"eval_runtime": 0.4811, |
|
"eval_samples_per_second": 103.938, |
|
"eval_steps_per_second": 4.158, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"learning_rate": 4.083902962069428e-05, |
|
"loss": 47.6321, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"learning_rate": 3.9163650985122644e-05, |
|
"loss": 47.5848, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"eval_loss": 49.5694580078125, |
|
"eval_runtime": 0.4826, |
|
"eval_samples_per_second": 103.601, |
|
"eval_steps_per_second": 4.144, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"learning_rate": 3.7488272349551004e-05, |
|
"loss": 47.6428, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"learning_rate": 3.581289371397936e-05, |
|
"loss": 47.6209, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"eval_loss": 49.554813385009766, |
|
"eval_runtime": 0.4889, |
|
"eval_samples_per_second": 102.277, |
|
"eval_steps_per_second": 4.091, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"learning_rate": 3.413751507840772e-05, |
|
"loss": 47.6301, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"learning_rate": 3.246213644283608e-05, |
|
"loss": 47.637, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"eval_loss": 49.56814956665039, |
|
"eval_runtime": 0.483, |
|
"eval_samples_per_second": 103.514, |
|
"eval_steps_per_second": 4.141, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"learning_rate": 3.078675780726444e-05, |
|
"loss": 47.6194, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"learning_rate": 2.9111379171692806e-05, |
|
"loss": 47.6155, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"eval_loss": 49.561641693115234, |
|
"eval_runtime": 0.4803, |
|
"eval_samples_per_second": 104.094, |
|
"eval_steps_per_second": 4.164, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"learning_rate": 2.7436000536121165e-05, |
|
"loss": 47.5766, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"learning_rate": 2.5760621900549525e-05, |
|
"loss": 47.605, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"eval_loss": 49.548133850097656, |
|
"eval_runtime": 0.4783, |
|
"eval_samples_per_second": 104.528, |
|
"eval_steps_per_second": 4.181, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"learning_rate": 2.4085243264977885e-05, |
|
"loss": 47.5708, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"learning_rate": 2.2409864629406248e-05, |
|
"loss": 47.6354, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"eval_loss": 49.530643463134766, |
|
"eval_runtime": 0.4813, |
|
"eval_samples_per_second": 103.878, |
|
"eval_steps_per_second": 4.155, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"learning_rate": 2.0734485993834608e-05, |
|
"loss": 47.5549, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"learning_rate": 1.9059107358262967e-05, |
|
"loss": 47.6402, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"eval_loss": 49.539710998535156, |
|
"eval_runtime": 0.4791, |
|
"eval_samples_per_second": 104.352, |
|
"eval_steps_per_second": 4.174, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 1.7383728722691327e-05, |
|
"loss": 47.5883, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"learning_rate": 1.570835008711969e-05, |
|
"loss": 47.607, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"eval_loss": 49.52818298339844, |
|
"eval_runtime": 0.5295, |
|
"eval_samples_per_second": 94.423, |
|
"eval_steps_per_second": 3.777, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"learning_rate": 1.403297145154805e-05, |
|
"loss": 47.5659, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"learning_rate": 1.2357592815976411e-05, |
|
"loss": 47.577, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"eval_loss": 49.54225540161133, |
|
"eval_runtime": 0.478, |
|
"eval_samples_per_second": 104.606, |
|
"eval_steps_per_second": 4.184, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"learning_rate": 1.0682214180404771e-05, |
|
"loss": 47.5713, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"learning_rate": 9.006835544833132e-06, |
|
"loss": 47.5776, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"eval_loss": 49.539093017578125, |
|
"eval_runtime": 0.5132, |
|
"eval_samples_per_second": 97.431, |
|
"eval_steps_per_second": 3.897, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"learning_rate": 7.331456909261493e-06, |
|
"loss": 47.5852, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"learning_rate": 5.656078273689854e-06, |
|
"loss": 47.6098, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"eval_loss": 49.5291748046875, |
|
"eval_runtime": 0.5718, |
|
"eval_samples_per_second": 87.44, |
|
"eval_steps_per_second": 3.498, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"learning_rate": 3.980699638118215e-06, |
|
"loss": 47.5699, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"learning_rate": 2.3053210025465755e-06, |
|
"loss": 47.596, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"eval_loss": 49.524417877197266, |
|
"eval_runtime": 0.5058, |
|
"eval_samples_per_second": 98.86, |
|
"eval_steps_per_second": 3.954, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"learning_rate": 6.299423669749365e-07, |
|
"loss": 47.5627, |
|
"step": 59500 |
|
} |
|
], |
|
"max_steps": 59688, |
|
"num_train_epochs": 3, |
|
"total_flos": 1.0794540774520259e+19, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|