|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9973992197659298, |
|
"eval_steps": 500, |
|
"global_step": 384, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02080624187256177, |
|
"grad_norm": 0.6747086644172668, |
|
"learning_rate": 4e-05, |
|
"loss": 1.2614, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04161248374512354, |
|
"grad_norm": 0.8340696096420288, |
|
"learning_rate": 8e-05, |
|
"loss": 1.0737, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.06241872561768531, |
|
"grad_norm": 0.35006779432296753, |
|
"learning_rate": 0.00012, |
|
"loss": 0.5388, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.08322496749024708, |
|
"grad_norm": 0.13230730593204498, |
|
"learning_rate": 0.00016, |
|
"loss": 0.3531, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.10403120936280884, |
|
"grad_norm": 0.2690051198005676, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2815, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12483745123537061, |
|
"grad_norm": 0.13100554049015045, |
|
"learning_rate": 0.0001978021978021978, |
|
"loss": 0.228, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.14564369310793238, |
|
"grad_norm": 0.12533320486545563, |
|
"learning_rate": 0.00019560439560439562, |
|
"loss": 0.2046, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.16644993498049415, |
|
"grad_norm": 0.08659245818853378, |
|
"learning_rate": 0.00019340659340659342, |
|
"loss": 0.179, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.18725617685305593, |
|
"grad_norm": 0.08915656059980392, |
|
"learning_rate": 0.00019120879120879122, |
|
"loss": 0.1853, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.20806241872561768, |
|
"grad_norm": 0.08073730021715164, |
|
"learning_rate": 0.00018901098901098903, |
|
"loss": 0.1594, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.22886866059817945, |
|
"grad_norm": 0.08706673234701157, |
|
"learning_rate": 0.00018681318681318683, |
|
"loss": 0.1623, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.24967490247074123, |
|
"grad_norm": 0.08052384108304977, |
|
"learning_rate": 0.00018461538461538463, |
|
"loss": 0.1469, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.270481144343303, |
|
"grad_norm": 0.07896815240383148, |
|
"learning_rate": 0.0001824175824175824, |
|
"loss": 0.1503, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.29128738621586475, |
|
"grad_norm": 0.09580652415752411, |
|
"learning_rate": 0.00018021978021978024, |
|
"loss": 0.1504, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.31209362808842656, |
|
"grad_norm": 0.1383538842201233, |
|
"learning_rate": 0.00017802197802197802, |
|
"loss": 0.139, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3328998699609883, |
|
"grad_norm": 0.08063507080078125, |
|
"learning_rate": 0.00017582417582417582, |
|
"loss": 0.1299, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.35370611183355005, |
|
"grad_norm": 0.12305614352226257, |
|
"learning_rate": 0.00017362637362637365, |
|
"loss": 0.1299, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.37451235370611186, |
|
"grad_norm": 0.144222691655159, |
|
"learning_rate": 0.00017142857142857143, |
|
"loss": 0.1456, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.3953185955786736, |
|
"grad_norm": 0.10816789418458939, |
|
"learning_rate": 0.00016923076923076923, |
|
"loss": 0.1449, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.41612483745123535, |
|
"grad_norm": 0.09802815318107605, |
|
"learning_rate": 0.00016703296703296706, |
|
"loss": 0.1337, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.43693107932379716, |
|
"grad_norm": 0.09102603048086166, |
|
"learning_rate": 0.00016483516483516484, |
|
"loss": 0.1188, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.4577373211963589, |
|
"grad_norm": 0.08224384486675262, |
|
"learning_rate": 0.00016263736263736264, |
|
"loss": 0.1253, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.47854356306892065, |
|
"grad_norm": 0.08681938797235489, |
|
"learning_rate": 0.00016043956043956044, |
|
"loss": 0.138, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.49934980494148246, |
|
"grad_norm": 0.1142863929271698, |
|
"learning_rate": 0.00015824175824175824, |
|
"loss": 0.1454, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.5201560468140443, |
|
"grad_norm": 0.14234645664691925, |
|
"learning_rate": 0.00015604395604395605, |
|
"loss": 0.1257, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.540962288686606, |
|
"grad_norm": 0.09189997613430023, |
|
"learning_rate": 0.00015384615384615385, |
|
"loss": 0.1241, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.5617685305591678, |
|
"grad_norm": 0.09716198593378067, |
|
"learning_rate": 0.00015164835164835165, |
|
"loss": 0.1257, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.5825747724317295, |
|
"grad_norm": 0.10805213451385498, |
|
"learning_rate": 0.00014945054945054946, |
|
"loss": 0.1202, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.6033810143042913, |
|
"grad_norm": 0.11224936693906784, |
|
"learning_rate": 0.00014725274725274726, |
|
"loss": 0.1325, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.6241872561768531, |
|
"grad_norm": 0.0994434505701065, |
|
"learning_rate": 0.00014505494505494506, |
|
"loss": 0.1127, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6449934980494149, |
|
"grad_norm": 0.1242934837937355, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 0.1286, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.6657997399219766, |
|
"grad_norm": 0.0908965989947319, |
|
"learning_rate": 0.00014065934065934067, |
|
"loss": 0.1245, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.6866059817945384, |
|
"grad_norm": 0.1325254738330841, |
|
"learning_rate": 0.00013846153846153847, |
|
"loss": 0.1233, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.7074122236671001, |
|
"grad_norm": 0.11704779416322708, |
|
"learning_rate": 0.00013626373626373628, |
|
"loss": 0.1172, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.7282184655396619, |
|
"grad_norm": 0.10394897311925888, |
|
"learning_rate": 0.00013406593406593405, |
|
"loss": 0.1337, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7490247074122237, |
|
"grad_norm": 0.14531978964805603, |
|
"learning_rate": 0.00013186813186813188, |
|
"loss": 0.1344, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.7698309492847855, |
|
"grad_norm": 0.10536228120326996, |
|
"learning_rate": 0.0001296703296703297, |
|
"loss": 0.1153, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.7906371911573472, |
|
"grad_norm": 0.46365121006965637, |
|
"learning_rate": 0.00012747252747252746, |
|
"loss": 0.1248, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.811443433029909, |
|
"grad_norm": 0.148798406124115, |
|
"learning_rate": 0.00012527472527472527, |
|
"loss": 0.1255, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.8322496749024707, |
|
"grad_norm": 0.10342266410589218, |
|
"learning_rate": 0.0001230769230769231, |
|
"loss": 0.1172, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8530559167750326, |
|
"grad_norm": 0.09427472203969955, |
|
"learning_rate": 0.00012087912087912087, |
|
"loss": 0.1076, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.8738621586475943, |
|
"grad_norm": 0.1485118865966797, |
|
"learning_rate": 0.00011868131868131869, |
|
"loss": 0.1218, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.8946684005201561, |
|
"grad_norm": 0.18458151817321777, |
|
"learning_rate": 0.0001164835164835165, |
|
"loss": 0.1287, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.9154746423927178, |
|
"grad_norm": 0.1004214808344841, |
|
"learning_rate": 0.00011428571428571428, |
|
"loss": 0.1265, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.9362808842652796, |
|
"grad_norm": 0.10855648666620255, |
|
"learning_rate": 0.0001120879120879121, |
|
"loss": 0.103, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.9570871261378413, |
|
"grad_norm": 0.15261195600032806, |
|
"learning_rate": 0.0001098901098901099, |
|
"loss": 0.1251, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.9778933680104032, |
|
"grad_norm": 0.12982791662216187, |
|
"learning_rate": 0.0001076923076923077, |
|
"loss": 0.1162, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.9986996098829649, |
|
"grad_norm": 0.09743143618106842, |
|
"learning_rate": 0.0001054945054945055, |
|
"loss": 0.1126, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.0195058517555267, |
|
"grad_norm": 0.0987514778971672, |
|
"learning_rate": 0.00010329670329670331, |
|
"loss": 0.1096, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.0403120936280885, |
|
"grad_norm": 0.11746949702501297, |
|
"learning_rate": 0.0001010989010989011, |
|
"loss": 0.1064, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0611183355006502, |
|
"grad_norm": 0.12490434944629669, |
|
"learning_rate": 9.89010989010989e-05, |
|
"loss": 0.0924, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.081924577373212, |
|
"grad_norm": 0.13480301201343536, |
|
"learning_rate": 9.670329670329671e-05, |
|
"loss": 0.1042, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.1027308192457737, |
|
"grad_norm": 0.10744661092758179, |
|
"learning_rate": 9.450549450549451e-05, |
|
"loss": 0.0883, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.1235370611183355, |
|
"grad_norm": 0.1190573051571846, |
|
"learning_rate": 9.230769230769232e-05, |
|
"loss": 0.0989, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.1443433029908974, |
|
"grad_norm": 0.10405765473842621, |
|
"learning_rate": 9.010989010989012e-05, |
|
"loss": 0.0851, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.165149544863459, |
|
"grad_norm": 0.11977458000183105, |
|
"learning_rate": 8.791208791208791e-05, |
|
"loss": 0.0891, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.1859557867360209, |
|
"grad_norm": 0.12529583275318146, |
|
"learning_rate": 8.571428571428571e-05, |
|
"loss": 0.0961, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.2067620286085825, |
|
"grad_norm": 0.11154180020093918, |
|
"learning_rate": 8.351648351648353e-05, |
|
"loss": 0.0854, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.2275682704811444, |
|
"grad_norm": 0.14236748218536377, |
|
"learning_rate": 8.131868131868132e-05, |
|
"loss": 0.102, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.2483745123537062, |
|
"grad_norm": 0.10751450061798096, |
|
"learning_rate": 7.912087912087912e-05, |
|
"loss": 0.099, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.2691807542262679, |
|
"grad_norm": 0.1208043098449707, |
|
"learning_rate": 7.692307692307693e-05, |
|
"loss": 0.0929, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.2899869960988297, |
|
"grad_norm": 0.11843656748533249, |
|
"learning_rate": 7.472527472527473e-05, |
|
"loss": 0.0975, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.3107932379713914, |
|
"grad_norm": 0.13307110965251923, |
|
"learning_rate": 7.252747252747253e-05, |
|
"loss": 0.0929, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.3315994798439532, |
|
"grad_norm": 0.14217737317085266, |
|
"learning_rate": 7.032967032967034e-05, |
|
"loss": 0.0897, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.352405721716515, |
|
"grad_norm": 0.2811494469642639, |
|
"learning_rate": 6.813186813186814e-05, |
|
"loss": 0.0868, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.3732119635890767, |
|
"grad_norm": 0.13263148069381714, |
|
"learning_rate": 6.593406593406594e-05, |
|
"loss": 0.0995, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.3940182054616386, |
|
"grad_norm": 0.13853707909584045, |
|
"learning_rate": 6.373626373626373e-05, |
|
"loss": 0.0925, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.4148244473342002, |
|
"grad_norm": 0.18387623131275177, |
|
"learning_rate": 6.153846153846155e-05, |
|
"loss": 0.095, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.435630689206762, |
|
"grad_norm": 0.13655337691307068, |
|
"learning_rate": 5.9340659340659345e-05, |
|
"loss": 0.0962, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.456436931079324, |
|
"grad_norm": 0.12651769816875458, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.0893, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.4772431729518856, |
|
"grad_norm": 0.12896397709846497, |
|
"learning_rate": 5.494505494505495e-05, |
|
"loss": 0.0941, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.4980494148244472, |
|
"grad_norm": 0.12387780100107193, |
|
"learning_rate": 5.274725274725275e-05, |
|
"loss": 0.0914, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.518855656697009, |
|
"grad_norm": 0.13148820400238037, |
|
"learning_rate": 5.054945054945055e-05, |
|
"loss": 0.0879, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.539661898569571, |
|
"grad_norm": 0.14551299810409546, |
|
"learning_rate": 4.8351648351648355e-05, |
|
"loss": 0.0892, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.5604681404421328, |
|
"grad_norm": 0.14105795323848724, |
|
"learning_rate": 4.615384615384616e-05, |
|
"loss": 0.0957, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.5812743823146944, |
|
"grad_norm": 0.16130925714969635, |
|
"learning_rate": 4.3956043956043955e-05, |
|
"loss": 0.0822, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.602080624187256, |
|
"grad_norm": 0.13052895665168762, |
|
"learning_rate": 4.1758241758241765e-05, |
|
"loss": 0.083, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.622886866059818, |
|
"grad_norm": 0.14026325941085815, |
|
"learning_rate": 3.956043956043956e-05, |
|
"loss": 0.0854, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.6436931079323798, |
|
"grad_norm": 0.12488283216953278, |
|
"learning_rate": 3.7362637362637365e-05, |
|
"loss": 0.085, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.6644993498049416, |
|
"grad_norm": 0.13594555854797363, |
|
"learning_rate": 3.516483516483517e-05, |
|
"loss": 0.0889, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.6853055916775033, |
|
"grad_norm": 0.1322001814842224, |
|
"learning_rate": 3.296703296703297e-05, |
|
"loss": 0.0863, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.706111833550065, |
|
"grad_norm": 0.1550651490688324, |
|
"learning_rate": 3.0769230769230774e-05, |
|
"loss": 0.0888, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.7269180754226268, |
|
"grad_norm": 0.13711196184158325, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 0.0883, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.7477243172951886, |
|
"grad_norm": 0.14158359169960022, |
|
"learning_rate": 2.6373626373626374e-05, |
|
"loss": 0.0872, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.7685305591677505, |
|
"grad_norm": 0.14009161293506622, |
|
"learning_rate": 2.4175824175824177e-05, |
|
"loss": 0.0876, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.7893368010403121, |
|
"grad_norm": 0.1278422623872757, |
|
"learning_rate": 2.1978021978021977e-05, |
|
"loss": 0.0948, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.8101430429128738, |
|
"grad_norm": 0.16997836530208588, |
|
"learning_rate": 1.978021978021978e-05, |
|
"loss": 0.0881, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.8309492847854356, |
|
"grad_norm": 0.17075221240520477, |
|
"learning_rate": 1.7582417582417584e-05, |
|
"loss": 0.0838, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.8517555266579975, |
|
"grad_norm": 0.13163046538829803, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.0868, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.8725617685305593, |
|
"grad_norm": 0.12678372859954834, |
|
"learning_rate": 1.3186813186813187e-05, |
|
"loss": 0.0894, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.893368010403121, |
|
"grad_norm": 0.15091709792613983, |
|
"learning_rate": 1.0989010989010989e-05, |
|
"loss": 0.092, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.9141742522756826, |
|
"grad_norm": 0.13614249229431152, |
|
"learning_rate": 8.791208791208792e-06, |
|
"loss": 0.0829, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.9349804941482445, |
|
"grad_norm": 0.13656853139400482, |
|
"learning_rate": 6.5934065934065935e-06, |
|
"loss": 0.0892, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.9557867360208063, |
|
"grad_norm": 0.140973761677742, |
|
"learning_rate": 4.395604395604396e-06, |
|
"loss": 0.0848, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.976592977893368, |
|
"grad_norm": 0.1271371990442276, |
|
"learning_rate": 2.197802197802198e-06, |
|
"loss": 0.0757, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.9973992197659298, |
|
"grad_norm": 0.17182095348834991, |
|
"learning_rate": 0.0, |
|
"loss": 0.0872, |
|
"step": 384 |
|
} |
|
], |
|
"logging_steps": 4, |
|
"max_steps": 384, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.091116577734861e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|