|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 7.998743529175252, |
|
"eval_steps": 500, |
|
"global_step": 4968, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016082826556767352, |
|
"grad_norm": 93.80207061767578, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.8299, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.032165653113534705, |
|
"grad_norm": 50.166954040527344, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 0.7131, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.048248479670302054, |
|
"grad_norm": 37.23706817626953, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.5976, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06433130622706941, |
|
"grad_norm": 37.21980285644531, |
|
"learning_rate": 1.0666666666666667e-05, |
|
"loss": 0.5263, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08041413278383676, |
|
"grad_norm": 29.091915130615234, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.4731, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09649695934060411, |
|
"grad_norm": 32.472801208496094, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.4357, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11257978589737146, |
|
"grad_norm": 29.79865264892578, |
|
"learning_rate": 1.866666666666667e-05, |
|
"loss": 0.3916, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12866261245413882, |
|
"grad_norm": 28.13816261291504, |
|
"learning_rate": 2.1333333333333335e-05, |
|
"loss": 0.3721, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14474543901090617, |
|
"grad_norm": 30.40574073791504, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.3382, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16082826556767352, |
|
"grad_norm": 30.368940353393555, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 0.3207, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.17691109212444087, |
|
"grad_norm": 31.629531860351562, |
|
"learning_rate": 2.9333333333333333e-05, |
|
"loss": 0.305, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.19299391868120822, |
|
"grad_norm": 29.47364044189453, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.2812, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.20907674523797556, |
|
"grad_norm": 30.890962600708008, |
|
"learning_rate": 3.466666666666667e-05, |
|
"loss": 0.2665, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2251595717947429, |
|
"grad_norm": 31.893320083618164, |
|
"learning_rate": 3.733333333333334e-05, |
|
"loss": 0.2505, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2412423983515103, |
|
"grad_norm": 29.82271957397461, |
|
"learning_rate": 4e-05, |
|
"loss": 0.2404, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.25732522490827764, |
|
"grad_norm": 31.970462799072266, |
|
"learning_rate": 3.9999574828039864e-05, |
|
"loss": 0.2188, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.273408051465045, |
|
"grad_norm": 25.94739532470703, |
|
"learning_rate": 3.999829933023657e-05, |
|
"loss": 0.2156, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.28949087802181234, |
|
"grad_norm": 32.104461669921875, |
|
"learning_rate": 3.9996173560820705e-05, |
|
"loss": 0.2064, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.018596079745351, |
|
"grad_norm": 29.600008010864258, |
|
"learning_rate": 3.999319761017403e-05, |
|
"loss": 0.2122, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0722063997319484, |
|
"grad_norm": 28.94344139099121, |
|
"learning_rate": 3.998937160482562e-05, |
|
"loss": 0.1835, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1258167197185458, |
|
"grad_norm": 27.56523323059082, |
|
"learning_rate": 3.998469570744648e-05, |
|
"loss": 0.1815, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.1794270397051432, |
|
"grad_norm": 28.684629440307617, |
|
"learning_rate": 3.997917011684268e-05, |
|
"loss": 0.1717, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.2330373596917408, |
|
"grad_norm": 27.716796875, |
|
"learning_rate": 3.9972795067946826e-05, |
|
"loss": 0.1615, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.2866476796783382, |
|
"grad_norm": 25.33133316040039, |
|
"learning_rate": 3.996557083180813e-05, |
|
"loss": 0.1585, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.3402579996649355, |
|
"grad_norm": 28.910871505737305, |
|
"learning_rate": 3.9957497715580844e-05, |
|
"loss": 0.1488, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.393868319651533, |
|
"grad_norm": 26.948163986206055, |
|
"learning_rate": 3.994857606251124e-05, |
|
"loss": 0.1448, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.4474786396381303, |
|
"grad_norm": 26.12610626220703, |
|
"learning_rate": 3.993880625192298e-05, |
|
"loss": 0.1411, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.5010889596247279, |
|
"grad_norm": 26.017358779907227, |
|
"learning_rate": 3.9928188699201035e-05, |
|
"loss": 0.1375, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.554699279611325, |
|
"grad_norm": 26.367809295654297, |
|
"learning_rate": 3.991672385577396e-05, |
|
"loss": 0.1272, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.6083095995979226, |
|
"grad_norm": 24.2690372467041, |
|
"learning_rate": 3.9904412209094755e-05, |
|
"loss": 0.1288, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.66191991958452, |
|
"grad_norm": 29.307832717895508, |
|
"learning_rate": 3.9891254282620115e-05, |
|
"loss": 0.1321, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.7155302395711174, |
|
"grad_norm": 27.43846321105957, |
|
"learning_rate": 3.9877250635788184e-05, |
|
"loss": 0.1161, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.769140559557715, |
|
"grad_norm": 23.861331939697266, |
|
"learning_rate": 3.9862401863994744e-05, |
|
"loss": 0.1169, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.8227508795443121, |
|
"grad_norm": 23.86732292175293, |
|
"learning_rate": 3.9846708598567956e-05, |
|
"loss": 0.1123, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.8763611995309097, |
|
"grad_norm": 28.334510803222656, |
|
"learning_rate": 3.983017150674145e-05, |
|
"loss": 0.1042, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.9299715195175071, |
|
"grad_norm": 26.570316314697266, |
|
"learning_rate": 3.9812791291626e-05, |
|
"loss": 0.1069, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.9835818395041045, |
|
"grad_norm": 23.24406623840332, |
|
"learning_rate": 3.979456869217962e-05, |
|
"loss": 0.1074, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.0316112458969604, |
|
"grad_norm": 23.662790298461914, |
|
"learning_rate": 3.977550448317615e-05, |
|
"loss": 0.1278, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.077279862994149, |
|
"grad_norm": 20.232288360595703, |
|
"learning_rate": 3.97555994751723e-05, |
|
"loss": 0.1282, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.1229484800913374, |
|
"grad_norm": 24.64867401123047, |
|
"learning_rate": 3.973485451447318e-05, |
|
"loss": 0.1146, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.168617097188526, |
|
"grad_norm": 25.983293533325195, |
|
"learning_rate": 3.9713270483096374e-05, |
|
"loss": 0.1123, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.2142857142857144, |
|
"grad_norm": 23.095712661743164, |
|
"learning_rate": 3.969084829873436e-05, |
|
"loss": 0.103, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.259954331382903, |
|
"grad_norm": 23.112424850463867, |
|
"learning_rate": 3.966758891471555e-05, |
|
"loss": 0.1065, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.3056229484800914, |
|
"grad_norm": 23.07062339782715, |
|
"learning_rate": 3.964349331996373e-05, |
|
"loss": 0.1021, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.35129156557728, |
|
"grad_norm": 25.471771240234375, |
|
"learning_rate": 3.961856253895603e-05, |
|
"loss": 0.1064, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.3969601826744684, |
|
"grad_norm": 24.417654037475586, |
|
"learning_rate": 3.959279763167935e-05, |
|
"loss": 0.0956, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.442628799771657, |
|
"grad_norm": 24.067520141601562, |
|
"learning_rate": 3.9566199693585304e-05, |
|
"loss": 0.1113, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.4882974168688454, |
|
"grad_norm": 23.70163345336914, |
|
"learning_rate": 3.953876985554364e-05, |
|
"loss": 0.0911, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.533966033966034, |
|
"grad_norm": 23.784481048583984, |
|
"learning_rate": 3.951050928379415e-05, |
|
"loss": 0.0888, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.5796346510632224, |
|
"grad_norm": 23.621828079223633, |
|
"learning_rate": 3.948141917989712e-05, |
|
"loss": 0.0904, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.625303268160411, |
|
"grad_norm": 19.741653442382812, |
|
"learning_rate": 3.945150078068219e-05, |
|
"loss": 0.0879, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.6709718852575994, |
|
"grad_norm": 22.743593215942383, |
|
"learning_rate": 3.9420755358195804e-05, |
|
"loss": 0.0851, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.716640502354788, |
|
"grad_norm": 18.910329818725586, |
|
"learning_rate": 3.938918421964711e-05, |
|
"loss": 0.0801, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.7623091194519764, |
|
"grad_norm": 22.360628128051758, |
|
"learning_rate": 3.9356788707352406e-05, |
|
"loss": 0.078, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.807977736549165, |
|
"grad_norm": 24.22591209411621, |
|
"learning_rate": 3.932357019867803e-05, |
|
"loss": 0.0822, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.8536463536463534, |
|
"grad_norm": 24.46196746826172, |
|
"learning_rate": 3.928953010598183e-05, |
|
"loss": 0.0695, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.899314970743542, |
|
"grad_norm": 24.530746459960938, |
|
"learning_rate": 3.925466987655309e-05, |
|
"loss": 0.082, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.9449835878407304, |
|
"grad_norm": 23.36806297302246, |
|
"learning_rate": 3.921899099255104e-05, |
|
"loss": 0.0751, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.9906522049379194, |
|
"grad_norm": 19.65323257446289, |
|
"learning_rate": 3.918249497094176e-05, |
|
"loss": 0.07, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.02390615940413, |
|
"grad_norm": 17.223087310791016, |
|
"learning_rate": 3.9145183363433777e-05, |
|
"loss": 0.0662, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.05245082734936, |
|
"grad_norm": 19.922897338867188, |
|
"learning_rate": 3.9107057756411995e-05, |
|
"loss": 0.0695, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.08099549529459, |
|
"grad_norm": 22.456689834594727, |
|
"learning_rate": 3.906811977087035e-05, |
|
"loss": 0.0574, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.10954016323982, |
|
"grad_norm": 18.3155460357666, |
|
"learning_rate": 3.902837106234278e-05, |
|
"loss": 0.0638, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.1380848311850498, |
|
"grad_norm": 19.499990463256836, |
|
"learning_rate": 3.8987813320832935e-05, |
|
"loss": 0.0663, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.1666294991302797, |
|
"grad_norm": 18.689781188964844, |
|
"learning_rate": 3.894644827074225e-05, |
|
"loss": 0.0583, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.1951741670755096, |
|
"grad_norm": 19.73504066467285, |
|
"learning_rate": 3.890427767079667e-05, |
|
"loss": 0.062, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.2237188350207395, |
|
"grad_norm": 19.44004249572754, |
|
"learning_rate": 3.886130331397186e-05, |
|
"loss": 0.0577, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.2522635029659694, |
|
"grad_norm": 19.139127731323242, |
|
"learning_rate": 3.881752702741697e-05, |
|
"loss": 0.0618, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.2808081709111994, |
|
"grad_norm": 21.88005828857422, |
|
"learning_rate": 3.877295067237697e-05, |
|
"loss": 0.059, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.3093528388564293, |
|
"grad_norm": 24.21089744567871, |
|
"learning_rate": 3.872757614411346e-05, |
|
"loss": 0.0593, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.337897506801659, |
|
"grad_norm": 20.264284133911133, |
|
"learning_rate": 3.868140537182417e-05, |
|
"loss": 0.054, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.366442174746889, |
|
"grad_norm": 21.731857299804688, |
|
"learning_rate": 3.863444031856088e-05, |
|
"loss": 0.062, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.394986842692119, |
|
"grad_norm": 21.47838592529297, |
|
"learning_rate": 3.8586682981145956e-05, |
|
"loss": 0.0552, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.423531510637349, |
|
"grad_norm": 18.726280212402344, |
|
"learning_rate": 3.853813539008746e-05, |
|
"loss": 0.0532, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.452076178582579, |
|
"grad_norm": 19.791046142578125, |
|
"learning_rate": 3.848879960949287e-05, |
|
"loss": 0.0558, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.480620846527809, |
|
"grad_norm": 18.885759353637695, |
|
"learning_rate": 3.8438677736981215e-05, |
|
"loss": 0.0553, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.5091655144730387, |
|
"grad_norm": 16.527170181274414, |
|
"learning_rate": 3.838777190359397e-05, |
|
"loss": 0.0476, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.5377101824182686, |
|
"grad_norm": 16.75018310546875, |
|
"learning_rate": 3.8336084273704457e-05, |
|
"loss": 0.0532, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.5662548503634985, |
|
"grad_norm": 18.81423568725586, |
|
"learning_rate": 3.828361704492575e-05, |
|
"loss": 0.0499, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.5947995183087285, |
|
"grad_norm": 19.174463272094727, |
|
"learning_rate": 3.823037244801729e-05, |
|
"loss": 0.0494, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.6233441862539584, |
|
"grad_norm": 17.0285701751709, |
|
"learning_rate": 3.817635274679006e-05, |
|
"loss": 0.0461, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.6518888541991883, |
|
"grad_norm": 17.395580291748047, |
|
"learning_rate": 3.812156023801028e-05, |
|
"loss": 0.0496, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.680433522144418, |
|
"grad_norm": 18.277786254882812, |
|
"learning_rate": 3.8065997251301776e-05, |
|
"loss": 0.0477, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.708978190089648, |
|
"grad_norm": 17.72475242614746, |
|
"learning_rate": 3.8009666149046957e-05, |
|
"loss": 0.0457, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.737522858034878, |
|
"grad_norm": 20.809040069580078, |
|
"learning_rate": 3.7952569326286336e-05, |
|
"loss": 0.0471, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.766067525980108, |
|
"grad_norm": 17.868568420410156, |
|
"learning_rate": 3.7894709210616714e-05, |
|
"loss": 0.0456, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.794612193925338, |
|
"grad_norm": 15.575334548950195, |
|
"learning_rate": 3.7836088262087975e-05, |
|
"loss": 0.044, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.823156861870568, |
|
"grad_norm": 17.568668365478516, |
|
"learning_rate": 3.7776708973098476e-05, |
|
"loss": 0.0446, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.8517015298157977, |
|
"grad_norm": 17.17595672607422, |
|
"learning_rate": 3.771657386828908e-05, |
|
"loss": 0.0496, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.8802461977610276, |
|
"grad_norm": 24.375370025634766, |
|
"learning_rate": 3.765568550443583e-05, |
|
"loss": 0.0424, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.9087908657062576, |
|
"grad_norm": 16.25655174255371, |
|
"learning_rate": 3.7594046470341246e-05, |
|
"loss": 0.046, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.9373355336514875, |
|
"grad_norm": 18.85159683227539, |
|
"learning_rate": 3.7531659386724195e-05, |
|
"loss": 0.0435, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.9658802015967174, |
|
"grad_norm": 19.97796058654785, |
|
"learning_rate": 3.746852690610855e-05, |
|
"loss": 0.0431, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.9944248695419473, |
|
"grad_norm": 15.388335227966309, |
|
"learning_rate": 3.7404651712710365e-05, |
|
"loss": 0.0389, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.019183642211671, |
|
"grad_norm": 20.02805519104004, |
|
"learning_rate": 3.734003652232376e-05, |
|
"loss": 0.039, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.043689692142748, |
|
"grad_norm": 15.568504333496094, |
|
"learning_rate": 3.727468408220544e-05, |
|
"loss": 0.0375, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 4.068195742073825, |
|
"grad_norm": 15.18822956085205, |
|
"learning_rate": 3.720859717095792e-05, |
|
"loss": 0.0365, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.092701792004902, |
|
"grad_norm": 14.499895095825195, |
|
"learning_rate": 3.714177859841136e-05, |
|
"loss": 0.038, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 4.117207841935978, |
|
"grad_norm": 18.488901138305664, |
|
"learning_rate": 3.707423120550411e-05, |
|
"loss": 0.0406, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.141713891867055, |
|
"grad_norm": 16.12656593322754, |
|
"learning_rate": 3.7005957864161905e-05, |
|
"loss": 0.0354, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.166219941798132, |
|
"grad_norm": 18.07503318786621, |
|
"learning_rate": 3.693696147717579e-05, |
|
"loss": 0.0373, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 4.190725991729209, |
|
"grad_norm": 17.39132308959961, |
|
"learning_rate": 3.686724497807867e-05, |
|
"loss": 0.0345, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 4.215232041660285, |
|
"grad_norm": 15.007177352905273, |
|
"learning_rate": 3.67968113310206e-05, |
|
"loss": 0.0325, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 4.239738091591361, |
|
"grad_norm": 15.444381713867188, |
|
"learning_rate": 3.6725663530642755e-05, |
|
"loss": 0.0327, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 4.264244141522438, |
|
"grad_norm": 14.16204833984375, |
|
"learning_rate": 3.6653804601950126e-05, |
|
"loss": 0.0338, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 4.288750191453515, |
|
"grad_norm": 16.405170440673828, |
|
"learning_rate": 3.6581237600182856e-05, |
|
"loss": 0.0342, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 4.313256241384591, |
|
"grad_norm": 19.641298294067383, |
|
"learning_rate": 3.650796561068639e-05, |
|
"loss": 0.0394, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 4.337762291315668, |
|
"grad_norm": 14.00063705444336, |
|
"learning_rate": 3.6433991748780255e-05, |
|
"loss": 0.0336, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.362268341246745, |
|
"grad_norm": 13.914216995239258, |
|
"learning_rate": 3.635931915962565e-05, |
|
"loss": 0.0326, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 4.386774391177822, |
|
"grad_norm": 15.238022804260254, |
|
"learning_rate": 3.628395101809169e-05, |
|
"loss": 0.0312, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.411280441108898, |
|
"grad_norm": 15.279886245727539, |
|
"learning_rate": 3.62078905286204e-05, |
|
"loss": 0.0313, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 4.435786491039975, |
|
"grad_norm": 15.173819541931152, |
|
"learning_rate": 3.613114092509054e-05, |
|
"loss": 0.0315, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 4.460292540971052, |
|
"grad_norm": 15.986420631408691, |
|
"learning_rate": 3.6053705470680044e-05, |
|
"loss": 0.0333, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 4.484798590902129, |
|
"grad_norm": 18.724811553955078, |
|
"learning_rate": 3.59755874577273e-05, |
|
"loss": 0.0322, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 4.509304640833205, |
|
"grad_norm": 14.428422927856445, |
|
"learning_rate": 3.589679020759118e-05, |
|
"loss": 0.0278, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 4.533810690764282, |
|
"grad_norm": 14.249613761901855, |
|
"learning_rate": 3.5817317070509814e-05, |
|
"loss": 0.0323, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 4.558316740695359, |
|
"grad_norm": 13.707551002502441, |
|
"learning_rate": 3.573717142545814e-05, |
|
"loss": 0.0299, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 4.582822790626436, |
|
"grad_norm": 18.068727493286133, |
|
"learning_rate": 3.565635668000427e-05, |
|
"loss": 0.0319, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 4.607328840557512, |
|
"grad_norm": 15.44510269165039, |
|
"learning_rate": 3.557487627016458e-05, |
|
"loss": 0.0308, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 4.631834890488589, |
|
"grad_norm": 15.211899757385254, |
|
"learning_rate": 3.5492733660257605e-05, |
|
"loss": 0.029, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.656340940419666, |
|
"grad_norm": 18.195812225341797, |
|
"learning_rate": 3.5409932342756824e-05, |
|
"loss": 0.029, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 4.680846990350743, |
|
"grad_norm": 15.29293155670166, |
|
"learning_rate": 3.532647583814205e-05, |
|
"loss": 0.0275, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 4.705353040281819, |
|
"grad_norm": 13.911247253417969, |
|
"learning_rate": 3.524236769474987e-05, |
|
"loss": 0.0259, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 4.729859090212896, |
|
"grad_norm": 15.558411598205566, |
|
"learning_rate": 3.51576114886227e-05, |
|
"loss": 0.0287, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 4.754365140143973, |
|
"grad_norm": 16.093111038208008, |
|
"learning_rate": 3.507221082335676e-05, |
|
"loss": 0.0293, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 4.77887119007505, |
|
"grad_norm": 13.53354549407959, |
|
"learning_rate": 3.498616932994888e-05, |
|
"loss": 0.0278, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 4.803377240006126, |
|
"grad_norm": 22.743614196777344, |
|
"learning_rate": 3.489949066664211e-05, |
|
"loss": 0.034, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 4.827883289937203, |
|
"grad_norm": 14.596455574035645, |
|
"learning_rate": 3.481217851877015e-05, |
|
"loss": 0.0292, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 4.85238933986828, |
|
"grad_norm": 17.450109481811523, |
|
"learning_rate": 3.4724236598600725e-05, |
|
"loss": 0.0301, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 4.8768953897993566, |
|
"grad_norm": 15.233014106750488, |
|
"learning_rate": 3.4635668645177674e-05, |
|
"loss": 0.0292, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.901401439730433, |
|
"grad_norm": 15.098063468933105, |
|
"learning_rate": 3.454647842416204e-05, |
|
"loss": 0.0276, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 4.92590748966151, |
|
"grad_norm": 16.780668258666992, |
|
"learning_rate": 3.4456669727671944e-05, |
|
"loss": 0.027, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.950413539592587, |
|
"grad_norm": 16.340227127075195, |
|
"learning_rate": 3.436624637412132e-05, |
|
"loss": 0.0309, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 4.9749195895236635, |
|
"grad_norm": 12.311773300170898, |
|
"learning_rate": 3.427521220805763e-05, |
|
"loss": 0.0257, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 4.99942563945474, |
|
"grad_norm": 15.815475463867188, |
|
"learning_rate": 3.4183571099998355e-05, |
|
"loss": 0.0261, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 5.02021921165498, |
|
"grad_norm": 12.780634880065918, |
|
"learning_rate": 3.409132694626643e-05, |
|
"loss": 0.0281, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 5.042004220845531, |
|
"grad_norm": 14.720085144042969, |
|
"learning_rate": 3.3998483668824645e-05, |
|
"loss": 0.0236, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 5.063789230036082, |
|
"grad_norm": 16.020496368408203, |
|
"learning_rate": 3.390504521510882e-05, |
|
"loss": 0.0241, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 5.0855742392266325, |
|
"grad_norm": 13.678121566772461, |
|
"learning_rate": 3.381101555785999e-05, |
|
"loss": 0.0232, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 5.107359248417183, |
|
"grad_norm": 13.695241928100586, |
|
"learning_rate": 3.371639869495554e-05, |
|
"loss": 0.0237, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 5.1291442576077335, |
|
"grad_norm": 11.553495407104492, |
|
"learning_rate": 3.362119864923918e-05, |
|
"loss": 0.0237, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 5.1509292667982844, |
|
"grad_norm": 12.397970199584961, |
|
"learning_rate": 3.35254194683499e-05, |
|
"loss": 0.0236, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 5.172714275988835, |
|
"grad_norm": 14.61323356628418, |
|
"learning_rate": 3.342906522454992e-05, |
|
"loss": 0.0239, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 5.194499285179386, |
|
"grad_norm": 11.521512031555176, |
|
"learning_rate": 3.333214001455149e-05, |
|
"loss": 0.0191, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 5.216284294369936, |
|
"grad_norm": 15.835281372070312, |
|
"learning_rate": 3.323464795934279e-05, |
|
"loss": 0.0253, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 5.238069303560487, |
|
"grad_norm": 14.1622953414917, |
|
"learning_rate": 3.313659320401263e-05, |
|
"loss": 0.0243, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 5.259854312751038, |
|
"grad_norm": 12.993020057678223, |
|
"learning_rate": 3.303797991757425e-05, |
|
"loss": 0.0211, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 5.281639321941589, |
|
"grad_norm": 13.310782432556152, |
|
"learning_rate": 3.29388122927881e-05, |
|
"loss": 0.0278, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 5.30342433113214, |
|
"grad_norm": 17.85926628112793, |
|
"learning_rate": 3.2839094545983505e-05, |
|
"loss": 0.0212, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 5.32520934032269, |
|
"grad_norm": 12.155655860900879, |
|
"learning_rate": 3.273883091687946e-05, |
|
"loss": 0.0224, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 5.346994349513241, |
|
"grad_norm": 10.895421981811523, |
|
"learning_rate": 3.2638025668404334e-05, |
|
"loss": 0.0241, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 5.368779358703792, |
|
"grad_norm": 12.233269691467285, |
|
"learning_rate": 3.2536683086514634e-05, |
|
"loss": 0.0206, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 5.390564367894343, |
|
"grad_norm": 12.179084777832031, |
|
"learning_rate": 3.243480748001278e-05, |
|
"loss": 0.0241, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 5.412349377084894, |
|
"grad_norm": 13.7705078125, |
|
"learning_rate": 3.2332403180363906e-05, |
|
"loss": 0.0253, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 5.434134386275444, |
|
"grad_norm": 10.06460952758789, |
|
"learning_rate": 3.222947454151169e-05, |
|
"loss": 0.0249, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 5.455919395465995, |
|
"grad_norm": 16.193252563476562, |
|
"learning_rate": 3.212602593969325e-05, |
|
"loss": 0.0245, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 5.477704404656546, |
|
"grad_norm": 11.988511085510254, |
|
"learning_rate": 3.202206177325306e-05, |
|
"loss": 0.0238, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 5.499489413847097, |
|
"grad_norm": 11.607162475585938, |
|
"learning_rate": 3.191758646245596e-05, |
|
"loss": 0.0226, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 5.521274423037648, |
|
"grad_norm": 12.626535415649414, |
|
"learning_rate": 3.181260444929923e-05, |
|
"loss": 0.0204, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 5.543059432228198, |
|
"grad_norm": 12.591373443603516, |
|
"learning_rate": 3.1707120197323686e-05, |
|
"loss": 0.0207, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.564844441418749, |
|
"grad_norm": 12.233884811401367, |
|
"learning_rate": 3.1601138191423966e-05, |
|
"loss": 0.0223, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 5.5866294506092995, |
|
"grad_norm": 13.553182601928711, |
|
"learning_rate": 3.149466293765778e-05, |
|
"loss": 0.021, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 5.60841445979985, |
|
"grad_norm": 13.30004596710205, |
|
"learning_rate": 3.138769896305434e-05, |
|
"loss": 0.0188, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 5.630199468990401, |
|
"grad_norm": 13.62940502166748, |
|
"learning_rate": 3.128025081542196e-05, |
|
"loss": 0.0176, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 5.651984478180951, |
|
"grad_norm": 12.465331077575684, |
|
"learning_rate": 3.117232306315456e-05, |
|
"loss": 0.0195, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 5.673769487371502, |
|
"grad_norm": 12.430222511291504, |
|
"learning_rate": 3.106392029503757e-05, |
|
"loss": 0.0216, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 5.695554496562053, |
|
"grad_norm": 12.926973342895508, |
|
"learning_rate": 3.09550471200527e-05, |
|
"loss": 0.0192, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 5.717339505752604, |
|
"grad_norm": 13.914267539978027, |
|
"learning_rate": 3.08457081671821e-05, |
|
"loss": 0.021, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 5.739124514943155, |
|
"grad_norm": 13.50471019744873, |
|
"learning_rate": 3.073590808521144e-05, |
|
"loss": 0.0218, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 5.760909524133705, |
|
"grad_norm": 10.271846771240234, |
|
"learning_rate": 3.062565154253233e-05, |
|
"loss": 0.0202, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.782694533324256, |
|
"grad_norm": 14.907472610473633, |
|
"learning_rate": 3.0514943226943816e-05, |
|
"loss": 0.0236, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 5.804479542514807, |
|
"grad_norm": 13.36329174041748, |
|
"learning_rate": 3.040378784545304e-05, |
|
"loss": 0.021, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 5.826264551705358, |
|
"grad_norm": 11.128992080688477, |
|
"learning_rate": 3.0292190124075162e-05, |
|
"loss": 0.0176, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 5.848049560895909, |
|
"grad_norm": 11.310523986816406, |
|
"learning_rate": 3.018015480763236e-05, |
|
"loss": 0.0207, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 5.869834570086459, |
|
"grad_norm": 11.527318000793457, |
|
"learning_rate": 3.006768665955215e-05, |
|
"loss": 0.0187, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 5.89161957927701, |
|
"grad_norm": 11.697135925292969, |
|
"learning_rate": 2.9954790461664834e-05, |
|
"loss": 0.0202, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 5.913404588467561, |
|
"grad_norm": 10.337966918945312, |
|
"learning_rate": 2.984147101400018e-05, |
|
"loss": 0.0168, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 5.935189597658112, |
|
"grad_norm": 10.729581832885742, |
|
"learning_rate": 2.9727733134583358e-05, |
|
"loss": 0.021, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 5.956974606848663, |
|
"grad_norm": 11.193035125732422, |
|
"learning_rate": 2.961358165923008e-05, |
|
"loss": 0.0203, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 5.978759616039213, |
|
"grad_norm": 11.645442962646484, |
|
"learning_rate": 2.9499021441341012e-05, |
|
"loss": 0.0182, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 6.000544625229764, |
|
"grad_norm": 11.237954139709473, |
|
"learning_rate": 2.938405735169537e-05, |
|
"loss": 0.0184, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 6.019290314590042, |
|
"grad_norm": 10.546935081481934, |
|
"learning_rate": 2.9268694278243903e-05, |
|
"loss": 0.0179, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 6.039107010156057, |
|
"grad_norm": 9.956415176391602, |
|
"learning_rate": 2.915293712590102e-05, |
|
"loss": 0.0196, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 6.058923705722071, |
|
"grad_norm": 9.116511344909668, |
|
"learning_rate": 2.9036790816336252e-05, |
|
"loss": 0.0199, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 6.078740401288085, |
|
"grad_norm": 16.642379760742188, |
|
"learning_rate": 2.892026028776501e-05, |
|
"loss": 0.0173, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 6.098557096854099, |
|
"grad_norm": 11.179176330566406, |
|
"learning_rate": 2.8803350494738615e-05, |
|
"loss": 0.019, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 6.118373792420114, |
|
"grad_norm": 13.457623481750488, |
|
"learning_rate": 2.8686066407933656e-05, |
|
"loss": 0.0164, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 6.138190487986129, |
|
"grad_norm": 11.937878608703613, |
|
"learning_rate": 2.8568413013940642e-05, |
|
"loss": 0.019, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 6.158007183552143, |
|
"grad_norm": 14.586573600769043, |
|
"learning_rate": 2.845039531505199e-05, |
|
"loss": 0.0187, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 6.177823879118157, |
|
"grad_norm": 10.834576606750488, |
|
"learning_rate": 2.833201832904933e-05, |
|
"loss": 0.0205, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 6.197640574684171, |
|
"grad_norm": 10.595796585083008, |
|
"learning_rate": 2.8213287088990184e-05, |
|
"loss": 0.0194, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 6.217457270250185, |
|
"grad_norm": 12.627229690551758, |
|
"learning_rate": 2.8094206642993955e-05, |
|
"loss": 0.0145, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.1077549379303413, |
|
"grad_norm": 10.363633155822754, |
|
"learning_rate": 2.7974782054027308e-05, |
|
"loss": 0.0179, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.1238377644871087, |
|
"grad_norm": 14.35067081451416, |
|
"learning_rate": 2.7855018399688908e-05, |
|
"loss": 0.0184, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.139920591043876, |
|
"grad_norm": 10.560155868530273, |
|
"learning_rate": 2.773492077199351e-05, |
|
"loss": 0.0173, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.1560034176006435, |
|
"grad_norm": 10.620994567871094, |
|
"learning_rate": 2.76144942771555e-05, |
|
"loss": 0.0155, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.1720862441574105, |
|
"grad_norm": 9.053291320800781, |
|
"learning_rate": 2.749374403537177e-05, |
|
"loss": 0.0145, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.188169070714178, |
|
"grad_norm": 12.468178749084473, |
|
"learning_rate": 2.7372675180603994e-05, |
|
"loss": 0.0183, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.2042518972709453, |
|
"grad_norm": 8.465781211853027, |
|
"learning_rate": 2.7251292860360424e-05, |
|
"loss": 0.0164, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.2203347238277127, |
|
"grad_norm": 10.253599166870117, |
|
"learning_rate": 2.712960223547696e-05, |
|
"loss": 0.015, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.23641755038448, |
|
"grad_norm": 9.734599113464355, |
|
"learning_rate": 2.700760847989775e-05, |
|
"loss": 0.0144, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.2525003769412475, |
|
"grad_norm": 12.44884967803955, |
|
"learning_rate": 2.6885316780455208e-05, |
|
"loss": 0.0129, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.268583203498015, |
|
"grad_norm": 10.425430297851562, |
|
"learning_rate": 2.6762732336649492e-05, |
|
"loss": 0.0185, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.2846660300547823, |
|
"grad_norm": 10.850104331970215, |
|
"learning_rate": 2.6639860360427426e-05, |
|
"loss": 0.0143, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.3007488566115493, |
|
"grad_norm": 9.267366409301758, |
|
"learning_rate": 2.651670607596092e-05, |
|
"loss": 0.0146, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.3168316831683167, |
|
"grad_norm": 9.598543167114258, |
|
"learning_rate": 2.6393274719424814e-05, |
|
"loss": 0.0157, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.332914509725084, |
|
"grad_norm": 9.140937805175781, |
|
"learning_rate": 2.6269571538774294e-05, |
|
"loss": 0.0172, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.3489973362818515, |
|
"grad_norm": 10.654680252075195, |
|
"learning_rate": 2.6145601793521734e-05, |
|
"loss": 0.0162, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.365080162838619, |
|
"grad_norm": 10.139638900756836, |
|
"learning_rate": 2.6021370754513096e-05, |
|
"loss": 0.0168, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.3811629893953863, |
|
"grad_norm": 9.781733512878418, |
|
"learning_rate": 2.589688370370382e-05, |
|
"loss": 0.0165, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.3972458159521537, |
|
"grad_norm": 10.93750286102295, |
|
"learning_rate": 2.5772145933934235e-05, |
|
"loss": 0.0145, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.413328642508921, |
|
"grad_norm": 11.465789794921875, |
|
"learning_rate": 2.5647162748704562e-05, |
|
"loss": 0.0135, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.4294114690656885, |
|
"grad_norm": 9.151410102844238, |
|
"learning_rate": 2.5521939461949384e-05, |
|
"loss": 0.0163, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.4454942956224555, |
|
"grad_norm": 8.722734451293945, |
|
"learning_rate": 2.5396481397811715e-05, |
|
"loss": 0.0171, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.461577122179223, |
|
"grad_norm": 9.99445629119873, |
|
"learning_rate": 2.5270793890416677e-05, |
|
"loss": 0.0146, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.4776599487359903, |
|
"grad_norm": 11.865700721740723, |
|
"learning_rate": 2.5144882283644644e-05, |
|
"loss": 0.0172, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.4937427752927577, |
|
"grad_norm": 14.123621940612793, |
|
"learning_rate": 2.50187519309041e-05, |
|
"loss": 0.0146, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 3.509825601849525, |
|
"grad_norm": 10.353002548217773, |
|
"learning_rate": 2.4892408194903963e-05, |
|
"loss": 0.0155, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.5259084284062925, |
|
"grad_norm": 10.808701515197754, |
|
"learning_rate": 2.4765856447425614e-05, |
|
"loss": 0.0133, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 3.54199125496306, |
|
"grad_norm": 8.521575927734375, |
|
"learning_rate": 2.4639102069094522e-05, |
|
"loss": 0.0125, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.558074081519827, |
|
"grad_norm": 7.443869113922119, |
|
"learning_rate": 2.4512150449151433e-05, |
|
"loss": 0.0143, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 3.5741569080765947, |
|
"grad_norm": 10.696161270141602, |
|
"learning_rate": 2.438500698522325e-05, |
|
"loss": 0.0176, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.5902397346333617, |
|
"grad_norm": 19.051715850830078, |
|
"learning_rate": 2.4257677083093553e-05, |
|
"loss": 0.0167, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 3.606322561190129, |
|
"grad_norm": 13.287577629089355, |
|
"learning_rate": 2.413016615647275e-05, |
|
"loss": 0.0173, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.6224053877468965, |
|
"grad_norm": 10.227944374084473, |
|
"learning_rate": 2.4002479626767903e-05, |
|
"loss": 0.0153, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.638488214303664, |
|
"grad_norm": 10.98816204071045, |
|
"learning_rate": 2.3874622922852225e-05, |
|
"loss": 0.0136, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.6545710408604313, |
|
"grad_norm": 9.841387748718262, |
|
"learning_rate": 2.3746601480834258e-05, |
|
"loss": 0.0164, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 3.6706538674171987, |
|
"grad_norm": 9.20659351348877, |
|
"learning_rate": 2.361842074382674e-05, |
|
"loss": 0.0133, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.686736693973966, |
|
"grad_norm": 8.02350902557373, |
|
"learning_rate": 2.3490086161715197e-05, |
|
"loss": 0.0113, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 3.702819520530733, |
|
"grad_norm": 7.091673851013184, |
|
"learning_rate": 2.336160319092621e-05, |
|
"loss": 0.0127, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.718902347087501, |
|
"grad_norm": 9.675036430358887, |
|
"learning_rate": 2.3232977294195437e-05, |
|
"loss": 0.0195, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 3.734985173644268, |
|
"grad_norm": 8.969931602478027, |
|
"learning_rate": 2.3104213940335338e-05, |
|
"loss": 0.0118, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.7510680002010353, |
|
"grad_norm": 11.032634735107422, |
|
"learning_rate": 2.2975318604002667e-05, |
|
"loss": 0.0148, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 3.7671508267578027, |
|
"grad_norm": 8.483484268188477, |
|
"learning_rate": 2.2846296765465708e-05, |
|
"loss": 0.013, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.78323365331457, |
|
"grad_norm": 12.28922176361084, |
|
"learning_rate": 2.271715391037126e-05, |
|
"loss": 0.0123, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.7993164798713375, |
|
"grad_norm": 11.338895797729492, |
|
"learning_rate": 2.2587895529511396e-05, |
|
"loss": 0.0134, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.815399306428105, |
|
"grad_norm": 12.580510139465332, |
|
"learning_rate": 2.245852711859004e-05, |
|
"loss": 0.0132, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 3.8314821329848723, |
|
"grad_norm": 9.075506210327148, |
|
"learning_rate": 2.232905417798929e-05, |
|
"loss": 0.0148, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.8475649595416392, |
|
"grad_norm": 8.931917190551758, |
|
"learning_rate": 2.2199482212535522e-05, |
|
"loss": 0.0128, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 3.8636477860984066, |
|
"grad_norm": 12.407690048217773, |
|
"learning_rate": 2.206981673126539e-05, |
|
"loss": 0.0168, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.879730612655174, |
|
"grad_norm": 10.473590850830078, |
|
"learning_rate": 2.1940063247191582e-05, |
|
"loss": 0.0128, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 4.002587467656654, |
|
"grad_norm": 7.4311203956604, |
|
"learning_rate": 2.181022727706842e-05, |
|
"loss": 0.0122, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 4.020587242659467, |
|
"grad_norm": 7.310738563537598, |
|
"learning_rate": 2.168031434115729e-05, |
|
"loss": 0.0067, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 4.038587017662279, |
|
"grad_norm": 13.81872272491455, |
|
"learning_rate": 2.1550329962991946e-05, |
|
"loss": 0.008, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 4.056586792665092, |
|
"grad_norm": 7.483142852783203, |
|
"learning_rate": 2.142027966914368e-05, |
|
"loss": 0.0072, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.074586567667904, |
|
"grad_norm": 7.784074783325195, |
|
"learning_rate": 2.1290168988986332e-05, |
|
"loss": 0.0073, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 4.0925863426707165, |
|
"grad_norm": 7.706643581390381, |
|
"learning_rate": 2.116000345446118e-05, |
|
"loss": 0.0074, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 4.110586117673529, |
|
"grad_norm": 8.789175987243652, |
|
"learning_rate": 2.1029788599841784e-05, |
|
"loss": 0.0077, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.128585892676342, |
|
"grad_norm": 6.578799724578857, |
|
"learning_rate": 2.0899529961498633e-05, |
|
"loss": 0.0074, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 4.146585667679154, |
|
"grad_norm": 6.066508769989014, |
|
"learning_rate": 2.076923307766379e-05, |
|
"loss": 0.0071, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.164585442681966, |
|
"grad_norm": 8.597545623779297, |
|
"learning_rate": 2.0638903488195406e-05, |
|
"loss": 0.0074, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 4.182585217684779, |
|
"grad_norm": 9.465729713439941, |
|
"learning_rate": 2.050854673434217e-05, |
|
"loss": 0.0077, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 4.200584992687592, |
|
"grad_norm": 9.023458480834961, |
|
"learning_rate": 2.037816835850776e-05, |
|
"loss": 0.0076, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 4.218584767690404, |
|
"grad_norm": 7.603120803833008, |
|
"learning_rate": 2.024777390401512e-05, |
|
"loss": 0.0076, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 4.236584542693216, |
|
"grad_norm": 7.11006498336792, |
|
"learning_rate": 2.0117368914870838e-05, |
|
"loss": 0.0079, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 4.254584317696029, |
|
"grad_norm": 9.57967758178711, |
|
"learning_rate": 1.9986958935529393e-05, |
|
"loss": 0.0082, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 4.272584092698842, |
|
"grad_norm": 10.809629440307617, |
|
"learning_rate": 1.9856549510657447e-05, |
|
"loss": 0.0086, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 4.2905838677016535, |
|
"grad_norm": 9.166589736938477, |
|
"learning_rate": 1.9726146184898066e-05, |
|
"loss": 0.0075, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 4.308583642704466, |
|
"grad_norm": 7.303510665893555, |
|
"learning_rate": 1.959575450263503e-05, |
|
"loss": 0.0076, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 4.326583417707279, |
|
"grad_norm": 6.71479606628418, |
|
"learning_rate": 1.9465380007757043e-05, |
|
"loss": 0.0076, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.3445831927100915, |
|
"grad_norm": 6.198269367218018, |
|
"learning_rate": 1.933502824342205e-05, |
|
"loss": 0.0071, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 4.362582967712903, |
|
"grad_norm": 6.006600856781006, |
|
"learning_rate": 1.9204704751821586e-05, |
|
"loss": 0.0072, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 4.380582742715716, |
|
"grad_norm": 6.971927165985107, |
|
"learning_rate": 1.907441507394507e-05, |
|
"loss": 0.0076, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 4.398582517718529, |
|
"grad_norm": 8.39477825164795, |
|
"learning_rate": 1.894416474934429e-05, |
|
"loss": 0.0075, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 4.416582292721341, |
|
"grad_norm": 7.6670355796813965, |
|
"learning_rate": 1.8813959315897815e-05, |
|
"loss": 0.0083, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 4.434582067724153, |
|
"grad_norm": 7.985522747039795, |
|
"learning_rate": 1.8683804309575587e-05, |
|
"loss": 0.0075, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 4.452581842726966, |
|
"grad_norm": 7.552544593811035, |
|
"learning_rate": 1.855370526420352e-05, |
|
"loss": 0.0073, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 4.470581617729779, |
|
"grad_norm": 7.256811618804932, |
|
"learning_rate": 1.842366771122823e-05, |
|
"loss": 0.0066, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 4.488581392732591, |
|
"grad_norm": 7.66050386428833, |
|
"learning_rate": 1.829369717948185e-05, |
|
"loss": 0.0078, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 4.506581167735403, |
|
"grad_norm": 6.683782577514648, |
|
"learning_rate": 1.8163799194946938e-05, |
|
"loss": 0.0079, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.524580942738216, |
|
"grad_norm": 6.800795078277588, |
|
"learning_rate": 1.8033979280521584e-05, |
|
"loss": 0.0069, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 4.5425807177410285, |
|
"grad_norm": 8.025465965270996, |
|
"learning_rate": 1.790424295578453e-05, |
|
"loss": 0.0069, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.56058049274384, |
|
"grad_norm": 10.645038604736328, |
|
"learning_rate": 1.777459573676051e-05, |
|
"loss": 0.0076, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 4.578580267746653, |
|
"grad_norm": 9.160017013549805, |
|
"learning_rate": 1.764504313568577e-05, |
|
"loss": 0.0068, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.596580042749466, |
|
"grad_norm": 8.349514961242676, |
|
"learning_rate": 1.7515590660773633e-05, |
|
"loss": 0.0076, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.614579817752278, |
|
"grad_norm": 5.219119071960449, |
|
"learning_rate": 1.7386243815980354e-05, |
|
"loss": 0.0073, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.632579592755091, |
|
"grad_norm": 7.130075931549072, |
|
"learning_rate": 1.7257008100771072e-05, |
|
"loss": 0.007, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 4.650579367757903, |
|
"grad_norm": 6.3263115882873535, |
|
"learning_rate": 1.7127889009886036e-05, |
|
"loss": 0.0067, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.6685791427607155, |
|
"grad_norm": 6.6792778968811035, |
|
"learning_rate": 1.699889203310695e-05, |
|
"loss": 0.0075, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 4.686578917763528, |
|
"grad_norm": 6.040131092071533, |
|
"learning_rate": 1.6870022655023544e-05, |
|
"loss": 0.0072, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.70457869276634, |
|
"grad_norm": 6.8368306159973145, |
|
"learning_rate": 1.674128635480044e-05, |
|
"loss": 0.0071, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 4.722578467769153, |
|
"grad_norm": 8.718803405761719, |
|
"learning_rate": 1.6612688605944133e-05, |
|
"loss": 0.0074, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 4.740578242771965, |
|
"grad_norm": 8.861642837524414, |
|
"learning_rate": 1.6484234876070335e-05, |
|
"loss": 0.0063, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 4.758578017774778, |
|
"grad_norm": 6.4469475746154785, |
|
"learning_rate": 1.6355930626671447e-05, |
|
"loss": 0.007, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 4.776577792777591, |
|
"grad_norm": 9.30246639251709, |
|
"learning_rate": 1.6227781312884388e-05, |
|
"loss": 0.0073, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.794577567780403, |
|
"grad_norm": 8.216259002685547, |
|
"learning_rate": 1.6099792383258664e-05, |
|
"loss": 0.0071, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 4.812577342783215, |
|
"grad_norm": 10.296393394470215, |
|
"learning_rate": 1.5971969279524668e-05, |
|
"loss": 0.0075, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 5.01673208014873, |
|
"grad_norm": 6.276814937591553, |
|
"learning_rate": 1.584431743636237e-05, |
|
"loss": 0.0059, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 5.038766095159402, |
|
"grad_norm": 6.1525068283081055, |
|
"learning_rate": 1.5716842281170205e-05, |
|
"loss": 0.0059, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 5.060800110170075, |
|
"grad_norm": 5.246829032897949, |
|
"learning_rate": 1.558954923383432e-05, |
|
"loss": 0.0057, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.082834125180748, |
|
"grad_norm": 8.341444969177246, |
|
"learning_rate": 1.5462443706498178e-05, |
|
"loss": 0.0061, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 5.104868140191421, |
|
"grad_norm": 8.144927024841309, |
|
"learning_rate": 1.533553110333239e-05, |
|
"loss": 0.0058, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 5.126902155202093, |
|
"grad_norm": 4.849141597747803, |
|
"learning_rate": 1.5208816820304973e-05, |
|
"loss": 0.0055, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 5.148936170212766, |
|
"grad_norm": 5.376830577850342, |
|
"learning_rate": 1.5082306244951956e-05, |
|
"loss": 0.0052, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 5.170970185223439, |
|
"grad_norm": 5.531591892242432, |
|
"learning_rate": 1.495600475614825e-05, |
|
"loss": 0.0059, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 5.193004200234111, |
|
"grad_norm": 4.80387020111084, |
|
"learning_rate": 1.4829917723879029e-05, |
|
"loss": 0.0056, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 5.2150382152447845, |
|
"grad_norm": 6.294495582580566, |
|
"learning_rate": 1.4704050509011345e-05, |
|
"loss": 0.0056, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 5.237072230255457, |
|
"grad_norm": 5.95064640045166, |
|
"learning_rate": 1.4578408463066246e-05, |
|
"loss": 0.0058, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 5.259106245266129, |
|
"grad_norm": 6.749286651611328, |
|
"learning_rate": 1.4452996927991236e-05, |
|
"loss": 0.006, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 5.2811402602768025, |
|
"grad_norm": 6.431356906890869, |
|
"learning_rate": 1.4327821235933126e-05, |
|
"loss": 0.0062, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.303174275287475, |
|
"grad_norm": 5.065823554992676, |
|
"learning_rate": 1.4202886709011357e-05, |
|
"loss": 0.005, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 5.325208290298148, |
|
"grad_norm": 4.399689674377441, |
|
"learning_rate": 1.4078198659091686e-05, |
|
"loss": 0.006, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 5.3472423053088205, |
|
"grad_norm": 4.889394283294678, |
|
"learning_rate": 1.3953762387560392e-05, |
|
"loss": 0.0054, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 5.369276320319493, |
|
"grad_norm": 6.588573455810547, |
|
"learning_rate": 1.3829583185098802e-05, |
|
"loss": 0.0056, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 5.391310335330166, |
|
"grad_norm": 4.880826473236084, |
|
"learning_rate": 1.3705666331458424e-05, |
|
"loss": 0.0052, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 5.4133443503408385, |
|
"grad_norm": 5.972387313842773, |
|
"learning_rate": 1.3582017095236413e-05, |
|
"loss": 0.0052, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 5.435378365351512, |
|
"grad_norm": 5.3322224617004395, |
|
"learning_rate": 1.345864073365157e-05, |
|
"loss": 0.0054, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 5.457412380362184, |
|
"grad_norm": 4.680153846740723, |
|
"learning_rate": 1.3335542492320856e-05, |
|
"loss": 0.0059, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 5.4794463953728565, |
|
"grad_norm": 5.0644636154174805, |
|
"learning_rate": 1.3212727605036319e-05, |
|
"loss": 0.0055, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 5.50148041038353, |
|
"grad_norm": 6.729560375213623, |
|
"learning_rate": 1.3090201293542597e-05, |
|
"loss": 0.0061, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 5.523514425394202, |
|
"grad_norm": 6.099545001983643, |
|
"learning_rate": 1.2967968767314898e-05, |
|
"loss": 0.0063, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 5.545548440404875, |
|
"grad_norm": 4.657865524291992, |
|
"learning_rate": 1.284603522333749e-05, |
|
"loss": 0.0052, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 5.567582455415548, |
|
"grad_norm": 5.916351795196533, |
|
"learning_rate": 1.2724405845882775e-05, |
|
"loss": 0.0056, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 5.58961647042622, |
|
"grad_norm": 6.424000263214111, |
|
"learning_rate": 1.2603085806290824e-05, |
|
"loss": 0.0065, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 5.611650485436893, |
|
"grad_norm": 7.819843769073486, |
|
"learning_rate": 1.2482080262749538e-05, |
|
"loss": 0.0057, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 5.633684500447566, |
|
"grad_norm": 6.704712867736816, |
|
"learning_rate": 1.2361394360075348e-05, |
|
"loss": 0.0052, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 5.655718515458239, |
|
"grad_norm": 5.237440586090088, |
|
"learning_rate": 1.224103322949442e-05, |
|
"loss": 0.0052, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 5.677752530468911, |
|
"grad_norm": 6.460971355438232, |
|
"learning_rate": 1.2121001988424541e-05, |
|
"loss": 0.0057, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 5.699786545479584, |
|
"grad_norm": 5.491466522216797, |
|
"learning_rate": 1.2001305740257505e-05, |
|
"loss": 0.0051, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 5.721820560490257, |
|
"grad_norm": 5.925656318664551, |
|
"learning_rate": 1.188194957414217e-05, |
|
"loss": 0.0054, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 5.743854575500929, |
|
"grad_norm": 6.257553577423096, |
|
"learning_rate": 1.176293856476804e-05, |
|
"loss": 0.0053, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 5.765888590511603, |
|
"grad_norm": 4.400306224822998, |
|
"learning_rate": 1.1644277772149531e-05, |
|
"loss": 0.0051, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 5.787922605522275, |
|
"grad_norm": 6.251142978668213, |
|
"learning_rate": 1.1525972241410827e-05, |
|
"loss": 0.0052, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 5.809956620532947, |
|
"grad_norm": 4.689172267913818, |
|
"learning_rate": 1.1408027002571359e-05, |
|
"loss": 0.0057, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 5.831990635543621, |
|
"grad_norm": 6.150318145751953, |
|
"learning_rate": 1.1290447070331958e-05, |
|
"loss": 0.0053, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 5.854024650554293, |
|
"grad_norm": 5.218411445617676, |
|
"learning_rate": 1.1173237443861678e-05, |
|
"loss": 0.0057, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 5.876058665564966, |
|
"grad_norm": 4.724580764770508, |
|
"learning_rate": 1.1056403106585156e-05, |
|
"loss": 0.005, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 5.898092680575639, |
|
"grad_norm": 3.99684476852417, |
|
"learning_rate": 1.093994902597082e-05, |
|
"loss": 0.0053, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 5.920126695586311, |
|
"grad_norm": 5.558387756347656, |
|
"learning_rate": 1.0823880153319642e-05, |
|
"loss": 0.0051, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 5.942160710596984, |
|
"grad_norm": 5.6572957038879395, |
|
"learning_rate": 1.0708201423554634e-05, |
|
"loss": 0.0055, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 5.964194725607657, |
|
"grad_norm": 3.7635843753814697, |
|
"learning_rate": 1.059291775501102e-05, |
|
"loss": 0.0056, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 5.98622874061833, |
|
"grad_norm": 5.005063056945801, |
|
"learning_rate": 1.0478034049227137e-05, |
|
"loss": 0.0054, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 6.008149523300085, |
|
"grad_norm": 12.51547908782959, |
|
"learning_rate": 1.036355519073602e-05, |
|
"loss": 0.0065, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 6.028287341493345, |
|
"grad_norm": 3.451265335083008, |
|
"learning_rate": 1.0249486046857735e-05, |
|
"loss": 0.0051, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 6.048425159686605, |
|
"grad_norm": 7.321857929229736, |
|
"learning_rate": 1.0135831467492432e-05, |
|
"loss": 0.0059, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 6.068562977879865, |
|
"grad_norm": 4.42759895324707, |
|
"learning_rate": 1.0022596284914138e-05, |
|
"loss": 0.006, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 6.088700796073126, |
|
"grad_norm": 3.1307969093322754, |
|
"learning_rate": 9.90978531356531e-06, |
|
"loss": 0.005, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 6.108838614266386, |
|
"grad_norm": 5.168570518493652, |
|
"learning_rate": 9.797403349852126e-06, |
|
"loss": 0.0044, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 6.128976432459646, |
|
"grad_norm": 6.52720832824707, |
|
"learning_rate": 9.685455171940567e-06, |
|
"loss": 0.005, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 6.1491142506529055, |
|
"grad_norm": 4.172718048095703, |
|
"learning_rate": 9.573945539553258e-06, |
|
"loss": 0.0044, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.169252068846166, |
|
"grad_norm": 8.326397895812988, |
|
"learning_rate": 9.462879193767092e-06, |
|
"loss": 0.0053, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 6.189389887039426, |
|
"grad_norm": 4.124663352966309, |
|
"learning_rate": 9.352260856811667e-06, |
|
"loss": 0.0058, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 6.209527705232686, |
|
"grad_norm": 4.169667720794678, |
|
"learning_rate": 9.2420952318685e-06, |
|
"loss": 0.0049, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 6.229665523425946, |
|
"grad_norm": 5.089596271514893, |
|
"learning_rate": 9.132387002871057e-06, |
|
"loss": 0.0044, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 6.249803341619207, |
|
"grad_norm": 6.561634540557861, |
|
"learning_rate": 9.023140834305621e-06, |
|
"loss": 0.0051, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 6.269941159812467, |
|
"grad_norm": 3.9571590423583984, |
|
"learning_rate": 8.914361371012939e-06, |
|
"loss": 0.0045, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 6.2900789780057265, |
|
"grad_norm": 5.203815460205078, |
|
"learning_rate": 8.806053237990788e-06, |
|
"loss": 0.0065, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 6.310216796198986, |
|
"grad_norm": 4.430067539215088, |
|
"learning_rate": 8.698221040197288e-06, |
|
"loss": 0.0047, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 6.330354614392247, |
|
"grad_norm": 6.157893180847168, |
|
"learning_rate": 8.590869362355128e-06, |
|
"loss": 0.0063, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 6.350492432585507, |
|
"grad_norm": 4.458155155181885, |
|
"learning_rate": 8.484002768756643e-06, |
|
"loss": 0.0048, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 6.370630250778767, |
|
"grad_norm": 3.2868919372558594, |
|
"learning_rate": 8.37762580306972e-06, |
|
"loss": 0.0042, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 6.390768068972028, |
|
"grad_norm": 4.93739652633667, |
|
"learning_rate": 8.271742988144688e-06, |
|
"loss": 0.0051, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 6.4109058871652875, |
|
"grad_norm": 3.718449115753174, |
|
"learning_rate": 8.166358825821923e-06, |
|
"loss": 0.0048, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 6.431043705358547, |
|
"grad_norm": 3.594763994216919, |
|
"learning_rate": 8.061477796740511e-06, |
|
"loss": 0.0054, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 6.451181523551807, |
|
"grad_norm": 6.66500997543335, |
|
"learning_rate": 7.957104360147746e-06, |
|
"loss": 0.0046, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 6.471319341745068, |
|
"grad_norm": 3.4088094234466553, |
|
"learning_rate": 7.853242953709467e-06, |
|
"loss": 0.006, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 6.491457159938328, |
|
"grad_norm": 3.0382471084594727, |
|
"learning_rate": 7.74989799332146e-06, |
|
"loss": 0.0051, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 6.511594978131588, |
|
"grad_norm": 3.609813928604126, |
|
"learning_rate": 7.64707387292166e-06, |
|
"loss": 0.005, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 6.531732796324848, |
|
"grad_norm": 4.544133186340332, |
|
"learning_rate": 7.544774964303341e-06, |
|
"loss": 0.005, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 6.5518706145181085, |
|
"grad_norm": 3.8849527835845947, |
|
"learning_rate": 7.443005616929277e-06, |
|
"loss": 0.0045, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 6.572008432711368, |
|
"grad_norm": 4.574479579925537, |
|
"learning_rate": 7.341770157746737e-06, |
|
"loss": 0.0047, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 6.592146250904628, |
|
"grad_norm": 3.9820139408111572, |
|
"learning_rate": 7.241072891003589e-06, |
|
"loss": 0.005, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 6.612284069097889, |
|
"grad_norm": 3.3841769695281982, |
|
"learning_rate": 7.1409180980652596e-06, |
|
"loss": 0.0039, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 6.632421887291149, |
|
"grad_norm": 3.9114112854003906, |
|
"learning_rate": 7.041310037232712e-06, |
|
"loss": 0.0047, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 6.652559705484409, |
|
"grad_norm": 10.5076904296875, |
|
"learning_rate": 6.942252943561396e-06, |
|
"loss": 0.0051, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 6.672697523677669, |
|
"grad_norm": 3.7937240600585938, |
|
"learning_rate": 6.843751028681178e-06, |
|
"loss": 0.0041, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 6.692835341870929, |
|
"grad_norm": 5.625157356262207, |
|
"learning_rate": 6.74580848061728e-06, |
|
"loss": 0.0044, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 6.712973160064189, |
|
"grad_norm": 3.2087152004241943, |
|
"learning_rate": 6.648429463612218e-06, |
|
"loss": 0.0066, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 6.733110978257449, |
|
"grad_norm": 3.721176862716675, |
|
"learning_rate": 6.551618117948746e-06, |
|
"loss": 0.0044, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 6.753248796450709, |
|
"grad_norm": 3.429137945175171, |
|
"learning_rate": 6.4553785597738195e-06, |
|
"loss": 0.0048, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 6.77338661464397, |
|
"grad_norm": 3.475482225418091, |
|
"learning_rate": 6.359714880923602e-06, |
|
"loss": 0.006, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 6.79352443283723, |
|
"grad_norm": 4.675537586212158, |
|
"learning_rate": 6.2646311487494785e-06, |
|
"loss": 0.0044, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 6.81366225103049, |
|
"grad_norm": 6.543276786804199, |
|
"learning_rate": 6.170131405945125e-06, |
|
"loss": 0.0049, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 6.83380006922375, |
|
"grad_norm": 3.8348066806793213, |
|
"learning_rate": 6.0762196703746324e-06, |
|
"loss": 0.0049, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 6.85393788741701, |
|
"grad_norm": 3.8558757305145264, |
|
"learning_rate": 5.982899934901667e-06, |
|
"loss": 0.0042, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 6.87407570561027, |
|
"grad_norm": 3.2190654277801514, |
|
"learning_rate": 5.8901761672197165e-06, |
|
"loss": 0.0039, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 6.89421352380353, |
|
"grad_norm": 3.8839619159698486, |
|
"learning_rate": 5.798052309683384e-06, |
|
"loss": 0.005, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 6.914351341996791, |
|
"grad_norm": 3.031508684158325, |
|
"learning_rate": 5.706532279140782e-06, |
|
"loss": 0.0048, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 6.934489160190051, |
|
"grad_norm": 4.985992431640625, |
|
"learning_rate": 5.61561996676699e-06, |
|
"loss": 0.0059, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 6.954626978383311, |
|
"grad_norm": 4.688712120056152, |
|
"learning_rate": 5.5253192378985966e-06, |
|
"loss": 0.0043, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 6.9747647965765704, |
|
"grad_norm": 5.569628715515137, |
|
"learning_rate": 5.43563393186941e-06, |
|
"loss": 0.0043, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 6.994902614769831, |
|
"grad_norm": 2.9404146671295166, |
|
"learning_rate": 5.346567861847168e-06, |
|
"loss": 0.0045, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 7.01372683596431, |
|
"grad_norm": 2.525343656539917, |
|
"learning_rate": 5.258124814671403e-06, |
|
"loss": 0.007, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 7.0320292839167235, |
|
"grad_norm": 6.035243988037109, |
|
"learning_rate": 5.1703085506925225e-06, |
|
"loss": 0.0087, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 7.050331731869138, |
|
"grad_norm": 3.9128825664520264, |
|
"learning_rate": 5.083122803611802e-06, |
|
"loss": 0.0065, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 7.068634179821551, |
|
"grad_norm": 5.710306644439697, |
|
"learning_rate": 4.996571280322762e-06, |
|
"loss": 0.0116, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 7.086936627773965, |
|
"grad_norm": 9.083086967468262, |
|
"learning_rate": 4.910657660753482e-06, |
|
"loss": 0.0094, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 7.105239075726378, |
|
"grad_norm": 4.0207672119140625, |
|
"learning_rate": 4.825385597710148e-06, |
|
"loss": 0.0085, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 7.123541523678792, |
|
"grad_norm": 3.8497507572174072, |
|
"learning_rate": 4.740758716721803e-06, |
|
"loss": 0.0083, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 7.141843971631205, |
|
"grad_norm": 5.6043009757995605, |
|
"learning_rate": 4.6567806158861164e-06, |
|
"loss": 0.0054, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 7.1601464195836195, |
|
"grad_norm": 3.8586933612823486, |
|
"learning_rate": 4.573454865716465e-06, |
|
"loss": 0.0068, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 7.178448867536033, |
|
"grad_norm": 4.219987392425537, |
|
"learning_rate": 4.490785008990113e-06, |
|
"loss": 0.0084, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 7.196751315488447, |
|
"grad_norm": 4.6731109619140625, |
|
"learning_rate": 4.408774560597544e-06, |
|
"loss": 0.0068, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 7.21505376344086, |
|
"grad_norm": 2.894176483154297, |
|
"learning_rate": 4.32742700739309e-06, |
|
"loss": 0.007, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 7.233356211393274, |
|
"grad_norm": 3.3003957271575928, |
|
"learning_rate": 4.246745808046599e-06, |
|
"loss": 0.0078, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 7.251658659345687, |
|
"grad_norm": 3.965242862701416, |
|
"learning_rate": 4.166734392896438e-06, |
|
"loss": 0.0054, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 7.269961107298101, |
|
"grad_norm": 3.6206185817718506, |
|
"learning_rate": 4.087396163803645e-06, |
|
"loss": 0.0066, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 7.288263555250515, |
|
"grad_norm": 3.3707611560821533, |
|
"learning_rate": 4.008734494007241e-06, |
|
"loss": 0.0084, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 7.306566003202929, |
|
"grad_norm": 3.408390522003174, |
|
"learning_rate": 3.9307527279808665e-06, |
|
"loss": 0.0045, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 7.324868451155342, |
|
"grad_norm": 3.1554362773895264, |
|
"learning_rate": 3.85345418129055e-06, |
|
"loss": 0.0084, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 7.343170899107756, |
|
"grad_norm": 3.7730562686920166, |
|
"learning_rate": 3.776842140453756e-06, |
|
"loss": 0.0056, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 7.361473347060169, |
|
"grad_norm": 2.509883165359497, |
|
"learning_rate": 3.700919862799639e-06, |
|
"loss": 0.0077, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 7.379775795012583, |
|
"grad_norm": 4.287370681762695, |
|
"learning_rate": 3.6256905763305605e-06, |
|
"loss": 0.0067, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 7.3980782429649965, |
|
"grad_norm": 6.043769359588623, |
|
"learning_rate": 3.5511574795848415e-06, |
|
"loss": 0.0051, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 7.416380690917411, |
|
"grad_norm": 11.113882064819336, |
|
"learning_rate": 3.4773237415007644e-06, |
|
"loss": 0.0077, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 7.434683138869824, |
|
"grad_norm": 4.403820037841797, |
|
"learning_rate": 3.4041925012818423e-06, |
|
"loss": 0.0061, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 7.452985586822238, |
|
"grad_norm": 3.961599826812744, |
|
"learning_rate": 3.3317668682633532e-06, |
|
"loss": 0.0081, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 7.471288034774651, |
|
"grad_norm": 2.8031773567199707, |
|
"learning_rate": 3.2600499217801307e-06, |
|
"loss": 0.0083, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 7.489590482727065, |
|
"grad_norm": 2.444967269897461, |
|
"learning_rate": 3.189044711035645e-06, |
|
"loss": 0.0082, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 7.507892930679478, |
|
"grad_norm": 2.957968235015869, |
|
"learning_rate": 3.1187542549723625e-06, |
|
"loss": 0.0083, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 7.526195378631892, |
|
"grad_norm": 4.196249961853027, |
|
"learning_rate": 3.0491815421433825e-06, |
|
"loss": 0.0053, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 7.544497826584306, |
|
"grad_norm": 4.068223476409912, |
|
"learning_rate": 2.980329530585362e-06, |
|
"loss": 0.0048, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 7.56280027453672, |
|
"grad_norm": 10.506719589233398, |
|
"learning_rate": 2.912201147692786e-06, |
|
"loss": 0.0053, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 7.581102722489133, |
|
"grad_norm": 3.4478495121002197, |
|
"learning_rate": 2.8447992900934583e-06, |
|
"loss": 0.0064, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 7.599405170441546, |
|
"grad_norm": 3.022067070007324, |
|
"learning_rate": 2.778126823525373e-06, |
|
"loss": 0.0045, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 7.61770761839396, |
|
"grad_norm": 2.931964874267578, |
|
"learning_rate": 2.712186582714862e-06, |
|
"loss": 0.0074, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 7.6360100663463735, |
|
"grad_norm": 5.001142978668213, |
|
"learning_rate": 2.6469813712560544e-06, |
|
"loss": 0.005, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 7.654312514298788, |
|
"grad_norm": 4.039334297180176, |
|
"learning_rate": 2.5825139614917238e-06, |
|
"loss": 0.0054, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 7.672614962251201, |
|
"grad_norm": 2.894651412963867, |
|
"learning_rate": 2.518787094395363e-06, |
|
"loss": 0.0051, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 7.690917410203615, |
|
"grad_norm": 3.445218801498413, |
|
"learning_rate": 2.455803479454664e-06, |
|
"loss": 0.0077, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 7.709219858156028, |
|
"grad_norm": 5.986388683319092, |
|
"learning_rate": 2.3935657945563427e-06, |
|
"loss": 0.0051, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 7.727522306108442, |
|
"grad_norm": 4.5863237380981445, |
|
"learning_rate": 2.332076685872231e-06, |
|
"loss": 0.0062, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 7.745824754060855, |
|
"grad_norm": 3.8240745067596436, |
|
"learning_rate": 2.2713387677468267e-06, |
|
"loss": 0.0066, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 7.7641272020132694, |
|
"grad_norm": 3.147395372390747, |
|
"learning_rate": 2.2113546225861037e-06, |
|
"loss": 0.0067, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 7.782429649965683, |
|
"grad_norm": 4.106767177581787, |
|
"learning_rate": 2.1521268007477047e-06, |
|
"loss": 0.008, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 7.800732097918097, |
|
"grad_norm": 3.5174560546875, |
|
"learning_rate": 2.0936578204325575e-06, |
|
"loss": 0.008, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 7.81903454587051, |
|
"grad_norm": 3.5646681785583496, |
|
"learning_rate": 2.035950167577747e-06, |
|
"loss": 0.0062, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 7.837336993822924, |
|
"grad_norm": 3.414524555206299, |
|
"learning_rate": 1.9790062957508626e-06, |
|
"loss": 0.0074, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 7.855639441775337, |
|
"grad_norm": 4.365599632263184, |
|
"learning_rate": 1.9228286260456673e-06, |
|
"loss": 0.0102, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 7.873941889727751, |
|
"grad_norm": 6.737311840057373, |
|
"learning_rate": 1.8674195469791524e-06, |
|
"loss": 0.006, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 7.8922443376801645, |
|
"grad_norm": 8.64922046661377, |
|
"learning_rate": 1.8127814143900012e-06, |
|
"loss": 0.0061, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 7.910546785632579, |
|
"grad_norm": 3.7279410362243652, |
|
"learning_rate": 1.7589165513383988e-06, |
|
"loss": 0.0062, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 7.928849233584992, |
|
"grad_norm": 5.265659332275391, |
|
"learning_rate": 1.7058272480072879e-06, |
|
"loss": 0.0063, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 7.947151681537406, |
|
"grad_norm": 3.1222264766693115, |
|
"learning_rate": 1.6535157616049867e-06, |
|
"loss": 0.0058, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 7.965454129489819, |
|
"grad_norm": 3.9116389751434326, |
|
"learning_rate": 1.601984316269214e-06, |
|
"loss": 0.0066, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 7.983756577442233, |
|
"grad_norm": 8.46506118774414, |
|
"learning_rate": 1.5512351029725325e-06, |
|
"loss": 0.0052, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 7.036990501080565, |
|
"grad_norm": 3.518129825592041, |
|
"learning_rate": 1.5012702794291901e-06, |
|
"loss": 0.0049, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 7.053073327637332, |
|
"grad_norm": 5.544999122619629, |
|
"learning_rate": 1.4520919700033864e-06, |
|
"loss": 0.0054, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 7.0691561541941, |
|
"grad_norm": 2.621429681777954, |
|
"learning_rate": 1.4037022656189425e-06, |
|
"loss": 0.0071, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 7.085238980750867, |
|
"grad_norm": 6.043819427490234, |
|
"learning_rate": 1.356103223670402e-06, |
|
"loss": 0.0073, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 7.1013218073076345, |
|
"grad_norm": 3.584305763244629, |
|
"learning_rate": 1.3092968679355634e-06, |
|
"loss": 0.0045, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 7.1174046338644015, |
|
"grad_norm": 6.561378479003906, |
|
"learning_rate": 1.2632851884894293e-06, |
|
"loss": 0.0091, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 7.133487460421169, |
|
"grad_norm": 15.719043731689453, |
|
"learning_rate": 1.2180701416195894e-06, |
|
"loss": 0.0155, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 7.149570286977936, |
|
"grad_norm": 7.752620697021484, |
|
"learning_rate": 1.1736536497430584e-06, |
|
"loss": 0.0098, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 7.165653113534704, |
|
"grad_norm": 7.116891384124756, |
|
"learning_rate": 1.1300376013245272e-06, |
|
"loss": 0.0107, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 7.181735940091471, |
|
"grad_norm": 4.907498836517334, |
|
"learning_rate": 1.0872238507960753e-06, |
|
"loss": 0.0087, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 7.197818766648238, |
|
"grad_norm": 3.2054624557495117, |
|
"learning_rate": 1.0452142184783232e-06, |
|
"loss": 0.0091, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 7.213901593205006, |
|
"grad_norm": 4.177403926849365, |
|
"learning_rate": 1.0040104905030467e-06, |
|
"loss": 0.0064, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 7.229984419761773, |
|
"grad_norm": 5.655606269836426, |
|
"learning_rate": 9.63614418737222e-07, |
|
"loss": 0.0107, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 7.246067246318541, |
|
"grad_norm": 2.246121883392334, |
|
"learning_rate": 9.240277207085557e-07, |
|
"loss": 0.008, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 7.262150072875308, |
|
"grad_norm": 5.548453330993652, |
|
"learning_rate": 8.852520795324349e-07, |
|
"loss": 0.0074, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 7.2782328994320755, |
|
"grad_norm": 12.830928802490234, |
|
"learning_rate": 8.472891438404108e-07, |
|
"loss": 0.0123, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 7.2943157259888425, |
|
"grad_norm": 3.613041400909424, |
|
"learning_rate": 8.101405277100549e-07, |
|
"loss": 0.0118, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 7.31039855254561, |
|
"grad_norm": 3.5919158458709717, |
|
"learning_rate": 7.738078105963565e-07, |
|
"loss": 0.0058, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 7.326481379102377, |
|
"grad_norm": 6.3182196617126465, |
|
"learning_rate": 7.3829253726458e-07, |
|
"loss": 0.0106, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 7.342564205659144, |
|
"grad_norm": 4.684800624847412, |
|
"learning_rate": 7.035962177245536e-07, |
|
"loss": 0.0065, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 7.358647032215912, |
|
"grad_norm": 2.9966583251953125, |
|
"learning_rate": 6.697203271665054e-07, |
|
"loss": 0.0081, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 7.374729858772679, |
|
"grad_norm": 3.141700506210327, |
|
"learning_rate": 6.366663058983102e-07, |
|
"loss": 0.009, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 7.390812685329447, |
|
"grad_norm": 4.013637542724609, |
|
"learning_rate": 6.044355592842644e-07, |
|
"loss": 0.0087, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 7.406895511886214, |
|
"grad_norm": 3.2047641277313232, |
|
"learning_rate": 5.730294576853501e-07, |
|
"loss": 0.007, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 7.422978338442982, |
|
"grad_norm": 6.322583198547363, |
|
"learning_rate": 5.424493364009364e-07, |
|
"loss": 0.0066, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 7.439061164999749, |
|
"grad_norm": 7.329522132873535, |
|
"learning_rate": 5.126964956120351e-07, |
|
"loss": 0.0095, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 7.4551439915565165, |
|
"grad_norm": 3.080005168914795, |
|
"learning_rate": 4.837722003260136e-07, |
|
"loss": 0.0091, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 7.4712268181132835, |
|
"grad_norm": 4.727446556091309, |
|
"learning_rate": 4.5567768032280136e-07, |
|
"loss": 0.0077, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 7.4873096446700504, |
|
"grad_norm": 2.2113332748413086, |
|
"learning_rate": 4.2841413010261456e-07, |
|
"loss": 0.0066, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 7.503392471226818, |
|
"grad_norm": 11.121126174926758, |
|
"learning_rate": 4.01982708835158e-07, |
|
"loss": 0.0066, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 7.519475297783585, |
|
"grad_norm": 4.20743465423584, |
|
"learning_rate": 3.7638454031035276e-07, |
|
"loss": 0.0111, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 7.535558124340353, |
|
"grad_norm": 6.002978324890137, |
|
"learning_rate": 3.5162071289055245e-07, |
|
"loss": 0.0066, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 7.55164095089712, |
|
"grad_norm": 3.0463833808898926, |
|
"learning_rate": 3.276922794642534e-07, |
|
"loss": 0.0072, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 7.567723777453887, |
|
"grad_norm": 6.407285213470459, |
|
"learning_rate": 3.046002574013551e-07, |
|
"loss": 0.0072, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 7.583806604010655, |
|
"grad_norm": 8.836779594421387, |
|
"learning_rate": 2.8234562850988356e-07, |
|
"loss": 0.0079, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 7.599889430567423, |
|
"grad_norm": 5.871425628662109, |
|
"learning_rate": 2.609293389942602e-07, |
|
"loss": 0.0077, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 7.61597225712419, |
|
"grad_norm": 6.142898082733154, |
|
"learning_rate": 2.403522994150609e-07, |
|
"loss": 0.0071, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 7.632055083680957, |
|
"grad_norm": 3.5649702548980713, |
|
"learning_rate": 2.2061538465031117e-07, |
|
"loss": 0.0071, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 7.6481379102377245, |
|
"grad_norm": 7.242725372314453, |
|
"learning_rate": 2.017194338582873e-07, |
|
"loss": 0.0105, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 7.6642207367944915, |
|
"grad_norm": 12.46696662902832, |
|
"learning_rate": 1.8366525044183126e-07, |
|
"loss": 0.0095, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 7.680303563351259, |
|
"grad_norm": 5.7213640213012695, |
|
"learning_rate": 1.6645360201420046e-07, |
|
"loss": 0.0095, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 7.696386389908026, |
|
"grad_norm": 3.3008534908294678, |
|
"learning_rate": 1.5008522036642048e-07, |
|
"loss": 0.0078, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 7.712469216464793, |
|
"grad_norm": 4.01139497756958, |
|
"learning_rate": 1.3456080143618767e-07, |
|
"loss": 0.0097, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 7.728552043021561, |
|
"grad_norm": 4.135232448577881, |
|
"learning_rate": 1.198810052782595e-07, |
|
"loss": 0.0091, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 7.744634869578328, |
|
"grad_norm": 6.216122150421143, |
|
"learning_rate": 1.060464560364105e-07, |
|
"loss": 0.0071, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 7.760717696135096, |
|
"grad_norm": 3.039104461669922, |
|
"learning_rate": 9.305774191687988e-08, |
|
"loss": 0.009, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 7.776800522691863, |
|
"grad_norm": 2.9482333660125732, |
|
"learning_rate": 8.091541516337398e-08, |
|
"loss": 0.0104, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 7.792883349248631, |
|
"grad_norm": 8.094056129455566, |
|
"learning_rate": 6.961999203357605e-08, |
|
"loss": 0.008, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 7.808966175805398, |
|
"grad_norm": 2.1247482299804688, |
|
"learning_rate": 5.917195277721055e-08, |
|
"loss": 0.0046, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 7.8250490023621655, |
|
"grad_norm": 7.984273433685303, |
|
"learning_rate": 4.957174161560607e-08, |
|
"loss": 0.0109, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 7.8411318289189325, |
|
"grad_norm": 5.030553817749023, |
|
"learning_rate": 4.0819766722826057e-08, |
|
"loss": 0.0062, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 7.857214655475699, |
|
"grad_norm": 2.809941291809082, |
|
"learning_rate": 3.291640020829823e-08, |
|
"loss": 0.0081, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 7.873297482032467, |
|
"grad_norm": 6.414947032928467, |
|
"learning_rate": 2.5861978101009433e-08, |
|
"loss": 0.0075, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 7.889380308589234, |
|
"grad_norm": 5.527952671051025, |
|
"learning_rate": 1.9656800335206004e-08, |
|
"loss": 0.0058, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 7.905463135146002, |
|
"grad_norm": 6.868896007537842, |
|
"learning_rate": 1.4301130737646163e-08, |
|
"loss": 0.0067, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 7.921545961702769, |
|
"grad_norm": 11.053786277770996, |
|
"learning_rate": 9.795197016384538e-09, |
|
"loss": 0.0102, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 7.937628788259537, |
|
"grad_norm": 5.266638278961182, |
|
"learning_rate": 6.1391907510888195e-09, |
|
"loss": 0.0095, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 7.953711614816304, |
|
"grad_norm": 3.4717257022857666, |
|
"learning_rate": 3.3332673848951448e-09, |
|
"loss": 0.0065, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 7.969794441373072, |
|
"grad_norm": 4.068334579467773, |
|
"learning_rate": 1.3775462177956222e-09, |
|
"loss": 0.0083, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 7.985877267929839, |
|
"grad_norm": 3.321983814239502, |
|
"learning_rate": 2.721104015712683e-10, |
|
"loss": 0.0062, |
|
"step": 4960 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4968, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 8, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.935941540711301e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|