{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.05337247314697445,
  "eval_steps": 7,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005337247314697444,
      "grad_norm": 0.3629896640777588,
      "learning_rate": 2e-05,
      "loss": 1.3023,
      "step": 1
    },
    {
      "epoch": 0.0005337247314697444,
      "eval_loss": 1.6696527004241943,
      "eval_runtime": 138.2798,
      "eval_samples_per_second": 11.412,
      "eval_steps_per_second": 5.706,
      "step": 1
    },
    {
      "epoch": 0.0010674494629394889,
      "grad_norm": 0.37523019313812256,
      "learning_rate": 4e-05,
      "loss": 1.3966,
      "step": 2
    },
    {
      "epoch": 0.0016011741944092334,
      "grad_norm": 0.39908841252326965,
      "learning_rate": 6e-05,
      "loss": 1.2988,
      "step": 3
    },
    {
      "epoch": 0.0021348989258789777,
      "grad_norm": 0.44918331503868103,
      "learning_rate": 8e-05,
      "loss": 1.8351,
      "step": 4
    },
    {
      "epoch": 0.0026686236573487225,
      "grad_norm": 0.4540751874446869,
      "learning_rate": 0.0001,
      "loss": 1.6159,
      "step": 5
    },
    {
      "epoch": 0.003202348388818467,
      "grad_norm": 0.48438403010368347,
      "learning_rate": 0.00012,
      "loss": 1.7997,
      "step": 6
    },
    {
      "epoch": 0.003736073120288211,
      "grad_norm": 0.47967973351478577,
      "learning_rate": 0.00014,
      "loss": 1.848,
      "step": 7
    },
    {
      "epoch": 0.003736073120288211,
      "eval_loss": 1.5880990028381348,
      "eval_runtime": 136.8431,
      "eval_samples_per_second": 11.531,
      "eval_steps_per_second": 5.766,
      "step": 7
    },
    {
      "epoch": 0.0042697978517579555,
      "grad_norm": 0.443247526884079,
      "learning_rate": 0.00016,
      "loss": 1.3893,
      "step": 8
    },
    {
      "epoch": 0.004803522583227701,
      "grad_norm": 0.4176405370235443,
      "learning_rate": 0.00018,
      "loss": 1.6137,
      "step": 9
    },
    {
      "epoch": 0.005337247314697445,
      "grad_norm": 0.42031896114349365,
      "learning_rate": 0.0002,
      "loss": 1.5092,
      "step": 10
    },
    {
      "epoch": 0.005870972046167189,
      "grad_norm": 0.3552657961845398,
      "learning_rate": 0.0001999390827019096,
      "loss": 1.4142,
      "step": 11
    },
    {
      "epoch": 0.006404696777636934,
      "grad_norm": 0.5527915358543396,
      "learning_rate": 0.00019975640502598244,
      "loss": 1.6114,
      "step": 12
    },
    {
      "epoch": 0.006938421509106678,
      "grad_norm": 0.4900396168231964,
      "learning_rate": 0.00019945218953682734,
      "loss": 1.3065,
      "step": 13
    },
    {
      "epoch": 0.007472146240576422,
      "grad_norm": 0.41337817907333374,
      "learning_rate": 0.00019902680687415705,
      "loss": 1.3956,
      "step": 14
    },
    {
      "epoch": 0.007472146240576422,
      "eval_loss": 1.3877164125442505,
      "eval_runtime": 132.633,
      "eval_samples_per_second": 11.897,
      "eval_steps_per_second": 5.949,
      "step": 14
    },
    {
      "epoch": 0.008005870972046168,
      "grad_norm": 0.44572773575782776,
      "learning_rate": 0.00019848077530122083,
      "loss": 1.5852,
      "step": 15
    },
    {
      "epoch": 0.008539595703515911,
      "grad_norm": 0.413870632648468,
      "learning_rate": 0.00019781476007338058,
      "loss": 1.2316,
      "step": 16
    },
    {
      "epoch": 0.009073320434985656,
      "grad_norm": 0.41110166907310486,
      "learning_rate": 0.00019702957262759965,
      "loss": 1.4972,
      "step": 17
    },
    {
      "epoch": 0.009607045166455401,
      "grad_norm": 0.3463555872440338,
      "learning_rate": 0.0001961261695938319,
      "loss": 1.1697,
      "step": 18
    },
    {
      "epoch": 0.010140769897925145,
      "grad_norm": 0.3517301678657532,
      "learning_rate": 0.00019510565162951537,
      "loss": 1.2792,
      "step": 19
    },
    {
      "epoch": 0.01067449462939489,
      "grad_norm": 0.3800961375236511,
      "learning_rate": 0.00019396926207859084,
      "loss": 1.3721,
      "step": 20
    },
    {
      "epoch": 0.011208219360864633,
      "grad_norm": 0.46215319633483887,
      "learning_rate": 0.00019271838545667876,
      "loss": 1.2391,
      "step": 21
    },
    {
      "epoch": 0.011208219360864633,
      "eval_loss": 1.3272384405136108,
      "eval_runtime": 142.2392,
      "eval_samples_per_second": 11.094,
      "eval_steps_per_second": 5.547,
      "step": 21
    },
    {
      "epoch": 0.011741944092334379,
      "grad_norm": 0.4401664733886719,
      "learning_rate": 0.0001913545457642601,
      "loss": 1.279,
      "step": 22
    },
    {
      "epoch": 0.012275668823804124,
      "grad_norm": 0.4027818441390991,
      "learning_rate": 0.0001898794046299167,
      "loss": 1.3049,
      "step": 23
    },
    {
      "epoch": 0.012809393555273867,
      "grad_norm": 0.4447319209575653,
      "learning_rate": 0.00018829475928589271,
      "loss": 1.5177,
      "step": 24
    },
    {
      "epoch": 0.013343118286743613,
      "grad_norm": 0.4240773618221283,
      "learning_rate": 0.00018660254037844388,
      "loss": 1.2123,
      "step": 25
    },
    {
      "epoch": 0.013876843018213356,
      "grad_norm": 0.39182350039482117,
      "learning_rate": 0.0001848048096156426,
      "loss": 1.2975,
      "step": 26
    },
    {
      "epoch": 0.014410567749683101,
      "grad_norm": 0.3561626374721527,
      "learning_rate": 0.00018290375725550417,
      "loss": 1.3176,
      "step": 27
    },
    {
      "epoch": 0.014944292481152845,
      "grad_norm": 0.41868481040000916,
      "learning_rate": 0.00018090169943749476,
      "loss": 1.497,
      "step": 28
    },
    {
      "epoch": 0.014944292481152845,
      "eval_loss": 1.2778455018997192,
      "eval_runtime": 133.835,
      "eval_samples_per_second": 11.791,
      "eval_steps_per_second": 5.895,
      "step": 28
    },
    {
      "epoch": 0.01547801721262259,
      "grad_norm": 0.39559242129325867,
      "learning_rate": 0.00017880107536067218,
      "loss": 1.5544,
      "step": 29
    },
    {
      "epoch": 0.016011741944092335,
      "grad_norm": 0.4642369747161865,
      "learning_rate": 0.0001766044443118978,
      "loss": 1.5329,
      "step": 30
    },
    {
      "epoch": 0.01654546667556208,
      "grad_norm": 0.4452536404132843,
      "learning_rate": 0.00017431448254773944,
      "loss": 1.4477,
      "step": 31
    },
    {
      "epoch": 0.017079191407031822,
      "grad_norm": 0.4180367588996887,
      "learning_rate": 0.0001719339800338651,
      "loss": 1.5047,
      "step": 32
    },
    {
      "epoch": 0.017612916138501567,
      "grad_norm": 0.34123241901397705,
      "learning_rate": 0.00016946583704589973,
      "loss": 1.2064,
      "step": 33
    },
    {
      "epoch": 0.018146640869971312,
      "grad_norm": 0.4111071228981018,
      "learning_rate": 0.00016691306063588583,
      "loss": 1.5182,
      "step": 34
    },
    {
      "epoch": 0.018680365601441058,
      "grad_norm": 0.3765980899333954,
      "learning_rate": 0.00016427876096865394,
      "loss": 1.4533,
      "step": 35
    },
    {
      "epoch": 0.018680365601441058,
      "eval_loss": 1.2552332878112793,
      "eval_runtime": 133.9537,
      "eval_samples_per_second": 11.78,
      "eval_steps_per_second": 5.89,
      "step": 35
    },
    {
      "epoch": 0.019214090332910803,
      "grad_norm": 0.35214561223983765,
      "learning_rate": 0.0001615661475325658,
      "loss": 1.3882,
      "step": 36
    },
    {
      "epoch": 0.019747815064380544,
      "grad_norm": 0.36351558566093445,
      "learning_rate": 0.00015877852522924732,
      "loss": 1.2277,
      "step": 37
    },
    {
      "epoch": 0.02028153979585029,
      "grad_norm": 0.4065183997154236,
      "learning_rate": 0.0001559192903470747,
      "loss": 0.9644,
      "step": 38
    },
    {
      "epoch": 0.020815264527320035,
      "grad_norm": 0.31548288464546204,
      "learning_rate": 0.0001529919264233205,
      "loss": 1.2103,
      "step": 39
    },
    {
      "epoch": 0.02134898925878978,
      "grad_norm": 0.34691786766052246,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.2275,
      "step": 40
    },
    {
      "epoch": 0.021882713990259525,
      "grad_norm": 0.3332786560058594,
      "learning_rate": 0.00014694715627858908,
      "loss": 1.3469,
      "step": 41
    },
    {
      "epoch": 0.022416438721729267,
      "grad_norm": 0.3993981182575226,
      "learning_rate": 0.00014383711467890774,
      "loss": 1.2165,
      "step": 42
    },
    {
      "epoch": 0.022416438721729267,
      "eval_loss": 1.240829586982727,
      "eval_runtime": 139.0014,
      "eval_samples_per_second": 11.352,
      "eval_steps_per_second": 5.676,
      "step": 42
    },
    {
      "epoch": 0.022950163453199012,
      "grad_norm": 0.3627195358276367,
      "learning_rate": 0.00014067366430758004,
      "loss": 1.1394,
      "step": 43
    },
    {
      "epoch": 0.023483888184668757,
      "grad_norm": 0.31742554903030396,
      "learning_rate": 0.00013746065934159123,
      "loss": 1.1885,
      "step": 44
    },
    {
      "epoch": 0.024017612916138503,
      "grad_norm": 0.3838454782962799,
      "learning_rate": 0.00013420201433256689,
      "loss": 1.3044,
      "step": 45
    },
    {
      "epoch": 0.024551337647608248,
      "grad_norm": 0.3152655363082886,
      "learning_rate": 0.00013090169943749476,
      "loss": 1.2796,
      "step": 46
    },
    {
      "epoch": 0.02508506237907799,
      "grad_norm": 0.34708547592163086,
      "learning_rate": 0.0001275637355816999,
      "loss": 1.2463,
      "step": 47
    },
    {
      "epoch": 0.025618787110547735,
      "grad_norm": 0.36045414209365845,
      "learning_rate": 0.00012419218955996676,
      "loss": 1.4435,
      "step": 48
    },
    {
      "epoch": 0.02615251184201748,
      "grad_norm": 0.3756614625453949,
      "learning_rate": 0.00012079116908177593,
      "loss": 1.1767,
      "step": 49
    },
    {
      "epoch": 0.02615251184201748,
      "eval_loss": 1.2297325134277344,
      "eval_runtime": 143.2278,
      "eval_samples_per_second": 11.017,
      "eval_steps_per_second": 5.509,
      "step": 49
    },
    {
      "epoch": 0.026686236573487225,
      "grad_norm": 0.4282926023006439,
      "learning_rate": 0.00011736481776669306,
      "loss": 1.3075,
      "step": 50
    },
    {
      "epoch": 0.027219961304956967,
      "grad_norm": 0.3769337236881256,
      "learning_rate": 0.00011391731009600654,
      "loss": 1.2137,
      "step": 51
    },
    {
      "epoch": 0.027753686036426712,
      "grad_norm": 0.3585554361343384,
      "learning_rate": 0.00011045284632676536,
      "loss": 1.1048,
      "step": 52
    },
    {
      "epoch": 0.028287410767896457,
      "grad_norm": 0.3299272954463959,
      "learning_rate": 0.00010697564737441252,
      "loss": 1.2815,
      "step": 53
    },
    {
      "epoch": 0.028821135499366202,
      "grad_norm": 0.3429305851459503,
      "learning_rate": 0.00010348994967025012,
      "loss": 1.2823,
      "step": 54
    },
    {
      "epoch": 0.029354860230835948,
      "grad_norm": 0.39360129833221436,
      "learning_rate": 0.0001,
      "loss": 1.0352,
      "step": 55
    },
    {
      "epoch": 0.02988858496230569,
      "grad_norm": 0.34267058968544006,
      "learning_rate": 9.651005032974994e-05,
      "loss": 0.9731,
      "step": 56
    },
    {
      "epoch": 0.02988858496230569,
      "eval_loss": 1.2223201990127563,
      "eval_runtime": 143.2351,
      "eval_samples_per_second": 11.017,
      "eval_steps_per_second": 5.508,
      "step": 56
    },
    {
      "epoch": 0.030422309693775434,
      "grad_norm": 0.39054739475250244,
      "learning_rate": 9.302435262558747e-05,
      "loss": 1.1894,
      "step": 57
    },
    {
      "epoch": 0.03095603442524518,
      "grad_norm": 0.3081369400024414,
      "learning_rate": 8.954715367323468e-05,
      "loss": 0.9732,
      "step": 58
    },
    {
      "epoch": 0.031489759156714925,
      "grad_norm": 0.43032369017601013,
      "learning_rate": 8.608268990399349e-05,
      "loss": 0.9788,
      "step": 59
    },
    {
      "epoch": 0.03202348388818467,
      "grad_norm": 0.3782951831817627,
      "learning_rate": 8.263518223330697e-05,
      "loss": 1.1651,
      "step": 60
    },
    {
      "epoch": 0.032557208619654415,
      "grad_norm": 0.3738667964935303,
      "learning_rate": 7.920883091822408e-05,
      "loss": 0.8571,
      "step": 61
    },
    {
      "epoch": 0.03309093335112416,
      "grad_norm": 0.3225691616535187,
      "learning_rate": 7.580781044003324e-05,
      "loss": 0.9112,
      "step": 62
    },
    {
      "epoch": 0.033624658082593906,
      "grad_norm": 0.2828129827976227,
      "learning_rate": 7.243626441830009e-05,
      "loss": 0.8316,
      "step": 63
    },
    {
      "epoch": 0.033624658082593906,
      "eval_loss": 1.2188873291015625,
      "eval_runtime": 142.4739,
      "eval_samples_per_second": 11.076,
      "eval_steps_per_second": 5.538,
      "step": 63
    },
    {
      "epoch": 0.034158382814063644,
      "grad_norm": 0.4024946391582489,
      "learning_rate": 6.909830056250527e-05,
      "loss": 1.4719,
      "step": 64
    },
    {
      "epoch": 0.03469210754553339,
      "grad_norm": 0.43514999747276306,
      "learning_rate": 6.579798566743314e-05,
      "loss": 1.3543,
      "step": 65
    },
    {
      "epoch": 0.035225832277003134,
      "grad_norm": 0.3096754550933838,
      "learning_rate": 6.25393406584088e-05,
      "loss": 0.9926,
      "step": 66
    },
    {
      "epoch": 0.03575955700847288,
      "grad_norm": 0.3676164448261261,
      "learning_rate": 5.9326335692419995e-05,
      "loss": 1.5319,
      "step": 67
    },
    {
      "epoch": 0.036293281739942625,
      "grad_norm": 0.4387390911579132,
      "learning_rate": 5.616288532109225e-05,
      "loss": 1.5463,
      "step": 68
    },
    {
      "epoch": 0.03682700647141237,
      "grad_norm": 0.38703230023384094,
      "learning_rate": 5.305284372141095e-05,
      "loss": 1.6212,
      "step": 69
    },
    {
      "epoch": 0.037360731202882115,
      "grad_norm": 0.3590051829814911,
      "learning_rate": 5.000000000000002e-05,
      "loss": 1.3272,
      "step": 70
    },
    {
      "epoch": 0.037360731202882115,
      "eval_loss": 1.2139812707901,
      "eval_runtime": 148.838,
      "eval_samples_per_second": 10.602,
      "eval_steps_per_second": 5.301,
      "step": 70
    },
    {
      "epoch": 0.03789445593435186,
      "grad_norm": 0.3637433350086212,
      "learning_rate": 4.700807357667952e-05,
      "loss": 1.1114,
      "step": 71
    },
    {
      "epoch": 0.038428180665821605,
      "grad_norm": 0.3315064013004303,
      "learning_rate": 4.4080709652925336e-05,
      "loss": 1.3231,
      "step": 72
    },
    {
      "epoch": 0.038961905397291344,
      "grad_norm": 0.4271906912326813,
      "learning_rate": 4.12214747707527e-05,
      "loss": 1.6108,
      "step": 73
    },
    {
      "epoch": 0.03949563012876109,
      "grad_norm": 0.4011762738227844,
      "learning_rate": 3.843385246743417e-05,
      "loss": 1.3892,
      "step": 74
    },
    {
      "epoch": 0.040029354860230834,
      "grad_norm": 0.35736286640167236,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 1.1504,
      "step": 75
    },
    {
      "epoch": 0.04056307959170058,
      "grad_norm": 0.29981729388237,
      "learning_rate": 3.308693936411421e-05,
      "loss": 0.9346,
      "step": 76
    },
    {
      "epoch": 0.041096804323170325,
      "grad_norm": 0.32428425550460815,
      "learning_rate": 3.053416295410026e-05,
      "loss": 1.1467,
      "step": 77
    },
    {
      "epoch": 0.041096804323170325,
      "eval_loss": 1.2112306356430054,
      "eval_runtime": 132.0592,
      "eval_samples_per_second": 11.949,
      "eval_steps_per_second": 5.975,
      "step": 77
    },
    {
      "epoch": 0.04163052905464007,
      "grad_norm": 0.31357115507125854,
      "learning_rate": 2.8066019966134904e-05,
      "loss": 1.1449,
      "step": 78
    },
    {
      "epoch": 0.042164253786109815,
      "grad_norm": 0.28956103324890137,
      "learning_rate": 2.5685517452260567e-05,
      "loss": 0.9929,
      "step": 79
    },
    {
      "epoch": 0.04269797851757956,
      "grad_norm": 0.3215799927711487,
      "learning_rate": 2.339555568810221e-05,
      "loss": 0.95,
      "step": 80
    },
    {
      "epoch": 0.043231703249049305,
      "grad_norm": 0.32306861877441406,
      "learning_rate": 2.119892463932781e-05,
      "loss": 1.3405,
      "step": 81
    },
    {
      "epoch": 0.04376542798051905,
      "grad_norm": 0.34628739953041077,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 1.3823,
      "step": 82
    },
    {
      "epoch": 0.04429915271198879,
      "grad_norm": 0.41880619525909424,
      "learning_rate": 1.7096242744495837e-05,
      "loss": 1.2145,
      "step": 83
    },
    {
      "epoch": 0.044832877443458534,
      "grad_norm": 0.34489524364471436,
      "learning_rate": 1.5195190384357404e-05,
      "loss": 1.2043,
      "step": 84
    },
    {
      "epoch": 0.044832877443458534,
      "eval_loss": 1.209867000579834,
      "eval_runtime": 131.1054,
      "eval_samples_per_second": 12.036,
      "eval_steps_per_second": 6.018,
      "step": 84
    },
    {
      "epoch": 0.04536660217492828,
      "grad_norm": 0.3454246520996094,
      "learning_rate": 1.339745962155613e-05,
      "loss": 1.4326,
      "step": 85
    },
    {
      "epoch": 0.045900326906398024,
      "grad_norm": 0.34903326630592346,
      "learning_rate": 1.1705240714107302e-05,
      "loss": 1.2395,
      "step": 86
    },
    {
      "epoch": 0.04643405163786777,
      "grad_norm": 0.36401617527008057,
      "learning_rate": 1.0120595370083318e-05,
      "loss": 1.1309,
      "step": 87
    },
    {
      "epoch": 0.046967776369337515,
      "grad_norm": 0.3476174473762512,
      "learning_rate": 8.645454235739903e-06,
      "loss": 1.1396,
      "step": 88
    },
    {
      "epoch": 0.04750150110080726,
      "grad_norm": 0.3433617353439331,
      "learning_rate": 7.281614543321269e-06,
      "loss": 1.3106,
      "step": 89
    },
    {
      "epoch": 0.048035225832277005,
      "grad_norm": 0.3952890932559967,
      "learning_rate": 6.030737921409169e-06,
      "loss": 1.3647,
      "step": 90
    },
    {
      "epoch": 0.04856895056374675,
      "grad_norm": 0.3324783444404602,
      "learning_rate": 4.8943483704846475e-06,
      "loss": 1.3629,
      "step": 91
    },
    {
      "epoch": 0.04856895056374675,
      "eval_loss": 1.209067463874817,
      "eval_runtime": 131.1887,
      "eval_samples_per_second": 12.028,
      "eval_steps_per_second": 6.014,
      "step": 91
    },
    {
      "epoch": 0.049102675295216495,
      "grad_norm": 0.3305179476737976,
      "learning_rate": 3.873830406168111e-06,
      "loss": 0.9684,
      "step": 92
    },
    {
      "epoch": 0.049636400026686234,
      "grad_norm": 0.3130112290382385,
      "learning_rate": 2.970427372400353e-06,
      "loss": 1.4122,
      "step": 93
    },
    {
      "epoch": 0.05017012475815598,
      "grad_norm": 0.4082207977771759,
      "learning_rate": 2.1852399266194314e-06,
      "loss": 1.0472,
      "step": 94
    },
    {
      "epoch": 0.050703849489625724,
      "grad_norm": 0.37707728147506714,
      "learning_rate": 1.5192246987791981e-06,
      "loss": 1.3998,
      "step": 95
    },
    {
      "epoch": 0.05123757422109547,
      "grad_norm": 0.3092212677001953,
      "learning_rate": 9.731931258429638e-07,
      "loss": 1.3452,
      "step": 96
    },
    {
      "epoch": 0.051771298952565215,
      "grad_norm": 0.31625714898109436,
      "learning_rate": 5.478104631726711e-07,
      "loss": 1.3001,
      "step": 97
    },
    {
      "epoch": 0.05230502368403496,
      "grad_norm": 0.39429420232772827,
      "learning_rate": 2.4359497401758024e-07,
      "loss": 1.2862,
      "step": 98
    },
    {
      "epoch": 0.05230502368403496,
      "eval_loss": 1.2093968391418457,
      "eval_runtime": 131.0498,
      "eval_samples_per_second": 12.041,
      "eval_steps_per_second": 6.021,
      "step": 98
    },
    {
      "epoch": 0.052838748415504705,
      "grad_norm": 0.3808484375476837,
      "learning_rate": 6.09172980904238e-08,
      "loss": 1.3516,
      "step": 99
    },
    {
      "epoch": 0.05337247314697445,
      "grad_norm": 0.33221495151519775,
      "learning_rate": 0.0,
      "loss": 0.9949,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8056777010577408.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}