{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05337247314697445, "eval_steps": 7, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005337247314697444, "grad_norm": 0.3629896640777588, "learning_rate": 2e-05, "loss": 1.3023, "step": 1 }, { "epoch": 0.0005337247314697444, "eval_loss": 1.6696527004241943, "eval_runtime": 138.2798, "eval_samples_per_second": 11.412, "eval_steps_per_second": 5.706, "step": 1 }, { "epoch": 0.0010674494629394889, "grad_norm": 0.37523019313812256, "learning_rate": 4e-05, "loss": 1.3966, "step": 2 }, { "epoch": 0.0016011741944092334, "grad_norm": 0.39908841252326965, "learning_rate": 6e-05, "loss": 1.2988, "step": 3 }, { "epoch": 0.0021348989258789777, "grad_norm": 0.44918331503868103, "learning_rate": 8e-05, "loss": 1.8351, "step": 4 }, { "epoch": 0.0026686236573487225, "grad_norm": 0.4540751874446869, "learning_rate": 0.0001, "loss": 1.6159, "step": 5 }, { "epoch": 0.003202348388818467, "grad_norm": 0.48438403010368347, "learning_rate": 0.00012, "loss": 1.7997, "step": 6 }, { "epoch": 0.003736073120288211, "grad_norm": 0.47967973351478577, "learning_rate": 0.00014, "loss": 1.848, "step": 7 }, { "epoch": 0.003736073120288211, "eval_loss": 1.5880990028381348, "eval_runtime": 136.8431, "eval_samples_per_second": 11.531, "eval_steps_per_second": 5.766, "step": 7 }, { "epoch": 0.0042697978517579555, "grad_norm": 0.443247526884079, "learning_rate": 0.00016, "loss": 1.3893, "step": 8 }, { "epoch": 0.004803522583227701, "grad_norm": 0.4176405370235443, "learning_rate": 0.00018, "loss": 1.6137, "step": 9 }, { "epoch": 0.005337247314697445, "grad_norm": 0.42031896114349365, "learning_rate": 0.0002, "loss": 1.5092, "step": 10 }, { "epoch": 0.005870972046167189, "grad_norm": 0.3552657961845398, "learning_rate": 0.0001999390827019096, "loss": 1.4142, "step": 11 }, { "epoch": 0.006404696777636934, "grad_norm": 0.5527915358543396, "learning_rate": 0.00019975640502598244, "loss": 1.6114, "step": 12 }, { "epoch": 0.006938421509106678, "grad_norm": 0.4900396168231964, "learning_rate": 0.00019945218953682734, "loss": 1.3065, "step": 13 }, { "epoch": 0.007472146240576422, "grad_norm": 0.41337817907333374, "learning_rate": 0.00019902680687415705, "loss": 1.3956, "step": 14 }, { "epoch": 0.007472146240576422, "eval_loss": 1.3877164125442505, "eval_runtime": 132.633, "eval_samples_per_second": 11.897, "eval_steps_per_second": 5.949, "step": 14 }, { "epoch": 0.008005870972046168, "grad_norm": 0.44572773575782776, "learning_rate": 0.00019848077530122083, "loss": 1.5852, "step": 15 }, { "epoch": 0.008539595703515911, "grad_norm": 0.413870632648468, "learning_rate": 0.00019781476007338058, "loss": 1.2316, "step": 16 }, { "epoch": 0.009073320434985656, "grad_norm": 0.41110166907310486, "learning_rate": 0.00019702957262759965, "loss": 1.4972, "step": 17 }, { "epoch": 0.009607045166455401, "grad_norm": 0.3463555872440338, "learning_rate": 0.0001961261695938319, "loss": 1.1697, "step": 18 }, { "epoch": 0.010140769897925145, "grad_norm": 0.3517301678657532, "learning_rate": 0.00019510565162951537, "loss": 1.2792, "step": 19 }, { "epoch": 0.01067449462939489, "grad_norm": 0.3800961375236511, "learning_rate": 0.00019396926207859084, "loss": 1.3721, "step": 20 }, { "epoch": 0.011208219360864633, "grad_norm": 0.46215319633483887, "learning_rate": 0.00019271838545667876, "loss": 1.2391, "step": 21 }, { "epoch": 0.011208219360864633, "eval_loss": 1.3272384405136108, "eval_runtime": 
142.2392, "eval_samples_per_second": 11.094, "eval_steps_per_second": 5.547, "step": 21 }, { "epoch": 0.011741944092334379, "grad_norm": 0.4401664733886719, "learning_rate": 0.0001913545457642601, "loss": 1.279, "step": 22 }, { "epoch": 0.012275668823804124, "grad_norm": 0.4027818441390991, "learning_rate": 0.0001898794046299167, "loss": 1.3049, "step": 23 }, { "epoch": 0.012809393555273867, "grad_norm": 0.4447319209575653, "learning_rate": 0.00018829475928589271, "loss": 1.5177, "step": 24 }, { "epoch": 0.013343118286743613, "grad_norm": 0.4240773618221283, "learning_rate": 0.00018660254037844388, "loss": 1.2123, "step": 25 }, { "epoch": 0.013876843018213356, "grad_norm": 0.39182350039482117, "learning_rate": 0.0001848048096156426, "loss": 1.2975, "step": 26 }, { "epoch": 0.014410567749683101, "grad_norm": 0.3561626374721527, "learning_rate": 0.00018290375725550417, "loss": 1.3176, "step": 27 }, { "epoch": 0.014944292481152845, "grad_norm": 0.41868481040000916, "learning_rate": 0.00018090169943749476, "loss": 1.497, "step": 28 }, { "epoch": 0.014944292481152845, "eval_loss": 1.2778455018997192, "eval_runtime": 133.835, "eval_samples_per_second": 11.791, "eval_steps_per_second": 5.895, "step": 28 }, { "epoch": 0.01547801721262259, "grad_norm": 0.39559242129325867, "learning_rate": 0.00017880107536067218, "loss": 1.5544, "step": 29 }, { "epoch": 0.016011741944092335, "grad_norm": 0.4642369747161865, "learning_rate": 0.0001766044443118978, "loss": 1.5329, "step": 30 }, { "epoch": 0.01654546667556208, "grad_norm": 0.4452536404132843, "learning_rate": 0.00017431448254773944, "loss": 1.4477, "step": 31 }, { "epoch": 0.017079191407031822, "grad_norm": 0.4180367588996887, "learning_rate": 0.0001719339800338651, "loss": 1.5047, "step": 32 }, { "epoch": 0.017612916138501567, "grad_norm": 0.34123241901397705, "learning_rate": 0.00016946583704589973, "loss": 1.2064, "step": 33 }, { "epoch": 0.018146640869971312, "grad_norm": 0.4111071228981018, "learning_rate": 0.00016691306063588583, "loss": 1.5182, "step": 34 }, { "epoch": 0.018680365601441058, "grad_norm": 0.3765980899333954, "learning_rate": 0.00016427876096865394, "loss": 1.4533, "step": 35 }, { "epoch": 0.018680365601441058, "eval_loss": 1.2552332878112793, "eval_runtime": 133.9537, "eval_samples_per_second": 11.78, "eval_steps_per_second": 5.89, "step": 35 }, { "epoch": 0.019214090332910803, "grad_norm": 0.35214561223983765, "learning_rate": 0.0001615661475325658, "loss": 1.3882, "step": 36 }, { "epoch": 0.019747815064380544, "grad_norm": 0.36351558566093445, "learning_rate": 0.00015877852522924732, "loss": 1.2277, "step": 37 }, { "epoch": 0.02028153979585029, "grad_norm": 0.4065183997154236, "learning_rate": 0.0001559192903470747, "loss": 0.9644, "step": 38 }, { "epoch": 0.020815264527320035, "grad_norm": 0.31548288464546204, "learning_rate": 0.0001529919264233205, "loss": 1.2103, "step": 39 }, { "epoch": 0.02134898925878978, "grad_norm": 0.34691786766052246, "learning_rate": 0.00015000000000000001, "loss": 1.2275, "step": 40 }, { "epoch": 0.021882713990259525, "grad_norm": 0.3332786560058594, "learning_rate": 0.00014694715627858908, "loss": 1.3469, "step": 41 }, { "epoch": 0.022416438721729267, "grad_norm": 0.3993981182575226, "learning_rate": 0.00014383711467890774, "loss": 1.2165, "step": 42 }, { "epoch": 0.022416438721729267, "eval_loss": 1.240829586982727, "eval_runtime": 139.0014, "eval_samples_per_second": 11.352, "eval_steps_per_second": 5.676, "step": 42 }, { "epoch": 0.022950163453199012, "grad_norm": 0.3627195358276367, 
"learning_rate": 0.00014067366430758004, "loss": 1.1394, "step": 43 }, { "epoch": 0.023483888184668757, "grad_norm": 0.31742554903030396, "learning_rate": 0.00013746065934159123, "loss": 1.1885, "step": 44 }, { "epoch": 0.024017612916138503, "grad_norm": 0.3838454782962799, "learning_rate": 0.00013420201433256689, "loss": 1.3044, "step": 45 }, { "epoch": 0.024551337647608248, "grad_norm": 0.3152655363082886, "learning_rate": 0.00013090169943749476, "loss": 1.2796, "step": 46 }, { "epoch": 0.02508506237907799, "grad_norm": 0.34708547592163086, "learning_rate": 0.0001275637355816999, "loss": 1.2463, "step": 47 }, { "epoch": 0.025618787110547735, "grad_norm": 0.36045414209365845, "learning_rate": 0.00012419218955996676, "loss": 1.4435, "step": 48 }, { "epoch": 0.02615251184201748, "grad_norm": 0.3756614625453949, "learning_rate": 0.00012079116908177593, "loss": 1.1767, "step": 49 }, { "epoch": 0.02615251184201748, "eval_loss": 1.2297325134277344, "eval_runtime": 143.2278, "eval_samples_per_second": 11.017, "eval_steps_per_second": 5.509, "step": 49 }, { "epoch": 0.026686236573487225, "grad_norm": 0.4282926023006439, "learning_rate": 0.00011736481776669306, "loss": 1.3075, "step": 50 }, { "epoch": 0.027219961304956967, "grad_norm": 0.3769337236881256, "learning_rate": 0.00011391731009600654, "loss": 1.2137, "step": 51 }, { "epoch": 0.027753686036426712, "grad_norm": 0.3585554361343384, "learning_rate": 0.00011045284632676536, "loss": 1.1048, "step": 52 }, { "epoch": 0.028287410767896457, "grad_norm": 0.3299272954463959, "learning_rate": 0.00010697564737441252, "loss": 1.2815, "step": 53 }, { "epoch": 0.028821135499366202, "grad_norm": 0.3429305851459503, "learning_rate": 0.00010348994967025012, "loss": 1.2823, "step": 54 }, { "epoch": 0.029354860230835948, "grad_norm": 0.39360129833221436, "learning_rate": 0.0001, "loss": 1.0352, "step": 55 }, { "epoch": 0.02988858496230569, "grad_norm": 0.34267058968544006, "learning_rate": 9.651005032974994e-05, "loss": 0.9731, "step": 56 }, { "epoch": 0.02988858496230569, "eval_loss": 1.2223201990127563, "eval_runtime": 143.2351, "eval_samples_per_second": 11.017, "eval_steps_per_second": 5.508, "step": 56 }, { "epoch": 0.030422309693775434, "grad_norm": 0.39054739475250244, "learning_rate": 9.302435262558747e-05, "loss": 1.1894, "step": 57 }, { "epoch": 0.03095603442524518, "grad_norm": 0.3081369400024414, "learning_rate": 8.954715367323468e-05, "loss": 0.9732, "step": 58 }, { "epoch": 0.031489759156714925, "grad_norm": 0.43032369017601013, "learning_rate": 8.608268990399349e-05, "loss": 0.9788, "step": 59 }, { "epoch": 0.03202348388818467, "grad_norm": 0.3782951831817627, "learning_rate": 8.263518223330697e-05, "loss": 1.1651, "step": 60 }, { "epoch": 0.032557208619654415, "grad_norm": 0.3738667964935303, "learning_rate": 7.920883091822408e-05, "loss": 0.8571, "step": 61 }, { "epoch": 0.03309093335112416, "grad_norm": 0.3225691616535187, "learning_rate": 7.580781044003324e-05, "loss": 0.9112, "step": 62 }, { "epoch": 0.033624658082593906, "grad_norm": 0.2828129827976227, "learning_rate": 7.243626441830009e-05, "loss": 0.8316, "step": 63 }, { "epoch": 0.033624658082593906, "eval_loss": 1.2188873291015625, "eval_runtime": 142.4739, "eval_samples_per_second": 11.076, "eval_steps_per_second": 5.538, "step": 63 }, { "epoch": 0.034158382814063644, "grad_norm": 0.4024946391582489, "learning_rate": 6.909830056250527e-05, "loss": 1.4719, "step": 64 }, { "epoch": 0.03469210754553339, "grad_norm": 0.43514999747276306, "learning_rate": 6.579798566743314e-05, "loss": 
1.3543, "step": 65 }, { "epoch": 0.035225832277003134, "grad_norm": 0.3096754550933838, "learning_rate": 6.25393406584088e-05, "loss": 0.9926, "step": 66 }, { "epoch": 0.03575955700847288, "grad_norm": 0.3676164448261261, "learning_rate": 5.9326335692419995e-05, "loss": 1.5319, "step": 67 }, { "epoch": 0.036293281739942625, "grad_norm": 0.4387390911579132, "learning_rate": 5.616288532109225e-05, "loss": 1.5463, "step": 68 }, { "epoch": 0.03682700647141237, "grad_norm": 0.38703230023384094, "learning_rate": 5.305284372141095e-05, "loss": 1.6212, "step": 69 }, { "epoch": 0.037360731202882115, "grad_norm": 0.3590051829814911, "learning_rate": 5.000000000000002e-05, "loss": 1.3272, "step": 70 }, { "epoch": 0.037360731202882115, "eval_loss": 1.2139812707901, "eval_runtime": 148.838, "eval_samples_per_second": 10.602, "eval_steps_per_second": 5.301, "step": 70 }, { "epoch": 0.03789445593435186, "grad_norm": 0.3637433350086212, "learning_rate": 4.700807357667952e-05, "loss": 1.1114, "step": 71 }, { "epoch": 0.038428180665821605, "grad_norm": 0.3315064013004303, "learning_rate": 4.4080709652925336e-05, "loss": 1.3231, "step": 72 }, { "epoch": 0.038961905397291344, "grad_norm": 0.4271906912326813, "learning_rate": 4.12214747707527e-05, "loss": 1.6108, "step": 73 }, { "epoch": 0.03949563012876109, "grad_norm": 0.4011762738227844, "learning_rate": 3.843385246743417e-05, "loss": 1.3892, "step": 74 }, { "epoch": 0.040029354860230834, "grad_norm": 0.35736286640167236, "learning_rate": 3.5721239031346066e-05, "loss": 1.1504, "step": 75 }, { "epoch": 0.04056307959170058, "grad_norm": 0.29981729388237, "learning_rate": 3.308693936411421e-05, "loss": 0.9346, "step": 76 }, { "epoch": 0.041096804323170325, "grad_norm": 0.32428425550460815, "learning_rate": 3.053416295410026e-05, "loss": 1.1467, "step": 77 }, { "epoch": 0.041096804323170325, "eval_loss": 1.2112306356430054, "eval_runtime": 132.0592, "eval_samples_per_second": 11.949, "eval_steps_per_second": 5.975, "step": 77 }, { "epoch": 0.04163052905464007, "grad_norm": 0.31357115507125854, "learning_rate": 2.8066019966134904e-05, "loss": 1.1449, "step": 78 }, { "epoch": 0.042164253786109815, "grad_norm": 0.28956103324890137, "learning_rate": 2.5685517452260567e-05, "loss": 0.9929, "step": 79 }, { "epoch": 0.04269797851757956, "grad_norm": 0.3215799927711487, "learning_rate": 2.339555568810221e-05, "loss": 0.95, "step": 80 }, { "epoch": 0.043231703249049305, "grad_norm": 0.32306861877441406, "learning_rate": 2.119892463932781e-05, "loss": 1.3405, "step": 81 }, { "epoch": 0.04376542798051905, "grad_norm": 0.34628739953041077, "learning_rate": 1.9098300562505266e-05, "loss": 1.3823, "step": 82 }, { "epoch": 0.04429915271198879, "grad_norm": 0.41880619525909424, "learning_rate": 1.7096242744495837e-05, "loss": 1.2145, "step": 83 }, { "epoch": 0.044832877443458534, "grad_norm": 0.34489524364471436, "learning_rate": 1.5195190384357404e-05, "loss": 1.2043, "step": 84 }, { "epoch": 0.044832877443458534, "eval_loss": 1.209867000579834, "eval_runtime": 131.1054, "eval_samples_per_second": 12.036, "eval_steps_per_second": 6.018, "step": 84 }, { "epoch": 0.04536660217492828, "grad_norm": 0.3454246520996094, "learning_rate": 1.339745962155613e-05, "loss": 1.4326, "step": 85 }, { "epoch": 0.045900326906398024, "grad_norm": 0.34903326630592346, "learning_rate": 1.1705240714107302e-05, "loss": 1.2395, "step": 86 }, { "epoch": 0.04643405163786777, "grad_norm": 0.36401617527008057, "learning_rate": 1.0120595370083318e-05, "loss": 1.1309, "step": 87 }, { "epoch": 
0.046967776369337515, "grad_norm": 0.3476174473762512, "learning_rate": 8.645454235739903e-06, "loss": 1.1396, "step": 88 }, { "epoch": 0.04750150110080726, "grad_norm": 0.3433617353439331, "learning_rate": 7.281614543321269e-06, "loss": 1.3106, "step": 89 }, { "epoch": 0.048035225832277005, "grad_norm": 0.3952890932559967, "learning_rate": 6.030737921409169e-06, "loss": 1.3647, "step": 90 }, { "epoch": 0.04856895056374675, "grad_norm": 0.3324783444404602, "learning_rate": 4.8943483704846475e-06, "loss": 1.3629, "step": 91 }, { "epoch": 0.04856895056374675, "eval_loss": 1.209067463874817, "eval_runtime": 131.1887, "eval_samples_per_second": 12.028, "eval_steps_per_second": 6.014, "step": 91 }, { "epoch": 0.049102675295216495, "grad_norm": 0.3305179476737976, "learning_rate": 3.873830406168111e-06, "loss": 0.9684, "step": 92 }, { "epoch": 0.049636400026686234, "grad_norm": 0.3130112290382385, "learning_rate": 2.970427372400353e-06, "loss": 1.4122, "step": 93 }, { "epoch": 0.05017012475815598, "grad_norm": 0.4082207977771759, "learning_rate": 2.1852399266194314e-06, "loss": 1.0472, "step": 94 }, { "epoch": 0.050703849489625724, "grad_norm": 0.37707728147506714, "learning_rate": 1.5192246987791981e-06, "loss": 1.3998, "step": 95 }, { "epoch": 0.05123757422109547, "grad_norm": 0.3092212677001953, "learning_rate": 9.731931258429638e-07, "loss": 1.3452, "step": 96 }, { "epoch": 0.051771298952565215, "grad_norm": 0.31625714898109436, "learning_rate": 5.478104631726711e-07, "loss": 1.3001, "step": 97 }, { "epoch": 0.05230502368403496, "grad_norm": 0.39429420232772827, "learning_rate": 2.4359497401758024e-07, "loss": 1.2862, "step": 98 }, { "epoch": 0.05230502368403496, "eval_loss": 1.2093968391418457, "eval_runtime": 131.0498, "eval_samples_per_second": 12.041, "eval_steps_per_second": 6.021, "step": 98 }, { "epoch": 0.052838748415504705, "grad_norm": 0.3808484375476837, "learning_rate": 6.09172980904238e-08, "loss": 1.3516, "step": 99 }, { "epoch": 0.05337247314697445, "grad_norm": 0.33221495151519775, "learning_rate": 0.0, "loss": 0.9949, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8056777010577408.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }