{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9631901840490797, "eval_steps": 41, "global_step": 326, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006134969325153374, "grad_norm": 0.10048680007457733, "learning_rate": 1.0000000000000002e-06, "loss": 0.6143, "step": 1 }, { "epoch": 0.006134969325153374, "eval_loss": 0.49658429622650146, "eval_runtime": 22.5462, "eval_samples_per_second": 8.161, "eval_steps_per_second": 1.02, "step": 1 }, { "epoch": 0.012269938650306749, "grad_norm": 0.08257655799388885, "learning_rate": 2.0000000000000003e-06, "loss": 0.4536, "step": 2 }, { "epoch": 0.018404907975460124, "grad_norm": 0.10207108408212662, "learning_rate": 3e-06, "loss": 0.5374, "step": 3 }, { "epoch": 0.024539877300613498, "grad_norm": 0.07208588719367981, "learning_rate": 4.000000000000001e-06, "loss": 0.4189, "step": 4 }, { "epoch": 0.03067484662576687, "grad_norm": 0.08989891409873962, "learning_rate": 5e-06, "loss": 0.4773, "step": 5 }, { "epoch": 0.03680981595092025, "grad_norm": 0.07754336297512054, "learning_rate": 6e-06, "loss": 0.5562, "step": 6 }, { "epoch": 0.04294478527607362, "grad_norm": 0.10505057871341705, "learning_rate": 7e-06, "loss": 0.5093, "step": 7 }, { "epoch": 0.049079754601226995, "grad_norm": 0.07071765512228012, "learning_rate": 8.000000000000001e-06, "loss": 0.4337, "step": 8 }, { "epoch": 0.05521472392638037, "grad_norm": 0.0959077998995781, "learning_rate": 9e-06, "loss": 0.5716, "step": 9 }, { "epoch": 0.06134969325153374, "grad_norm": 0.08926673978567123, "learning_rate": 1e-05, "loss": 0.5525, "step": 10 }, { "epoch": 0.06748466257668712, "grad_norm": 0.09146387130022049, "learning_rate": 9.999752906107043e-06, "loss": 0.4638, "step": 11 }, { "epoch": 0.0736196319018405, "grad_norm": 0.08329001069068909, "learning_rate": 9.999011648850328e-06, "loss": 0.3994, "step": 12 }, { "epoch": 0.07975460122699386, "grad_norm": 0.09074167907238007, "learning_rate": 9.997776301493914e-06, "loss": 0.6013, "step": 13 }, { "epoch": 0.08588957055214724, "grad_norm": 0.08691710978746414, "learning_rate": 9.99604698613651e-06, "loss": 0.4411, "step": 14 }, { "epoch": 0.09202453987730061, "grad_norm": 0.09161891788244247, "learning_rate": 9.993823873699427e-06, "loss": 0.5256, "step": 15 }, { "epoch": 0.09815950920245399, "grad_norm": 0.10627757757902145, "learning_rate": 9.991107183909665e-06, "loss": 0.6318, "step": 16 }, { "epoch": 0.10429447852760736, "grad_norm": 0.09262832999229431, "learning_rate": 9.98789718527821e-06, "loss": 0.5003, "step": 17 }, { "epoch": 0.11042944785276074, "grad_norm": 0.11589068174362183, "learning_rate": 9.98419419507348e-06, "loss": 0.7078, "step": 18 }, { "epoch": 0.1165644171779141, "grad_norm": 0.10624095797538757, "learning_rate": 9.979998579289985e-06, "loss": 0.6702, "step": 19 }, { "epoch": 0.12269938650306748, "grad_norm": 0.06414097547531128, "learning_rate": 9.975310752612138e-06, "loss": 0.353, "step": 20 }, { "epoch": 0.12883435582822086, "grad_norm": 0.11902766674757004, "learning_rate": 9.970131178373276e-06, "loss": 0.5749, "step": 21 }, { "epoch": 0.13496932515337423, "grad_norm": 0.09129296988248825, "learning_rate": 9.964460368509868e-06, "loss": 0.4984, "step": 22 }, { "epoch": 0.1411042944785276, "grad_norm": 0.09726156294345856, "learning_rate": 9.958298883510904e-06, "loss": 0.476, "step": 23 }, { "epoch": 0.147239263803681, "grad_norm": 0.09123346209526062, "learning_rate": 9.951647332362511e-06, "loss": 0.4627, "step": 24 }, { "epoch": 0.15337423312883436, "grad_norm": 0.11005334556102753, "learning_rate": 9.944506372487754e-06, "loss": 0.6342, "step": 25 }, { "epoch": 0.15950920245398773, "grad_norm": 0.08414352685213089, "learning_rate": 9.936876709681668e-06, "loss": 0.4311, "step": 26 }, { "epoch": 0.1656441717791411, "grad_norm": 0.07989637553691864, "learning_rate": 9.928759098041482e-06, "loss": 0.5064, "step": 27 }, { "epoch": 0.17177914110429449, "grad_norm": 0.08479982614517212, "learning_rate": 9.920154339892104e-06, "loss": 0.406, "step": 28 }, { "epoch": 0.17791411042944785, "grad_norm": 0.09588364511728287, "learning_rate": 9.911063285706808e-06, "loss": 0.5324, "step": 29 }, { "epoch": 0.18404907975460122, "grad_norm": 0.10101877897977829, "learning_rate": 9.901486834023182e-06, "loss": 0.5151, "step": 30 }, { "epoch": 0.1901840490797546, "grad_norm": 0.10507562756538391, "learning_rate": 9.891425931354316e-06, "loss": 0.5903, "step": 31 }, { "epoch": 0.19631901840490798, "grad_norm": 0.09090801328420639, "learning_rate": 9.880881572095255e-06, "loss": 0.326, "step": 32 }, { "epoch": 0.20245398773006135, "grad_norm": 0.08976439386606216, "learning_rate": 9.869854798424709e-06, "loss": 0.3729, "step": 33 }, { "epoch": 0.2085889570552147, "grad_norm": 0.10010894387960434, "learning_rate": 9.85834670020205e-06, "loss": 0.4083, "step": 34 }, { "epoch": 0.2147239263803681, "grad_norm": 0.11611686646938324, "learning_rate": 9.846358414859598e-06, "loss": 0.46, "step": 35 }, { "epoch": 0.22085889570552147, "grad_norm": 0.08281069248914719, "learning_rate": 9.833891127290186e-06, "loss": 0.4027, "step": 36 }, { "epoch": 0.22699386503067484, "grad_norm": 0.07023394107818604, "learning_rate": 9.820946069730067e-06, "loss": 0.3775, "step": 37 }, { "epoch": 0.2331288343558282, "grad_norm": 0.07550126314163208, "learning_rate": 9.807524521637103e-06, "loss": 0.3316, "step": 38 }, { "epoch": 0.2392638036809816, "grad_norm": 0.09305860102176666, "learning_rate": 9.793627809564324e-06, "loss": 0.5796, "step": 39 }, { "epoch": 0.24539877300613497, "grad_norm": 0.07182462513446808, "learning_rate": 9.779257307028805e-06, "loss": 0.389, "step": 40 }, { "epoch": 0.25153374233128833, "grad_norm": 0.0985904186964035, "learning_rate": 9.76441443437591e-06, "loss": 0.5509, "step": 41 }, { "epoch": 0.25153374233128833, "eval_loss": 0.4608064889907837, "eval_runtime": 22.8858, "eval_samples_per_second": 8.04, "eval_steps_per_second": 1.005, "step": 41 }, { "epoch": 0.25766871165644173, "grad_norm": 0.07043192535638809, "learning_rate": 9.749100658638914e-06, "loss": 0.3804, "step": 42 }, { "epoch": 0.26380368098159507, "grad_norm": 0.0967957079410553, "learning_rate": 9.733317493394004e-06, "loss": 0.5316, "step": 43 }, { "epoch": 0.26993865030674846, "grad_norm": 0.07897109538316727, "learning_rate": 9.717066498610673e-06, "loss": 0.4192, "step": 44 }, { "epoch": 0.27607361963190186, "grad_norm": 0.10683301091194153, "learning_rate": 9.700349280497552e-06, "loss": 0.5923, "step": 45 }, { "epoch": 0.2822085889570552, "grad_norm": 0.07347994297742844, "learning_rate": 9.68316749134364e-06, "loss": 0.3696, "step": 46 }, { "epoch": 0.2883435582822086, "grad_norm": 0.0915137529373169, "learning_rate": 9.665522829355005e-06, "loss": 0.5799, "step": 47 }, { "epoch": 0.294478527607362, "grad_norm": 0.08099963515996933, "learning_rate": 9.647417038486936e-06, "loss": 0.4504, "step": 48 }, { "epoch": 0.3006134969325153, "grad_norm": 0.11759970337152481, "learning_rate": 9.628851908271572e-06, "loss": 0.6128, "step": 49 }, { "epoch": 0.3067484662576687, "grad_norm": 0.11251598596572876, "learning_rate": 9.609829273641034e-06, "loss": 0.5497, "step": 50 }, { "epoch": 0.3128834355828221, "grad_norm": 0.08890489488840103, "learning_rate": 9.590351014746059e-06, "loss": 0.5293, "step": 51 }, { "epoch": 0.31901840490797545, "grad_norm": 0.08688578754663467, "learning_rate": 9.570419056770174e-06, "loss": 0.4923, "step": 52 }, { "epoch": 0.32515337423312884, "grad_norm": 0.08014731109142303, "learning_rate": 9.550035369739416e-06, "loss": 0.4639, "step": 53 }, { "epoch": 0.3312883435582822, "grad_norm": 0.08636078238487244, "learning_rate": 9.529201968327618e-06, "loss": 0.3038, "step": 54 }, { "epoch": 0.3374233128834356, "grad_norm": 0.06179236248135567, "learning_rate": 9.50792091165728e-06, "loss": 0.3302, "step": 55 }, { "epoch": 0.34355828220858897, "grad_norm": 0.06599953025579453, "learning_rate": 9.486194303096062e-06, "loss": 0.2632, "step": 56 }, { "epoch": 0.3496932515337423, "grad_norm": 0.06800541281700134, "learning_rate": 9.464024290048879e-06, "loss": 0.4134, "step": 57 }, { "epoch": 0.3558282208588957, "grad_norm": 0.07703054696321487, "learning_rate": 9.44141306374566e-06, "loss": 0.4175, "step": 58 }, { "epoch": 0.3619631901840491, "grad_norm": 0.06939573585987091, "learning_rate": 9.418362859024781e-06, "loss": 0.3076, "step": 59 }, { "epoch": 0.36809815950920244, "grad_norm": 0.07432107627391815, "learning_rate": 9.39487595411217e-06, "loss": 0.4667, "step": 60 }, { "epoch": 0.37423312883435583, "grad_norm": 0.08580990135669708, "learning_rate": 9.37095467039613e-06, "loss": 0.4878, "step": 61 }, { "epoch": 0.3803680981595092, "grad_norm": 0.07800393551588058, "learning_rate": 9.346601372197914e-06, "loss": 0.4484, "step": 62 }, { "epoch": 0.38650306748466257, "grad_norm": 0.08790121227502823, "learning_rate": 9.32181846653802e-06, "loss": 0.5667, "step": 63 }, { "epoch": 0.39263803680981596, "grad_norm": 0.07062779366970062, "learning_rate": 9.296608402898306e-06, "loss": 0.3125, "step": 64 }, { "epoch": 0.3987730061349693, "grad_norm": 0.08514374494552612, "learning_rate": 9.270973672979877e-06, "loss": 0.5389, "step": 65 }, { "epoch": 0.4049079754601227, "grad_norm": 0.09363362193107605, "learning_rate": 9.244916810456822e-06, "loss": 0.4613, "step": 66 }, { "epoch": 0.4110429447852761, "grad_norm": 0.08358146995306015, "learning_rate": 9.218440390725772e-06, "loss": 0.422, "step": 67 }, { "epoch": 0.4171779141104294, "grad_norm": 0.06681355088949203, "learning_rate": 9.191547030651383e-06, "loss": 0.4099, "step": 68 }, { "epoch": 0.4233128834355828, "grad_norm": 0.07471272349357605, "learning_rate": 9.164239388307668e-06, "loss": 0.3876, "step": 69 }, { "epoch": 0.4294478527607362, "grad_norm": 0.08724360167980194, "learning_rate": 9.136520162715288e-06, "loss": 0.493, "step": 70 }, { "epoch": 0.43558282208588955, "grad_norm": 0.06159456446766853, "learning_rate": 9.108392093574785e-06, "loss": 0.2673, "step": 71 }, { "epoch": 0.44171779141104295, "grad_norm": 0.0887589380145073, "learning_rate": 9.079857960995806e-06, "loss": 0.4282, "step": 72 }, { "epoch": 0.44785276073619634, "grad_norm": 0.08436176180839539, "learning_rate": 9.050920585222309e-06, "loss": 0.503, "step": 73 }, { "epoch": 0.4539877300613497, "grad_norm": 0.08223728090524673, "learning_rate": 9.021582826353825e-06, "loss": 0.5367, "step": 74 }, { "epoch": 0.4601226993865031, "grad_norm": 0.07190907746553421, "learning_rate": 8.991847584062776e-06, "loss": 0.4085, "step": 75 }, { "epoch": 0.4662576687116564, "grad_norm": 0.0850868821144104, "learning_rate": 8.961717797307872e-06, "loss": 0.4974, "step": 76 }, { "epoch": 0.4723926380368098, "grad_norm": 0.1070251539349556, "learning_rate": 8.931196444043635e-06, "loss": 0.5619, "step": 77 }, { "epoch": 0.4785276073619632, "grad_norm": 0.07099828869104385, "learning_rate": 8.900286540926062e-06, "loss": 0.431, "step": 78 }, { "epoch": 0.48466257668711654, "grad_norm": 0.06902176886796951, "learning_rate": 8.868991143014469e-06, "loss": 0.3375, "step": 79 }, { "epoch": 0.49079754601226994, "grad_norm": 0.11233729869127274, "learning_rate": 8.83731334346954e-06, "loss": 0.5159, "step": 80 }, { "epoch": 0.49693251533742333, "grad_norm": 0.07816348224878311, "learning_rate": 8.805256273247597e-06, "loss": 0.4657, "step": 81 }, { "epoch": 0.5030674846625767, "grad_norm": 0.08207116276025772, "learning_rate": 8.772823100791152e-06, "loss": 0.5891, "step": 82 }, { "epoch": 0.5030674846625767, "eval_loss": 0.4219532012939453, "eval_runtime": 52.8567, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.435, "step": 82 }, { "epoch": 0.50920245398773, "grad_norm": 0.1063007116317749, "learning_rate": 8.74001703171574e-06, "loss": 0.4719, "step": 83 }, { "epoch": 0.5153374233128835, "grad_norm": 0.06525219976902008, "learning_rate": 8.706841308493092e-06, "loss": 0.2945, "step": 84 }, { "epoch": 0.5214723926380368, "grad_norm": 0.084320567548275, "learning_rate": 8.673299210130647e-06, "loss": 0.5193, "step": 85 }, { "epoch": 0.5276073619631901, "grad_norm": 0.09182199090719223, "learning_rate": 8.639394051847472e-06, "loss": 0.4065, "step": 86 }, { "epoch": 0.5337423312883436, "grad_norm": 0.08341530710458755, "learning_rate": 8.605129184746586e-06, "loss": 0.5126, "step": 87 }, { "epoch": 0.5398773006134969, "grad_norm": 0.0803583562374115, "learning_rate": 8.57050799548375e-06, "loss": 0.4048, "step": 88 }, { "epoch": 0.5460122699386503, "grad_norm": 0.10489048063755035, "learning_rate": 8.535533905932739e-06, "loss": 0.4255, "step": 89 }, { "epoch": 0.5521472392638037, "grad_norm": 0.09766017645597458, "learning_rate": 8.500210372847128e-06, "loss": 0.5091, "step": 90 }, { "epoch": 0.558282208588957, "grad_norm": 0.06364760547876358, "learning_rate": 8.464540887518638e-06, "loss": 0.2591, "step": 91 }, { "epoch": 0.5644171779141104, "grad_norm": 0.07464785873889923, "learning_rate": 8.428528975432067e-06, "loss": 0.3407, "step": 92 }, { "epoch": 0.5705521472392638, "grad_norm": 0.08410008996725082, "learning_rate": 8.392178195916832e-06, "loss": 0.4741, "step": 93 }, { "epoch": 0.5766871165644172, "grad_norm": 0.08058732002973557, "learning_rate": 8.355492141795185e-06, "loss": 0.373, "step": 94 }, { "epoch": 0.5828220858895705, "grad_norm": 0.10042490810155869, "learning_rate": 8.318474439027096e-06, "loss": 0.4643, "step": 95 }, { "epoch": 0.588957055214724, "grad_norm": 0.06268122047185898, "learning_rate": 8.281128746351878e-06, "loss": 0.3071, "step": 96 }, { "epoch": 0.5950920245398773, "grad_norm": 0.08433736115694046, "learning_rate": 8.24345875492657e-06, "loss": 0.4042, "step": 97 }, { "epoch": 0.6012269938650306, "grad_norm": 0.10664895176887512, "learning_rate": 8.2054681879611e-06, "loss": 0.5345, "step": 98 }, { "epoch": 0.6073619631901841, "grad_norm": 0.08828828483819962, "learning_rate": 8.167160800350306e-06, "loss": 0.4128, "step": 99 }, { "epoch": 0.6134969325153374, "grad_norm": 0.09366817027330399, "learning_rate": 8.1285403783028e-06, "loss": 0.6111, "step": 100 }, { "epoch": 0.6196319018404908, "grad_norm": 0.07751886546611786, "learning_rate": 8.089610738966754e-06, "loss": 0.4008, "step": 101 }, { "epoch": 0.6257668711656442, "grad_norm": 0.09029467403888702, "learning_rate": 8.050375730052622e-06, "loss": 0.5109, "step": 102 }, { "epoch": 0.6319018404907976, "grad_norm": 0.09237638860940933, "learning_rate": 8.010839229452843e-06, "loss": 0.3751, "step": 103 }, { "epoch": 0.6380368098159509, "grad_norm": 0.09511607140302658, "learning_rate": 7.971005144858554e-06, "loss": 0.3808, "step": 104 }, { "epoch": 0.6441717791411042, "grad_norm": 0.07342271506786346, "learning_rate": 7.930877413373369e-06, "loss": 0.3106, "step": 105 }, { "epoch": 0.6503067484662577, "grad_norm": 0.07932678610086441, "learning_rate": 7.890460001124242e-06, "loss": 0.3932, "step": 106 }, { "epoch": 0.656441717791411, "grad_norm": 0.07940706610679626, "learning_rate": 7.849756902869471e-06, "loss": 0.4068, "step": 107 }, { "epoch": 0.6625766871165644, "grad_norm": 0.09030912816524506, "learning_rate": 7.808772141603855e-06, "loss": 0.431, "step": 108 }, { "epoch": 0.6687116564417178, "grad_norm": 0.091177798807621, "learning_rate": 7.767509768161079e-06, "loss": 0.4878, "step": 109 }, { "epoch": 0.6748466257668712, "grad_norm": 0.09804334491491318, "learning_rate": 7.725973860813338e-06, "loss": 0.4954, "step": 110 }, { "epoch": 0.6809815950920245, "grad_norm": 0.06426946073770523, "learning_rate": 7.684168524868253e-06, "loss": 0.2363, "step": 111 }, { "epoch": 0.6871165644171779, "grad_norm": 0.06771596521139145, "learning_rate": 7.642097892263098e-06, "loss": 0.3587, "step": 112 }, { "epoch": 0.6932515337423313, "grad_norm": 0.08947895467281342, "learning_rate": 7.599766121156436e-06, "loss": 0.3753, "step": 113 }, { "epoch": 0.6993865030674846, "grad_norm": 0.06724036484956741, "learning_rate": 7.5571773955171124e-06, "loss": 0.3099, "step": 114 }, { "epoch": 0.7055214723926381, "grad_norm": 0.08312314003705978, "learning_rate": 7.5143359247107314e-06, "loss": 0.3134, "step": 115 }, { "epoch": 0.7116564417177914, "grad_norm": 0.0804123803973198, "learning_rate": 7.471245943083615e-06, "loss": 0.3077, "step": 116 }, { "epoch": 0.7177914110429447, "grad_norm": 0.08391325175762177, "learning_rate": 7.427911709544288e-06, "loss": 0.416, "step": 117 }, { "epoch": 0.7239263803680982, "grad_norm": 0.067812480032444, "learning_rate": 7.3843375071425315e-06, "loss": 0.2535, "step": 118 }, { "epoch": 0.7300613496932515, "grad_norm": 0.103757344186306, "learning_rate": 7.340527642646069e-06, "loss": 0.5859, "step": 119 }, { "epoch": 0.7361963190184049, "grad_norm": 0.07385145872831345, "learning_rate": 7.2964864461148895e-06, "loss": 0.3527, "step": 120 }, { "epoch": 0.7423312883435583, "grad_norm": 0.08877512067556381, "learning_rate": 7.252218270473274e-06, "loss": 0.5138, "step": 121 }, { "epoch": 0.7484662576687117, "grad_norm": 0.06157367303967476, "learning_rate": 7.2077274910795605e-06, "loss": 0.2458, "step": 122 }, { "epoch": 0.754601226993865, "grad_norm": 0.17377746105194092, "learning_rate": 7.163018505293703e-06, "loss": 0.3547, "step": 123 }, { "epoch": 0.754601226993865, "eval_loss": 0.40320152044296265, "eval_runtime": 22.6808, "eval_samples_per_second": 8.113, "eval_steps_per_second": 1.014, "step": 123 }, { "epoch": 0.7607361963190185, "grad_norm": 0.08620224893093109, "learning_rate": 7.118095732042643e-06, "loss": 0.5084, "step": 124 }, { "epoch": 0.7668711656441718, "grad_norm": 0.07054693251848221, "learning_rate": 7.072963611383545e-06, "loss": 0.2391, "step": 125 }, { "epoch": 0.7730061349693251, "grad_norm": 0.1339060664176941, "learning_rate": 7.02762660406497e-06, "loss": 0.6351, "step": 126 }, { "epoch": 0.7791411042944786, "grad_norm": 0.07386178523302078, "learning_rate": 6.982089191085971e-06, "loss": 0.3047, "step": 127 }, { "epoch": 0.7852760736196319, "grad_norm": 0.10029296576976776, "learning_rate": 6.936355873253207e-06, "loss": 0.4328, "step": 128 }, { "epoch": 0.7914110429447853, "grad_norm": 0.09643431752920151, "learning_rate": 6.8904311707360914e-06, "loss": 0.4234, "step": 129 }, { "epoch": 0.7975460122699386, "grad_norm": 0.08629649877548218, "learning_rate": 6.844319622620039e-06, "loss": 0.3454, "step": 130 }, { "epoch": 0.803680981595092, "grad_norm": 0.09917541593313217, "learning_rate": 6.798025786457825e-06, "loss": 0.4828, "step": 131 }, { "epoch": 0.8098159509202454, "grad_norm": 0.08652715384960175, "learning_rate": 6.751554237819122e-06, "loss": 0.3921, "step": 132 }, { "epoch": 0.8159509202453987, "grad_norm": 0.08848355710506439, "learning_rate": 6.704909569838281e-06, "loss": 0.3573, "step": 133 }, { "epoch": 0.8220858895705522, "grad_norm": 0.09000125527381897, "learning_rate": 6.65809639276034e-06, "loss": 0.4486, "step": 134 }, { "epoch": 0.8282208588957055, "grad_norm": 0.08975492417812347, "learning_rate": 6.611119333485364e-06, "loss": 0.3867, "step": 135 }, { "epoch": 0.8343558282208589, "grad_norm": 0.0796804279088974, "learning_rate": 6.563983035111136e-06, "loss": 0.381, "step": 136 }, { "epoch": 0.8404907975460123, "grad_norm": 0.08723866939544678, "learning_rate": 6.516692156474243e-06, "loss": 0.4316, "step": 137 }, { "epoch": 0.8466257668711656, "grad_norm": 0.11449507623910904, "learning_rate": 6.469251371689606e-06, "loss": 0.5042, "step": 138 }, { "epoch": 0.852760736196319, "grad_norm": 0.127399280667305, "learning_rate": 6.421665369688501e-06, "loss": 0.56, "step": 139 }, { "epoch": 0.8588957055214724, "grad_norm": 0.0761680155992508, "learning_rate": 6.373938853755126e-06, "loss": 0.2881, "step": 140 }, { "epoch": 0.8650306748466258, "grad_norm": 0.08487329632043839, "learning_rate": 6.326076541061729e-06, "loss": 0.3171, "step": 141 }, { "epoch": 0.8711656441717791, "grad_norm": 0.09721650928258896, "learning_rate": 6.278083162202374e-06, "loss": 0.4289, "step": 142 }, { "epoch": 0.8773006134969326, "grad_norm": 0.08022255450487137, "learning_rate": 6.22996346072539e-06, "loss": 0.327, "step": 143 }, { "epoch": 0.8834355828220859, "grad_norm": 0.11367864906787872, "learning_rate": 6.181722192664526e-06, "loss": 0.4132, "step": 144 }, { "epoch": 0.8895705521472392, "grad_norm": 0.08727509528398514, "learning_rate": 6.133364126068867e-06, "loss": 0.3467, "step": 145 }, { "epoch": 0.8957055214723927, "grad_norm": 0.09527825564146042, "learning_rate": 6.084894040531591e-06, "loss": 0.5279, "step": 146 }, { "epoch": 0.901840490797546, "grad_norm": 0.07519163191318512, "learning_rate": 6.036316726717546e-06, "loss": 0.3221, "step": 147 }, { "epoch": 0.9079754601226994, "grad_norm": 0.08815211802721024, "learning_rate": 5.987636985889764e-06, "loss": 0.347, "step": 148 }, { "epoch": 0.9141104294478528, "grad_norm": 0.10302853584289551, "learning_rate": 5.938859629434913e-06, "loss": 0.4378, "step": 149 }, { "epoch": 0.9202453987730062, "grad_norm": 0.09466961026191711, "learning_rate": 5.8899894783877536e-06, "loss": 0.3053, "step": 150 }, { "epoch": 0.9263803680981595, "grad_norm": 0.09226138889789581, "learning_rate": 5.841031362954629e-06, "loss": 0.4522, "step": 151 }, { "epoch": 0.9325153374233128, "grad_norm": 0.10703295469284058, "learning_rate": 5.791990122036075e-06, "loss": 0.4592, "step": 152 }, { "epoch": 0.9386503067484663, "grad_norm": 0.1230417788028717, "learning_rate": 5.742870602748547e-06, "loss": 0.5402, "step": 153 }, { "epoch": 0.9447852760736196, "grad_norm": 0.09710447490215302, "learning_rate": 5.693677659945343e-06, "loss": 0.4628, "step": 154 }, { "epoch": 0.950920245398773, "grad_norm": 0.11172331869602203, "learning_rate": 5.6444161557367534e-06, "loss": 0.3569, "step": 155 }, { "epoch": 0.9570552147239264, "grad_norm": 0.09823194146156311, "learning_rate": 5.595090959009525e-06, "loss": 0.4716, "step": 156 }, { "epoch": 0.9631901840490797, "grad_norm": 0.08101484179496765, "learning_rate": 5.5457069449456055e-06, "loss": 0.4065, "step": 157 }, { "epoch": 0.9693251533742331, "grad_norm": 0.08857468515634537, "learning_rate": 5.496268994540309e-06, "loss": 0.3606, "step": 158 }, { "epoch": 0.9754601226993865, "grad_norm": 0.07157395780086517, "learning_rate": 5.446781994119886e-06, "loss": 0.2453, "step": 159 }, { "epoch": 0.9815950920245399, "grad_norm": 0.07760554552078247, "learning_rate": 5.397250834858573e-06, "loss": 0.364, "step": 160 }, { "epoch": 0.9877300613496932, "grad_norm": 0.07792805880308151, "learning_rate": 5.347680412295152e-06, "loss": 0.3496, "step": 161 }, { "epoch": 0.9938650306748467, "grad_norm": 0.08258286118507385, "learning_rate": 5.2980756258491e-06, "loss": 0.3455, "step": 162 }, { "epoch": 1.0, "grad_norm": 0.11803517490625381, "learning_rate": 5.2484413783363335e-06, "loss": 0.4871, "step": 163 }, { "epoch": 1.0061349693251533, "grad_norm": 0.10544019937515259, "learning_rate": 5.19878257548463e-06, "loss": 0.4369, "step": 164 }, { "epoch": 1.0061349693251533, "eval_loss": 0.39176106452941895, "eval_runtime": 22.8177, "eval_samples_per_second": 8.064, "eval_steps_per_second": 1.008, "step": 164 }, { "epoch": 1.0122699386503067, "grad_norm": 0.09844586253166199, "learning_rate": 5.149104125448752e-06, "loss": 0.4385, "step": 165 }, { "epoch": 1.01840490797546, "grad_norm": 0.07265810668468475, "learning_rate": 5.099410938325351e-06, "loss": 0.2595, "step": 166 }, { "epoch": 1.0245398773006136, "grad_norm": 0.12025801837444305, "learning_rate": 5.04970792566765e-06, "loss": 0.5587, "step": 167 }, { "epoch": 1.030674846625767, "grad_norm": 0.1306670904159546, "learning_rate": 5e-06, "loss": 0.603, "step": 168 }, { "epoch": 1.0368098159509203, "grad_norm": 0.10355333238840103, "learning_rate": 4.9502920743323525e-06, "loss": 0.3555, "step": 169 }, { "epoch": 1.0061349693251533, "grad_norm": 0.10698702186346054, "learning_rate": 4.900589061674649e-06, "loss": 0.4452, "step": 170 }, { "epoch": 1.0122699386503067, "grad_norm": 0.0719093456864357, "learning_rate": 4.850895874551248e-06, "loss": 0.3209, "step": 171 }, { "epoch": 1.01840490797546, "grad_norm": 0.07391523569822311, "learning_rate": 4.801217424515373e-06, "loss": 0.2961, "step": 172 }, { "epoch": 1.0245398773006136, "grad_norm": 0.07925672829151154, "learning_rate": 4.751558621663668e-06, "loss": 0.3446, "step": 173 }, { "epoch": 1.030674846625767, "grad_norm": 0.10210183262825012, "learning_rate": 4.701924374150901e-06, "loss": 0.49, "step": 174 }, { "epoch": 1.0368098159509203, "grad_norm": 0.11506979912519455, "learning_rate": 4.6523195877048495e-06, "loss": 0.2977, "step": 175 }, { "epoch": 1.0429447852760736, "grad_norm": 0.09373599290847778, "learning_rate": 4.602749165141429e-06, "loss": 0.3853, "step": 176 }, { "epoch": 1.049079754601227, "grad_norm": 0.09830620139837265, "learning_rate": 4.5532180058801145e-06, "loss": 0.4363, "step": 177 }, { "epoch": 1.0552147239263803, "grad_norm": 0.08626256883144379, "learning_rate": 4.5037310054596936e-06, "loss": 0.4265, "step": 178 }, { "epoch": 1.0613496932515338, "grad_norm": 0.11579606682062149, "learning_rate": 4.454293055054397e-06, "loss": 0.5565, "step": 179 }, { "epoch": 1.0674846625766872, "grad_norm": 0.10661870241165161, "learning_rate": 4.404909040990477e-06, "loss": 0.5136, "step": 180 }, { "epoch": 1.0736196319018405, "grad_norm": 0.09875867515802383, "learning_rate": 4.355583844263247e-06, "loss": 0.5337, "step": 181 }, { "epoch": 1.0797546012269938, "grad_norm": 0.2183881402015686, "learning_rate": 4.30632234005466e-06, "loss": 0.3483, "step": 182 }, { "epoch": 1.0858895705521472, "grad_norm": 0.0771089717745781, "learning_rate": 4.257129397251453e-06, "loss": 0.3027, "step": 183 }, { "epoch": 1.0920245398773005, "grad_norm": 0.10682545602321625, "learning_rate": 4.2080098779639255e-06, "loss": 0.4408, "step": 184 }, { "epoch": 1.098159509202454, "grad_norm": 0.08865021914243698, "learning_rate": 4.158968637045374e-06, "loss": 0.4054, "step": 185 }, { "epoch": 1.1042944785276074, "grad_norm": 0.11121812462806702, "learning_rate": 4.11001052161225e-06, "loss": 0.3993, "step": 186 }, { "epoch": 1.1104294478527608, "grad_norm": 0.10253646969795227, "learning_rate": 4.061140370565088e-06, "loss": 0.4388, "step": 187 }, { "epoch": 1.116564417177914, "grad_norm": 0.09401492774486542, "learning_rate": 4.012363014110237e-06, "loss": 0.4846, "step": 188 }, { "epoch": 1.1226993865030674, "grad_norm": 0.1021651104092598, "learning_rate": 3.9636832732824555e-06, "loss": 0.312, "step": 189 }, { "epoch": 1.1288343558282208, "grad_norm": 0.12308547645807266, "learning_rate": 3.91510595946841e-06, "loss": 0.5238, "step": 190 }, { "epoch": 1.1349693251533743, "grad_norm": 0.1113848015666008, "learning_rate": 3.866635873931133e-06, "loss": 0.4818, "step": 191 }, { "epoch": 1.1411042944785277, "grad_norm": 0.10525127500295639, "learning_rate": 3.818277807335477e-06, "loss": 0.381, "step": 192 }, { "epoch": 1.147239263803681, "grad_norm": 0.12201809883117676, "learning_rate": 3.7700365392746106e-06, "loss": 0.3412, "step": 193 }, { "epoch": 1.1533742331288344, "grad_norm": 0.08116313070058823, "learning_rate": 3.721916837797627e-06, "loss": 0.3486, "step": 194 }, { "epoch": 1.1595092024539877, "grad_norm": 0.11550577729940414, "learning_rate": 3.6739234589382722e-06, "loss": 0.3337, "step": 195 }, { "epoch": 1.165644171779141, "grad_norm": 0.08885617554187775, "learning_rate": 3.6260611462448736e-06, "loss": 0.27, "step": 196 }, { "epoch": 1.1717791411042944, "grad_norm": 0.1086370199918747, "learning_rate": 3.5783346303114986e-06, "loss": 0.3044, "step": 197 }, { "epoch": 1.177914110429448, "grad_norm": 0.10726229846477509, "learning_rate": 3.5307486283103966e-06, "loss": 0.4503, "step": 198 }, { "epoch": 1.1840490797546013, "grad_norm": 0.0863179937005043, "learning_rate": 3.4833078435257584e-06, "loss": 0.2563, "step": 199 }, { "epoch": 1.1901840490797546, "grad_norm": 0.14450977742671967, "learning_rate": 3.4360169648888653e-06, "loss": 0.5502, "step": 200 }, { "epoch": 1.196319018404908, "grad_norm": 0.10789606720209122, "learning_rate": 3.388880666514637e-06, "loss": 0.3767, "step": 201 }, { "epoch": 1.2024539877300613, "grad_norm": 0.11021628230810165, "learning_rate": 3.3419036072396614e-06, "loss": 0.4526, "step": 202 }, { "epoch": 1.2085889570552146, "grad_norm": 0.10493913292884827, "learning_rate": 3.29509043016172e-06, "loss": 0.3998, "step": 203 }, { "epoch": 1.2147239263803682, "grad_norm": 0.12165309488773346, "learning_rate": 3.2484457621808787e-06, "loss": 0.383, "step": 204 }, { "epoch": 1.2208588957055215, "grad_norm": 0.09394080936908722, "learning_rate": 3.201974213542178e-06, "loss": 0.4129, "step": 205 }, { "epoch": 1.2208588957055215, "eval_loss": 0.38451722264289856, "eval_runtime": 22.6218, "eval_samples_per_second": 8.134, "eval_steps_per_second": 1.017, "step": 205 }, { "epoch": 1.2269938650306749, "grad_norm": 0.09802668541669846, "learning_rate": 3.1556803773799616e-06, "loss": 0.408, "step": 206 }, { "epoch": 1.2331288343558282, "grad_norm": 0.13217565417289734, "learning_rate": 3.1095688292639094e-06, "loss": 0.3198, "step": 207 }, { "epoch": 1.2392638036809815, "grad_norm": 0.10313939303159714, "learning_rate": 3.0636441267467955e-06, "loss": 0.3128, "step": 208 }, { "epoch": 1.2453987730061349, "grad_norm": 0.0762137621641159, "learning_rate": 3.01791080891403e-06, "loss": 0.2638, "step": 209 }, { "epoch": 1.2515337423312882, "grad_norm": 0.11147941648960114, "learning_rate": 2.972373395935031e-06, "loss": 0.4534, "step": 210 }, { "epoch": 1.2576687116564418, "grad_norm": 0.09862508624792099, "learning_rate": 2.927036388616457e-06, "loss": 0.4292, "step": 211 }, { "epoch": 1.2638036809815951, "grad_norm": 0.09868728369474411, "learning_rate": 2.8819042679573618e-06, "loss": 0.3276, "step": 212 }, { "epoch": 1.2699386503067485, "grad_norm": 0.08501134812831879, "learning_rate": 2.8369814947062994e-06, "loss": 0.3095, "step": 213 }, { "epoch": 1.2760736196319018, "grad_norm": 0.11438091844320297, "learning_rate": 2.792272508920443e-06, "loss": 0.295, "step": 214 }, { "epoch": 1.2822085889570551, "grad_norm": 0.11010921746492386, "learning_rate": 2.7477817295267273e-06, "loss": 0.433, "step": 215 }, { "epoch": 1.2883435582822087, "grad_norm": 0.08820465952157974, "learning_rate": 2.70351355388511e-06, "loss": 0.3045, "step": 216 }, { "epoch": 1.294478527607362, "grad_norm": 0.08259084820747375, "learning_rate": 2.6594723573539307e-06, "loss": 0.3812, "step": 217 }, { "epoch": 1.3006134969325154, "grad_norm": 0.11154909431934357, "learning_rate": 2.615662492857471e-06, "loss": 0.3513, "step": 218 }, { "epoch": 1.3067484662576687, "grad_norm": 0.0986664667725563, "learning_rate": 2.5720882904557156e-06, "loss": 0.3658, "step": 219 }, { "epoch": 1.312883435582822, "grad_norm": 0.2315550446510315, "learning_rate": 2.528754056916386e-06, "loss": 0.5219, "step": 220 }, { "epoch": 1.3190184049079754, "grad_norm": 0.10897372663021088, "learning_rate": 2.4856640752892702e-06, "loss": 0.4178, "step": 221 }, { "epoch": 1.3251533742331287, "grad_norm": 0.14039960503578186, "learning_rate": 2.4428226044828896e-06, "loss": 0.4113, "step": 222 }, { "epoch": 1.331288343558282, "grad_norm": 0.10824128240346909, "learning_rate": 2.4002338788435654e-06, "loss": 0.5079, "step": 223 }, { "epoch": 1.3374233128834356, "grad_norm": 0.10082822293043137, "learning_rate": 2.3579021077369047e-06, "loss": 0.3356, "step": 224 }, { "epoch": 1.343558282208589, "grad_norm": 0.1202516257762909, "learning_rate": 2.315831475131751e-06, "loss": 0.3236, "step": 225 }, { "epoch": 1.3496932515337423, "grad_norm": 0.1029471606016159, "learning_rate": 2.2740261391866634e-06, "loss": 0.3491, "step": 226 }, { "epoch": 1.3558282208588956, "grad_norm": 0.12975585460662842, "learning_rate": 2.232490231838923e-06, "loss": 0.402, "step": 227 }, { "epoch": 1.3619631901840492, "grad_norm": 0.12482481449842453, "learning_rate": 2.1912278583961454e-06, "loss": 0.4321, "step": 228 }, { "epoch": 1.3680981595092025, "grad_norm": 0.1507352739572525, "learning_rate": 2.1502430971305288e-06, "loss": 0.5751, "step": 229 }, { "epoch": 1.3742331288343559, "grad_norm": 0.0954100489616394, "learning_rate": 2.1095399988757574e-06, "loss": 0.3933, "step": 230 }, { "epoch": 1.3803680981595092, "grad_norm": 0.10602657496929169, "learning_rate": 2.0691225866266335e-06, "loss": 0.4803, "step": 231 }, { "epoch": 1.3865030674846626, "grad_norm": 0.11936960369348526, "learning_rate": 2.0289948551414486e-06, "loss": 0.3983, "step": 232 }, { "epoch": 1.392638036809816, "grad_norm": 0.08489519357681274, "learning_rate": 1.989160770547159e-06, "loss": 0.2552, "step": 233 }, { "epoch": 1.3987730061349692, "grad_norm": 0.10064487159252167, "learning_rate": 1.949624269947378e-06, "loss": 0.4482, "step": 234 }, { "epoch": 1.4049079754601226, "grad_norm": 0.10288871824741364, "learning_rate": 1.9103892610332467e-06, "loss": 0.3787, "step": 235 }, { "epoch": 1.4110429447852761, "grad_norm": 0.09945371747016907, "learning_rate": 1.8714596216972008e-06, "loss": 0.3845, "step": 236 }, { "epoch": 1.4171779141104295, "grad_norm": 0.11584262549877167, "learning_rate": 1.8328391996496942e-06, "loss": 0.5336, "step": 237 }, { "epoch": 1.4233128834355828, "grad_norm": 0.08862798660993576, "learning_rate": 1.794531812038901e-06, "loss": 0.3253, "step": 238 }, { "epoch": 1.4294478527607362, "grad_norm": 0.12273416668176651, "learning_rate": 1.756541245073432e-06, "loss": 0.4336, "step": 239 }, { "epoch": 1.4355828220858895, "grad_norm": 0.08479359745979309, "learning_rate": 1.7188712536481233e-06, "loss": 0.3385, "step": 240 }, { "epoch": 1.441717791411043, "grad_norm": 0.08785238116979599, "learning_rate": 1.6815255609729047e-06, "loss": 0.3856, "step": 241 }, { "epoch": 1.4478527607361964, "grad_norm": 0.11194564402103424, "learning_rate": 1.6445078582048158e-06, "loss": 0.4059, "step": 242 }, { "epoch": 1.4539877300613497, "grad_norm": 0.10265929996967316, "learning_rate": 1.6078218040831678e-06, "loss": 0.5095, "step": 243 }, { "epoch": 1.460122699386503, "grad_norm": 0.1036793664097786, "learning_rate": 1.5714710245679348e-06, "loss": 0.3271, "step": 244 }, { "epoch": 1.4662576687116564, "grad_norm": 0.16661570966243744, "learning_rate": 1.5354591124813628e-06, "loss": 0.4891, "step": 245 }, { "epoch": 1.4723926380368098, "grad_norm": 0.08812650293111801, "learning_rate": 1.499789627152874e-06, "loss": 0.335, "step": 246 }, { "epoch": 1.4723926380368098, "eval_loss": 0.3808572590351105, "eval_runtime": 22.6328, "eval_samples_per_second": 8.13, "eval_steps_per_second": 1.016, "step": 246 }, { "epoch": 1.478527607361963, "grad_norm": 0.1271064132452011, "learning_rate": 1.4644660940672628e-06, "loss": 0.4646, "step": 247 }, { "epoch": 1.4846625766871164, "grad_norm": 0.10755060613155365, "learning_rate": 1.4294920045162514e-06, "loss": 0.4369, "step": 248 }, { "epoch": 1.49079754601227, "grad_norm": 0.10873299837112427, "learning_rate": 1.3948708152534163e-06, "loss": 0.3521, "step": 249 }, { "epoch": 1.4969325153374233, "grad_norm": 0.15240046381950378, "learning_rate": 1.3606059481525296e-06, "loss": 0.4, "step": 250 }, { "epoch": 1.5030674846625767, "grad_norm": 0.158911794424057, "learning_rate": 1.3267007898693552e-06, "loss": 0.3103, "step": 251 }, { "epoch": 1.50920245398773, "grad_norm": 0.11236365884542465, "learning_rate": 1.2931586915069106e-06, "loss": 0.3715, "step": 252 }, { "epoch": 1.5153374233128836, "grad_norm": 0.1483219861984253, "learning_rate": 1.2599829682842618e-06, "loss": 0.4046, "step": 253 }, { "epoch": 1.521472392638037, "grad_norm": 0.09588169306516647, "learning_rate": 1.227176899208849e-06, "loss": 0.3077, "step": 254 }, { "epoch": 1.5276073619631902, "grad_norm": 0.10631563514471054, "learning_rate": 1.194743726752403e-06, "loss": 0.4418, "step": 255 }, { "epoch": 1.5337423312883436, "grad_norm": 0.10569003224372864, "learning_rate": 1.1626866565304594e-06, "loss": 0.4616, "step": 256 }, { "epoch": 1.539877300613497, "grad_norm": 0.11209447681903839, "learning_rate": 1.1310088569855315e-06, "loss": 0.4556, "step": 257 }, { "epoch": 1.5460122699386503, "grad_norm": 0.14903852343559265, "learning_rate": 1.09971345907394e-06, "loss": 0.3381, "step": 258 }, { "epoch": 1.5521472392638036, "grad_norm": 0.10578689724206924, "learning_rate": 1.068803555956367e-06, "loss": 0.3798, "step": 259 }, { "epoch": 1.558282208588957, "grad_norm": 0.12829791009426117, "learning_rate": 1.0382822026921291e-06, "loss": 0.5217, "step": 260 }, { "epoch": 1.5644171779141103, "grad_norm": 0.12076660990715027, "learning_rate": 1.0081524159372246e-06, "loss": 0.4557, "step": 261 }, { "epoch": 1.5705521472392638, "grad_norm": 0.11094032227993011, "learning_rate": 9.784171736461762e-07, "loss": 0.3339, "step": 262 }, { "epoch": 1.5766871165644172, "grad_norm": 0.09728420525789261, "learning_rate": 9.490794147776927e-07, "loss": 0.3506, "step": 263 }, { "epoch": 1.5828220858895705, "grad_norm": 0.1307501345872879, "learning_rate": 9.201420390041965e-07, "loss": 0.3652, "step": 264 }, { "epoch": 1.588957055214724, "grad_norm": 0.11151021718978882, "learning_rate": 8.916079064252164e-07, "loss": 0.3796, "step": 265 }, { "epoch": 1.5950920245398774, "grad_norm": 0.091631218791008, "learning_rate": 8.634798372847148e-07, "loss": 0.2796, "step": 266 }, { "epoch": 1.6012269938650308, "grad_norm": 0.10448212176561356, "learning_rate": 8.357606116923328e-07, "loss": 0.2626, "step": 267 }, { "epoch": 1.607361963190184, "grad_norm": 0.11236970871686935, "learning_rate": 8.084529693486171e-07, "loss": 0.3601, "step": 268 }, { "epoch": 1.6134969325153374, "grad_norm": 0.11304070055484772, "learning_rate": 7.815596092742278e-07, "loss": 0.3641, "step": 269 }, { "epoch": 1.6196319018404908, "grad_norm": 0.4070085883140564, "learning_rate": 7.550831895431799e-07, "loss": 0.4316, "step": 270 }, { "epoch": 1.6257668711656441, "grad_norm": 0.10678707808256149, "learning_rate": 7.290263270201231e-07, "loss": 0.4281, "step": 271 }, { "epoch": 1.6319018404907975, "grad_norm": 0.10431778430938721, "learning_rate": 7.033915971016952e-07, "loss": 0.3907, "step": 272 }, { "epoch": 1.6380368098159508, "grad_norm": 0.10240163654088974, "learning_rate": 6.781815334619812e-07, "loss": 0.3344, "step": 273 }, { "epoch": 1.6441717791411041, "grad_norm": 0.10372646898031235, "learning_rate": 6.533986278020876e-07, "loss": 0.373, "step": 274 }, { "epoch": 1.6503067484662577, "grad_norm": 0.12202878296375275, "learning_rate": 6.290453296038702e-07, "loss": 0.4087, "step": 275 }, { "epoch": 1.656441717791411, "grad_norm": 0.09939797222614288, "learning_rate": 6.051240458878316e-07, "loss": 0.3611, "step": 276 }, { "epoch": 1.6625766871165644, "grad_norm": 0.12352439016103745, "learning_rate": 5.816371409752203e-07, "loss": 0.5242, "step": 277 }, { "epoch": 1.668711656441718, "grad_norm": 0.11458908766508102, "learning_rate": 5.585869362543416e-07, "loss": 0.4827, "step": 278 }, { "epoch": 1.6748466257668713, "grad_norm": 0.14145226776599884, "learning_rate": 5.359757099511237e-07, "loss": 0.3911, "step": 279 }, { "epoch": 1.6809815950920246, "grad_norm": 0.09614825993776321, "learning_rate": 5.138056969039384e-07, "loss": 0.3729, "step": 280 }, { "epoch": 1.687116564417178, "grad_norm": 0.10174663364887238, "learning_rate": 4.920790883427201e-07, "loss": 0.4666, "step": 281 }, { "epoch": 1.6932515337423313, "grad_norm": 0.09838556498289108, "learning_rate": 4.707980316723837e-07, "loss": 0.4527, "step": 282 }, { "epoch": 1.6993865030674846, "grad_norm": 0.09283680468797684, "learning_rate": 4.4996463026058476e-07, "loss": 0.3644, "step": 283 }, { "epoch": 1.705521472392638, "grad_norm": 0.1122664213180542, "learning_rate": 4.2958094322982703e-07, "loss": 0.344, "step": 284 }, { "epoch": 1.7116564417177913, "grad_norm": 0.15890415012836456, "learning_rate": 4.096489852539426e-07, "loss": 0.647, "step": 285 }, { "epoch": 1.7177914110429446, "grad_norm": 0.13360188901424408, "learning_rate": 3.9017072635896716e-07, "loss": 0.5654, "step": 286 }, { "epoch": 1.7239263803680982, "grad_norm": 0.10913598537445068, "learning_rate": 3.7114809172842827e-07, "loss": 0.3842, "step": 287 }, { "epoch": 1.7239263803680982, "eval_loss": 0.379504919052124, "eval_runtime": 22.6056, "eval_samples_per_second": 8.14, "eval_steps_per_second": 1.017, "step": 287 }, { "epoch": 1.7300613496932515, "grad_norm": 0.09505796432495117, "learning_rate": 3.5258296151306495e-07, "loss": 0.2472, "step": 288 }, { "epoch": 1.7361963190184049, "grad_norm": 0.10274416208267212, "learning_rate": 3.3447717064499565e-07, "loss": 0.4665, "step": 289 }, { "epoch": 1.7423312883435584, "grad_norm": 0.11168382316827774, "learning_rate": 3.168325086563612e-07, "loss": 0.4719, "step": 290 }, { "epoch": 1.7484662576687118, "grad_norm": 0.121711865067482, "learning_rate": 2.996507195024495e-07, "loss": 0.3143, "step": 291 }, { "epoch": 1.7546012269938651, "grad_norm": 0.0906308963894844, "learning_rate": 2.8293350138932805e-07, "loss": 0.4019, "step": 292 }, { "epoch": 1.7607361963190185, "grad_norm": 0.12006894499063492, "learning_rate": 2.666825066059986e-07, "loss": 0.3327, "step": 293 }, { "epoch": 1.7668711656441718, "grad_norm": 0.12550675868988037, "learning_rate": 2.5089934136108665e-07, "loss": 0.4567, "step": 294 }, { "epoch": 1.7730061349693251, "grad_norm": 0.11931753158569336, "learning_rate": 2.3558556562409074e-07, "loss": 0.5051, "step": 295 }, { "epoch": 1.7791411042944785, "grad_norm": 0.13427190482616425, "learning_rate": 2.2074269297119588e-07, "loss": 0.4541, "step": 296 }, { "epoch": 1.7852760736196318, "grad_norm": 0.09217509627342224, "learning_rate": 2.0637219043567636e-07, "loss": 0.3891, "step": 297 }, { "epoch": 1.7914110429447851, "grad_norm": 0.11418317258358002, "learning_rate": 1.9247547836289792e-07, "loss": 0.3991, "step": 298 }, { "epoch": 1.7975460122699385, "grad_norm": 0.09878183901309967, "learning_rate": 1.7905393026993513e-07, "loss": 0.3447, "step": 299 }, { "epoch": 1.803680981595092, "grad_norm": 0.10060778260231018, "learning_rate": 1.6610887270981425e-07, "loss": 0.3269, "step": 300 }, { "epoch": 1.8098159509202454, "grad_norm": 0.09531107544898987, "learning_rate": 1.5364158514040328e-07, "loss": 0.3163, "step": 301 }, { "epoch": 1.8159509202453987, "grad_norm": 0.11034092307090759, "learning_rate": 1.4165329979794972e-07, "loss": 0.349, "step": 302 }, { "epoch": 1.8220858895705523, "grad_norm": 0.10453791916370392, "learning_rate": 1.3014520157529244e-07, "loss": 0.3591, "step": 303 }, { "epoch": 1.8282208588957056, "grad_norm": 0.11782266944646835, "learning_rate": 1.1911842790474637e-07, "loss": 0.4248, "step": 304 }, { "epoch": 1.834355828220859, "grad_norm": 0.13830620050430298, "learning_rate": 1.0857406864568488e-07, "loss": 0.5286, "step": 305 }, { "epoch": 1.8404907975460123, "grad_norm": 0.08683877438306808, "learning_rate": 9.851316597681959e-08, "loss": 0.2167, "step": 306 }, { "epoch": 1.8466257668711656, "grad_norm": 0.1016291156411171, "learning_rate": 8.893671429319294e-08, "loss": 0.3983, "step": 307 }, { "epoch": 1.852760736196319, "grad_norm": 0.1077284961938858, "learning_rate": 7.984566010789673e-08, "loss": 0.3073, "step": 308 }, { "epoch": 1.8588957055214723, "grad_norm": 0.08103854209184647, "learning_rate": 7.124090195851807e-08, "loss": 0.3116, "step": 309 }, { "epoch": 1.8650306748466257, "grad_norm": 0.11006593704223633, "learning_rate": 6.31232903183332e-08, "loss": 0.4318, "step": 310 }, { "epoch": 1.871165644171779, "grad_norm": 0.09934885799884796, "learning_rate": 5.549362751224585e-08, "loss": 0.3649, "step": 311 }, { "epoch": 1.8773006134969326, "grad_norm": 0.11354987323284149, "learning_rate": 4.8352667637490694e-08, "loss": 0.4092, "step": 312 }, { "epoch": 1.883435582822086, "grad_norm": 0.10538670420646667, "learning_rate": 4.170111648909736e-08, "loss": 0.3204, "step": 313 }, { "epoch": 1.8895705521472392, "grad_norm": 0.11542137712240219, "learning_rate": 3.553963149013295e-08, "loss": 0.4189, "step": 314 }, { "epoch": 1.8957055214723928, "grad_norm": 0.11087686568498611, "learning_rate": 2.986882162672344e-08, "loss": 0.4718, "step": 315 }, { "epoch": 1.9018404907975461, "grad_norm": 0.11411860585212708, "learning_rate": 2.4689247387862934e-08, "loss": 0.3454, "step": 316 }, { "epoch": 1.9079754601226995, "grad_norm": 0.08472780138254166, "learning_rate": 2.000142071001632e-08, "loss": 0.2456, "step": 317 }, { "epoch": 1.9141104294478528, "grad_norm": 0.10252390056848526, "learning_rate": 1.580580492652084e-08, "loss": 0.4087, "step": 318 }, { "epoch": 1.9202453987730062, "grad_norm": 0.12601801753044128, "learning_rate": 1.2102814721791645e-08, "loss": 0.4071, "step": 319 }, { "epoch": 1.9263803680981595, "grad_norm": 0.08735030889511108, "learning_rate": 8.8928160903351e-09, "loss": 0.3744, "step": 320 }, { "epoch": 1.9325153374233128, "grad_norm": 0.10152660310268402, "learning_rate": 6.176126300573848e-09, "loss": 0.4182, "step": 321 }, { "epoch": 1.9386503067484662, "grad_norm": 0.11482395976781845, "learning_rate": 3.953013863490784e-09, "loss": 0.3801, "step": 322 }, { "epoch": 1.9447852760736195, "grad_norm": 0.09311243146657944, "learning_rate": 2.223698506088612e-09, "loss": 0.2806, "step": 323 }, { "epoch": 1.9509202453987728, "grad_norm": 0.1334279328584671, "learning_rate": 9.883511496722176e-10, "loss": 0.5107, "step": 324 }, { "epoch": 1.9570552147239264, "grad_norm": 0.09790550917387009, "learning_rate": 2.470938929571842e-10, "loss": 0.2982, "step": 325 }, { "epoch": 1.9631901840490797, "grad_norm": 0.11607832461595535, "learning_rate": 0.0, "loss": 0.3971, "step": 326 } ], "logging_steps": 1, "max_steps": 326, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 82, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8033259439603057e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }