{ "best_metric": 0.20296970009803772, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 2.0106331561140647, "eval_steps": 25, "global_step": 130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015466408893185114, "grad_norm": 0.34002751111984253, "learning_rate": 2.9999999999999997e-05, "loss": 0.4411, "step": 1 }, { "epoch": 0.015466408893185114, "eval_loss": 1.1426082849502563, "eval_runtime": 7.1707, "eval_samples_per_second": 6.973, "eval_steps_per_second": 6.973, "step": 1 }, { "epoch": 0.030932817786370227, "grad_norm": 0.4139275550842285, "learning_rate": 5.9999999999999995e-05, "loss": 0.4555, "step": 2 }, { "epoch": 0.04639922667955534, "grad_norm": 0.481924831867218, "learning_rate": 8.999999999999999e-05, "loss": 0.424, "step": 3 }, { "epoch": 0.061865635572740454, "grad_norm": 0.4299992322921753, "learning_rate": 0.00011999999999999999, "loss": 0.3855, "step": 4 }, { "epoch": 0.07733204446592556, "grad_norm": 0.42819246649742126, "learning_rate": 0.00015, "loss": 0.4611, "step": 5 }, { "epoch": 0.09279845335911067, "grad_norm": 0.9149371385574341, "learning_rate": 0.00017999999999999998, "loss": 0.7301, "step": 6 }, { "epoch": 0.1082648622522958, "grad_norm": 1.1431907415390015, "learning_rate": 0.00020999999999999998, "loss": 0.7905, "step": 7 }, { "epoch": 0.12373127114548091, "grad_norm": 1.802942156791687, "learning_rate": 0.00023999999999999998, "loss": 0.6581, "step": 8 }, { "epoch": 0.13919768003866603, "grad_norm": 2.6315770149230957, "learning_rate": 0.00027, "loss": 0.6625, "step": 9 }, { "epoch": 0.15466408893185113, "grad_norm": 1.5995557308197021, "learning_rate": 0.0003, "loss": 0.5518, "step": 10 }, { "epoch": 0.17013049782503625, "grad_norm": 1.1412254571914673, "learning_rate": 0.00029994859874633357, "loss": 0.4248, "step": 11 }, { "epoch": 0.18559690671822135, "grad_norm": 1.1108283996582031, "learning_rate": 0.000299794430213186, "loss": 0.4044, "step": 12 }, { "epoch": 0.20106331561140647, "grad_norm": 1.4058749675750732, "learning_rate": 0.00029953760005996916, "loss": 0.4258, "step": 13 }, { "epoch": 0.2165297245045916, "grad_norm": 0.8541085124015808, "learning_rate": 0.00029917828430524096, "loss": 0.419, "step": 14 }, { "epoch": 0.2319961333977767, "grad_norm": 0.7577608823776245, "learning_rate": 0.00029871672920607153, "loss": 0.4055, "step": 15 }, { "epoch": 0.24746254229096182, "grad_norm": 0.8010592460632324, "learning_rate": 0.00029815325108927063, "loss": 0.4587, "step": 16 }, { "epoch": 0.26292895118414694, "grad_norm": 2.178513526916504, "learning_rate": 0.00029748823613459316, "loss": 0.3844, "step": 17 }, { "epoch": 0.27839536007733207, "grad_norm": 1.049085021018982, "learning_rate": 0.0002967221401100708, "loss": 0.3085, "step": 18 }, { "epoch": 0.29386176897051713, "grad_norm": 0.2664034962654114, "learning_rate": 0.0002958554880596515, "loss": 0.2314, "step": 19 }, { "epoch": 0.30932817786370226, "grad_norm": 0.9501022696495056, "learning_rate": 0.0002948888739433602, "loss": 0.2162, "step": 20 }, { "epoch": 0.3247945867568874, "grad_norm": 0.29297515749931335, "learning_rate": 0.00029382296023022894, "loss": 0.1734, "step": 21 }, { "epoch": 0.3402609956500725, "grad_norm": 0.29325342178344727, "learning_rate": 0.00029265847744427303, "loss": 0.2472, "step": 22 }, { "epoch": 0.35572740454325763, "grad_norm": 0.7397837042808533, "learning_rate": 0.00029139622366382674, "loss": 0.3872, "step": 23 }, { "epoch": 0.3711938134364427, "grad_norm": 0.5978482365608215, "learning_rate": 0.00029003706397458023, "loss": 0.3756, "step": 24 }, { "epoch": 0.3866602223296278, "grad_norm": 0.8143917918205261, "learning_rate": 0.000288581929876693, "loss": 0.3305, "step": 25 }, { "epoch": 0.3866602223296278, "eval_loss": 0.2919376492500305, "eval_runtime": 7.3131, "eval_samples_per_second": 6.837, "eval_steps_per_second": 6.837, "step": 25 }, { "epoch": 0.40212663122281295, "grad_norm": 0.5971204042434692, "learning_rate": 0.0002870318186463901, "loss": 0.329, "step": 26 }, { "epoch": 0.41759304011599807, "grad_norm": 0.5656357407569885, "learning_rate": 0.0002853877926524791, "loss": 0.2991, "step": 27 }, { "epoch": 0.4330594490091832, "grad_norm": 0.9046909809112549, "learning_rate": 0.00028365097862825513, "loss": 0.3522, "step": 28 }, { "epoch": 0.4485258579023683, "grad_norm": 1.8343451023101807, "learning_rate": 0.00028182256689929475, "loss": 0.3253, "step": 29 }, { "epoch": 0.4639922667955534, "grad_norm": 1.3387500047683716, "learning_rate": 0.0002799038105676658, "loss": 0.3363, "step": 30 }, { "epoch": 0.4794586756887385, "grad_norm": 0.7546955943107605, "learning_rate": 0.0002778960246531138, "loss": 0.3319, "step": 31 }, { "epoch": 0.49492508458192364, "grad_norm": 0.5383890271186829, "learning_rate": 0.0002758005851918136, "loss": 0.4198, "step": 32 }, { "epoch": 0.5103914934751087, "grad_norm": 2.9426941871643066, "learning_rate": 0.0002736189282933023, "loss": 0.3846, "step": 33 }, { "epoch": 0.5258579023682939, "grad_norm": 1.3911627531051636, "learning_rate": 0.0002713525491562421, "loss": 0.2681, "step": 34 }, { "epoch": 0.541324311261479, "grad_norm": 0.33964619040489197, "learning_rate": 0.00026900300104368524, "loss": 0.2078, "step": 35 }, { "epoch": 0.5567907201546641, "grad_norm": 0.6098275780677795, "learning_rate": 0.0002665718942185456, "loss": 0.2027, "step": 36 }, { "epoch": 0.5722571290478492, "grad_norm": 0.5025777816772461, "learning_rate": 0.00026406089484000466, "loss": 0.206, "step": 37 }, { "epoch": 0.5877235379410343, "grad_norm": 0.4298511743545532, "learning_rate": 0.00026147172382160914, "loss": 0.2803, "step": 38 }, { "epoch": 0.6031899468342194, "grad_norm": 0.9432958364486694, "learning_rate": 0.00025880615565184313, "loss": 0.3849, "step": 39 }, { "epoch": 0.6186563557274045, "grad_norm": 0.6062995791435242, "learning_rate": 0.00025606601717798207, "loss": 0.3508, "step": 40 }, { "epoch": 0.6341227646205897, "grad_norm": 0.5804493427276611, "learning_rate": 0.0002532531863540631, "loss": 0.3212, "step": 41 }, { "epoch": 0.6495891735137748, "grad_norm": 0.5816200375556946, "learning_rate": 0.0002503695909538287, "loss": 0.2876, "step": 42 }, { "epoch": 0.6650555824069598, "grad_norm": 0.4420316219329834, "learning_rate": 0.0002474172072495275, "loss": 0.3054, "step": 43 }, { "epoch": 0.680521991300145, "grad_norm": 0.45776253938674927, "learning_rate": 0.0002443980586574756, "loss": 0.279, "step": 44 }, { "epoch": 0.6959884001933301, "grad_norm": 0.7748484015464783, "learning_rate": 0.00024131421435130807, "loss": 0.2875, "step": 45 }, { "epoch": 0.7114548090865153, "grad_norm": 0.5803005695343018, "learning_rate": 0.00023816778784387094, "loss": 0.3088, "step": 46 }, { "epoch": 0.7269212179797003, "grad_norm": 1.1293902397155762, "learning_rate": 0.0002349609355387249, "loss": 0.3188, "step": 47 }, { "epoch": 0.7423876268728854, "grad_norm": 1.2384657859802246, "learning_rate": 0.00023169585525225405, "loss": 0.4065, "step": 48 }, { "epoch": 0.7578540357660706, "grad_norm": 1.8807259798049927, "learning_rate": 0.0002283747847073923, "loss": 0.2961, "step": 49 }, { "epoch": 0.7733204446592556, "grad_norm": 0.7909408211708069, "learning_rate": 0.000225, "loss": 0.2371, "step": 50 }, { "epoch": 0.7733204446592556, "eval_loss": 0.26686012744903564, "eval_runtime": 7.3174, "eval_samples_per_second": 6.833, "eval_steps_per_second": 6.833, "step": 50 }, { "epoch": 0.7887868535524408, "grad_norm": 0.3425706624984741, "learning_rate": 0.00022157381403894124, "loss": 0.1907, "step": 51 }, { "epoch": 0.8042532624456259, "grad_norm": 0.5927841663360596, "learning_rate": 0.00021809857496093199, "loss": 0.1673, "step": 52 }, { "epoch": 0.8197196713388111, "grad_norm": 0.3090386986732483, "learning_rate": 0.00021457666452124428, "loss": 0.2062, "step": 53 }, { "epoch": 0.8351860802319961, "grad_norm": 0.7005581855773926, "learning_rate": 0.00021101049646137003, "loss": 0.3042, "step": 54 }, { "epoch": 0.8506524891251812, "grad_norm": 0.44823092222213745, "learning_rate": 0.00020740251485476345, "loss": 0.2754, "step": 55 }, { "epoch": 0.8661188980183664, "grad_norm": 0.43465909361839294, "learning_rate": 0.000203755192431795, "loss": 0.2858, "step": 56 }, { "epoch": 0.8815853069115515, "grad_norm": 0.45701804757118225, "learning_rate": 0.0002000710288850656, "loss": 0.2677, "step": 57 }, { "epoch": 0.8970517158047366, "grad_norm": 0.36131733655929565, "learning_rate": 0.0001963525491562421, "loss": 0.2443, "step": 58 }, { "epoch": 0.9125181246979217, "grad_norm": 0.3090292811393738, "learning_rate": 0.0001926023017055884, "loss": 0.2337, "step": 59 }, { "epoch": 0.9279845335911068, "grad_norm": 0.4051980972290039, "learning_rate": 0.0001888228567653781, "loss": 0.2617, "step": 60 }, { "epoch": 0.943450942484292, "grad_norm": 0.4554498493671417, "learning_rate": 0.0001850168045783858, "loss": 0.2417, "step": 61 }, { "epoch": 0.958917351377477, "grad_norm": 0.6270922422409058, "learning_rate": 0.00018118675362266385, "loss": 0.2552, "step": 62 }, { "epoch": 0.9743837602706622, "grad_norm": 0.750235378742218, "learning_rate": 0.00017733532882382213, "loss": 0.2594, "step": 63 }, { "epoch": 0.9898501691638473, "grad_norm": 1.0947400331497192, "learning_rate": 0.00017346516975603462, "loss": 0.339, "step": 64 }, { "epoch": 1.0053165780570323, "grad_norm": 0.9929707050323486, "learning_rate": 0.00016957892883300775, "loss": 0.3453, "step": 65 }, { "epoch": 1.0207829869502174, "grad_norm": 0.45975467562675476, "learning_rate": 0.000165679269490148, "loss": 0.2237, "step": 66 }, { "epoch": 1.0362493958434027, "grad_norm": 0.28348809480667114, "learning_rate": 0.00016176886435917675, "loss": 0.1931, "step": 67 }, { "epoch": 1.0517158047365878, "grad_norm": 0.22970043122768402, "learning_rate": 0.0001578503934364416, "loss": 0.1668, "step": 68 }, { "epoch": 1.0671822136297728, "grad_norm": 0.2886936664581299, "learning_rate": 0.00015392654224618098, "loss": 0.1539, "step": 69 }, { "epoch": 1.082648622522958, "grad_norm": 0.3277800977230072, "learning_rate": 0.00015, "loss": 0.2382, "step": 70 }, { "epoch": 1.098115031416143, "grad_norm": 0.7377809882164001, "learning_rate": 0.00014607345775381904, "loss": 0.2307, "step": 71 }, { "epoch": 1.1135814403093283, "grad_norm": 0.4106524884700775, "learning_rate": 0.0001421496065635584, "loss": 0.2433, "step": 72 }, { "epoch": 1.1290478492025133, "grad_norm": 0.5610142350196838, "learning_rate": 0.00013823113564082325, "loss": 0.2568, "step": 73 }, { "epoch": 1.1445142580956984, "grad_norm": 0.9046674370765686, "learning_rate": 0.000134320730509852, "loss": 0.2751, "step": 74 }, { "epoch": 1.1599806669888835, "grad_norm": 0.617001473903656, "learning_rate": 0.00013042107116699228, "loss": 0.2139, "step": 75 }, { "epoch": 1.1599806669888835, "eval_loss": 0.2165301889181137, "eval_runtime": 7.3111, "eval_samples_per_second": 6.839, "eval_steps_per_second": 6.839, "step": 75 }, { "epoch": 1.1754470758820685, "grad_norm": 0.6039713025093079, "learning_rate": 0.00012653483024396533, "loss": 0.2185, "step": 76 }, { "epoch": 1.1909134847752538, "grad_norm": 0.4915192127227783, "learning_rate": 0.00012266467117617787, "loss": 0.2277, "step": 77 }, { "epoch": 1.206379893668439, "grad_norm": 0.31167200207710266, "learning_rate": 0.00011881324637733611, "loss": 0.2205, "step": 78 }, { "epoch": 1.221846302561624, "grad_norm": 0.4483063817024231, "learning_rate": 0.00011498319542161423, "loss": 0.24, "step": 79 }, { "epoch": 1.237312711454809, "grad_norm": 0.4659959077835083, "learning_rate": 0.00011117714323462186, "loss": 0.2655, "step": 80 }, { "epoch": 1.252779120347994, "grad_norm": 0.2737232446670532, "learning_rate": 0.0001073976982944116, "loss": 0.2592, "step": 81 }, { "epoch": 1.2682455292411794, "grad_norm": 0.49031612277030945, "learning_rate": 0.0001036474508437579, "loss": 0.2176, "step": 82 }, { "epoch": 1.2837119381343645, "grad_norm": 0.2844369113445282, "learning_rate": 9.992897111493437e-05, "loss": 0.1823, "step": 83 }, { "epoch": 1.2991783470275495, "grad_norm": 0.30533647537231445, "learning_rate": 9.624480756820496e-05, "loss": 0.1705, "step": 84 }, { "epoch": 1.3146447559207346, "grad_norm": 0.37838345766067505, "learning_rate": 9.259748514523653e-05, "loss": 0.1426, "step": 85 }, { "epoch": 1.3301111648139199, "grad_norm": 0.4842795133590698, "learning_rate": 8.898950353862998e-05, "loss": 0.1921, "step": 86 }, { "epoch": 1.345577573707105, "grad_norm": 0.7472468018531799, "learning_rate": 8.54233354787557e-05, "loss": 0.241, "step": 87 }, { "epoch": 1.36104398260029, "grad_norm": 0.8189084529876709, "learning_rate": 8.190142503906798e-05, "loss": 0.2342, "step": 88 }, { "epoch": 1.376510391493475, "grad_norm": 0.45445919036865234, "learning_rate": 7.842618596105872e-05, "loss": 0.2118, "step": 89 }, { "epoch": 1.3919768003866602, "grad_norm": 0.4599757492542267, "learning_rate": 7.500000000000002e-05, "loss": 0.212, "step": 90 }, { "epoch": 1.4074432092798452, "grad_norm": 0.6367545127868652, "learning_rate": 7.162521529260767e-05, "loss": 0.2341, "step": 91 }, { "epoch": 1.4229096181730305, "grad_norm": 0.5731837749481201, "learning_rate": 6.830414474774594e-05, "loss": 0.2237, "step": 92 }, { "epoch": 1.4383760270662156, "grad_norm": 0.5188281536102295, "learning_rate": 6.50390644612751e-05, "loss": 0.2233, "step": 93 }, { "epoch": 1.4538424359594007, "grad_norm": 0.3524056375026703, "learning_rate": 6.183221215612904e-05, "loss": 0.236, "step": 94 }, { "epoch": 1.4693088448525857, "grad_norm": 0.28299957513809204, "learning_rate": 5.8685785648691894e-05, "loss": 0.2433, "step": 95 }, { "epoch": 1.484775253745771, "grad_norm": 0.3486190736293793, "learning_rate": 5.56019413425244e-05, "loss": 0.2873, "step": 96 }, { "epoch": 1.500241662638956, "grad_norm": 0.28295159339904785, "learning_rate": 5.2582792750472464e-05, "loss": 0.2721, "step": 97 }, { "epoch": 1.5157080715321412, "grad_norm": 0.3966909646987915, "learning_rate": 4.963040904617131e-05, "loss": 0.1962, "step": 98 }, { "epoch": 1.5311744804253262, "grad_norm": 0.3956305682659149, "learning_rate": 4.6746813645936877e-05, "loss": 0.1928, "step": 99 }, { "epoch": 1.5466408893185113, "grad_norm": 0.28398260474205017, "learning_rate": 4.3933982822017876e-05, "loss": 0.1685, "step": 100 }, { "epoch": 1.5466408893185113, "eval_loss": 0.20296970009803772, "eval_runtime": 7.3114, "eval_samples_per_second": 6.839, "eval_steps_per_second": 6.839, "step": 100 }, { "epoch": 1.5621072982116964, "grad_norm": 0.1923961192369461, "learning_rate": 4.1193844348156886e-05, "loss": 0.1462, "step": 101 }, { "epoch": 1.5775737071048814, "grad_norm": 0.18478325009346008, "learning_rate": 3.852827617839084e-05, "loss": 0.1648, "step": 102 }, { "epoch": 1.5930401159980667, "grad_norm": 0.30043265223503113, "learning_rate": 3.593910515999536e-05, "loss": 0.1844, "step": 103 }, { "epoch": 1.6085065248912518, "grad_norm": 0.5838200449943542, "learning_rate": 3.342810578145436e-05, "loss": 0.2355, "step": 104 }, { "epoch": 1.623972933784437, "grad_norm": 0.5884765386581421, "learning_rate": 3.099699895631474e-05, "loss": 0.2287, "step": 105 }, { "epoch": 1.6394393426776221, "grad_norm": 0.4631075859069824, "learning_rate": 2.8647450843757897e-05, "loss": 0.2004, "step": 106 }, { "epoch": 1.6549057515708072, "grad_norm": 0.3879186809062958, "learning_rate": 2.6381071706697644e-05, "loss": 0.1996, "step": 107 }, { "epoch": 1.6703721604639923, "grad_norm": 0.2909069359302521, "learning_rate": 2.4199414808186406e-05, "loss": 0.1985, "step": 108 }, { "epoch": 1.6858385693571774, "grad_norm": 0.34202706813812256, "learning_rate": 2.210397534688617e-05, "loss": 0.1848, "step": 109 }, { "epoch": 1.7013049782503624, "grad_norm": 0.3348611295223236, "learning_rate": 2.009618943233419e-05, "loss": 0.2004, "step": 110 }, { "epoch": 1.7167713871435475, "grad_norm": 0.4890515208244324, "learning_rate": 1.8177433100705207e-05, "loss": 0.2107, "step": 111 }, { "epoch": 1.7322377960367326, "grad_norm": 0.560546875, "learning_rate": 1.634902137174483e-05, "loss": 0.2933, "step": 112 }, { "epoch": 1.7477042049299178, "grad_norm": 0.5725692510604858, "learning_rate": 1.4612207347520938e-05, "loss": 0.2927, "step": 113 }, { "epoch": 1.763170613823103, "grad_norm": 0.33504927158355713, "learning_rate": 1.2968181353609852e-05, "loss": 0.1681, "step": 114 }, { "epoch": 1.7786370227162882, "grad_norm": 0.34945377707481384, "learning_rate": 1.1418070123306989e-05, "loss": 0.1594, "step": 115 }, { "epoch": 1.7941034316094733, "grad_norm": 0.31711897253990173, "learning_rate": 9.962936025419754e-06, "loss": 0.1475, "step": 116 }, { "epoch": 1.8095698405026583, "grad_norm": 0.25584596395492554, "learning_rate": 8.603776336173235e-06, "loss": 0.1476, "step": 117 }, { "epoch": 1.8250362493958434, "grad_norm": 0.34091395139694214, "learning_rate": 7.34152255572697e-06, "loss": 0.1856, "step": 118 }, { "epoch": 1.8405026582890285, "grad_norm": 0.3409639894962311, "learning_rate": 6.1770397697710414e-06, "loss": 0.1982, "step": 119 }, { "epoch": 1.8559690671822135, "grad_norm": 0.2535560131072998, "learning_rate": 5.11112605663977e-06, "loss": 0.1881, "step": 120 }, { "epoch": 1.8714354760753986, "grad_norm": 0.26184314489364624, "learning_rate": 4.144511940348516e-06, "loss": 0.1976, "step": 121 }, { "epoch": 1.886901884968584, "grad_norm": 0.28429871797561646, "learning_rate": 3.2778598899291465e-06, "loss": 0.1938, "step": 122 }, { "epoch": 1.902368293861769, "grad_norm": 0.23394553363323212, "learning_rate": 2.511763865406824e-06, "loss": 0.1655, "step": 123 }, { "epoch": 1.917834702754954, "grad_norm": 0.36569544672966003, "learning_rate": 1.8467489107293509e-06, "loss": 0.2076, "step": 124 }, { "epoch": 1.9333011116481393, "grad_norm": 0.38919389247894287, "learning_rate": 1.2832707939284427e-06, "loss": 0.1977, "step": 125 }, { "epoch": 1.9333011116481393, "eval_loss": 0.18706867098808289, "eval_runtime": 7.3048, "eval_samples_per_second": 6.845, "eval_steps_per_second": 6.845, "step": 125 }, { "epoch": 1.9487675205413244, "grad_norm": 0.2989993989467621, "learning_rate": 8.217156947590064e-07, "loss": 0.2162, "step": 126 }, { "epoch": 1.9642339294345095, "grad_norm": 0.41685956716537476, "learning_rate": 4.623999400308054e-07, "loss": 0.2151, "step": 127 }, { "epoch": 1.9797003383276945, "grad_norm": 0.2608714699745178, "learning_rate": 2.05569786813925e-07, "loss": 0.2491, "step": 128 }, { "epoch": 1.9951667472208796, "grad_norm": 0.4548462927341461, "learning_rate": 5.1401253666411016e-08, "loss": 0.333, "step": 129 }, { "epoch": 2.0106331561140647, "grad_norm": 0.17462550103664398, "learning_rate": 0.0, "loss": 0.1465, "step": 130 } ], "logging_steps": 1, "max_steps": 130, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.052912237505413e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }