{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 8070, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006195786864931847, "grad_norm": 11.78878402709961, "learning_rate": 5.947955390334573e-06, "loss": 3.345, "step": 50 }, { "epoch": 0.012391573729863693, "grad_norm": 5.766342639923096, "learning_rate": 1.2019826517967782e-05, "loss": 0.8099, "step": 100 }, { "epoch": 0.01858736059479554, "grad_norm": 3.6993865966796875, "learning_rate": 1.821561338289963e-05, "loss": 0.3079, "step": 150 }, { "epoch": 0.024783147459727387, "grad_norm": 3.7327823638916016, "learning_rate": 2.4411400247831474e-05, "loss": 0.2089, "step": 200 }, { "epoch": 0.030978934324659233, "grad_norm": 3.176992416381836, "learning_rate": 3.0607187112763326e-05, "loss": 0.1961, "step": 250 }, { "epoch": 0.03717472118959108, "grad_norm": 3.009659767150879, "learning_rate": 3.667905824039653e-05, "loss": 0.186, "step": 300 }, { "epoch": 0.04337050805452292, "grad_norm": 3.961405038833618, "learning_rate": 4.287484510532838e-05, "loss": 0.1607, "step": 350 }, { "epoch": 0.04956629491945477, "grad_norm": 2.818254232406616, "learning_rate": 4.907063197026023e-05, "loss": 0.1483, "step": 400 }, { "epoch": 0.055762081784386616, "grad_norm": 1.836232304573059, "learning_rate": 5.526641883519207e-05, "loss": 0.1391, "step": 450 }, { "epoch": 0.061957868649318466, "grad_norm": 2.5643157958984375, "learning_rate": 6.146220570012391e-05, "loss": 0.1343, "step": 500 }, { "epoch": 0.06815365551425032, "grad_norm": 2.9712822437286377, "learning_rate": 6.765799256505576e-05, "loss": 0.1235, "step": 550 }, { "epoch": 0.07434944237918216, "grad_norm": 3.6296300888061523, "learning_rate": 7.385377942998762e-05, "loss": 0.1222, "step": 600 }, { "epoch": 0.080545229244114, "grad_norm": 10.417726516723633, "learning_rate": 8.004956629491945e-05, "loss": 0.1186, "step": 650 }, { "epoch": 0.08674101610904585, "grad_norm": 2.5631349086761475, "learning_rate": 8.624535315985131e-05, "loss": 0.128, "step": 700 }, { "epoch": 0.09293680297397769, "grad_norm": 1.6498531103134155, "learning_rate": 9.244114002478316e-05, "loss": 0.1145, "step": 750 }, { "epoch": 0.09913258983890955, "grad_norm": 2.1243278980255127, "learning_rate": 9.8636926889715e-05, "loss": 0.1153, "step": 800 }, { "epoch": 0.10532837670384139, "grad_norm": 1.869721531867981, "learning_rate": 9.946303180503924e-05, "loss": 0.1143, "step": 850 }, { "epoch": 0.11152416356877323, "grad_norm": 1.236275315284729, "learning_rate": 9.877461104226904e-05, "loss": 0.1099, "step": 900 }, { "epoch": 0.11771995043370508, "grad_norm": 2.4983303546905518, "learning_rate": 9.808619027949883e-05, "loss": 0.1018, "step": 950 }, { "epoch": 0.12391573729863693, "grad_norm": 1.4123425483703613, "learning_rate": 9.739776951672863e-05, "loss": 0.1079, "step": 1000 }, { "epoch": 0.13011152416356878, "grad_norm": 2.497483253479004, "learning_rate": 9.670934875395842e-05, "loss": 0.0982, "step": 1050 }, { "epoch": 0.13630731102850063, "grad_norm": 1.100753903388977, "learning_rate": 9.602092799118821e-05, "loss": 0.0949, "step": 1100 }, { "epoch": 0.14250309789343246, "grad_norm": 2.238828182220459, "learning_rate": 9.533250722841801e-05, "loss": 0.0945, "step": 1150 }, { "epoch": 0.14869888475836432, "grad_norm": 1.5351171493530273, "learning_rate": 9.46440864656478e-05, "loss": 0.0952, "step": 1200 }, { "epoch": 0.15489467162329615, "grad_norm": 3.6645586490631104, "learning_rate": 9.39556657028776e-05, "loss": 0.1162, "step": 1250 }, { "epoch": 0.161090458488228, "grad_norm": 2.153944730758667, "learning_rate": 9.32672449401074e-05, "loss": 0.0965, "step": 1300 }, { "epoch": 0.16728624535315986, "grad_norm": 5.332737922668457, "learning_rate": 9.25788241773372e-05, "loss": 0.0847, "step": 1350 }, { "epoch": 0.1734820322180917, "grad_norm": 1.9773648977279663, "learning_rate": 9.189040341456699e-05, "loss": 0.0916, "step": 1400 }, { "epoch": 0.17967781908302355, "grad_norm": 1.072949767112732, "learning_rate": 9.120198265179678e-05, "loss": 0.0821, "step": 1450 }, { "epoch": 0.18587360594795538, "grad_norm": 0.5937812924385071, "learning_rate": 9.051356188902658e-05, "loss": 0.0693, "step": 1500 }, { "epoch": 0.19206939281288724, "grad_norm": 3.5840229988098145, "learning_rate": 8.982514112625638e-05, "loss": 0.0753, "step": 1550 }, { "epoch": 0.1982651796778191, "grad_norm": 1.8539464473724365, "learning_rate": 8.913672036348617e-05, "loss": 0.072, "step": 1600 }, { "epoch": 0.20446096654275092, "grad_norm": 1.8574514389038086, "learning_rate": 8.844829960071597e-05, "loss": 0.0718, "step": 1650 }, { "epoch": 0.21065675340768278, "grad_norm": 2.465460777282715, "learning_rate": 8.775987883794575e-05, "loss": 0.0715, "step": 1700 }, { "epoch": 0.21685254027261464, "grad_norm": 1.191398024559021, "learning_rate": 8.707145807517555e-05, "loss": 0.0711, "step": 1750 }, { "epoch": 0.22304832713754646, "grad_norm": 1.4969195127487183, "learning_rate": 8.638303731240535e-05, "loss": 0.0731, "step": 1800 }, { "epoch": 0.22924411400247832, "grad_norm": 1.8418350219726562, "learning_rate": 8.569461654963514e-05, "loss": 0.0636, "step": 1850 }, { "epoch": 0.23543990086741015, "grad_norm": 2.362734794616699, "learning_rate": 8.500619578686494e-05, "loss": 0.076, "step": 1900 }, { "epoch": 0.241635687732342, "grad_norm": 1.3227559328079224, "learning_rate": 8.431777502409473e-05, "loss": 0.06, "step": 1950 }, { "epoch": 0.24783147459727387, "grad_norm": 2.1812291145324707, "learning_rate": 8.362935426132452e-05, "loss": 0.0582, "step": 2000 }, { "epoch": 0.2540272614622057, "grad_norm": 0.8610977530479431, "learning_rate": 8.294093349855432e-05, "loss": 0.056, "step": 2050 }, { "epoch": 0.26022304832713755, "grad_norm": 1.2188324928283691, "learning_rate": 8.225251273578411e-05, "loss": 0.0539, "step": 2100 }, { "epoch": 0.2664188351920694, "grad_norm": 0.6571711301803589, "learning_rate": 8.156409197301392e-05, "loss": 0.0548, "step": 2150 }, { "epoch": 0.27261462205700127, "grad_norm": 9.587064743041992, "learning_rate": 8.08756712102437e-05, "loss": 0.0522, "step": 2200 }, { "epoch": 0.2788104089219331, "grad_norm": 1.4897613525390625, "learning_rate": 8.018725044747349e-05, "loss": 0.0429, "step": 2250 }, { "epoch": 0.2850061957868649, "grad_norm": 1.3524194955825806, "learning_rate": 7.94988296847033e-05, "loss": 0.0543, "step": 2300 }, { "epoch": 0.29120198265179675, "grad_norm": 0.49432888627052307, "learning_rate": 7.88241773371885e-05, "loss": 0.055, "step": 2350 }, { "epoch": 0.29739776951672864, "grad_norm": 1.082529902458191, "learning_rate": 7.813575657441828e-05, "loss": 0.0449, "step": 2400 }, { "epoch": 0.30359355638166047, "grad_norm": 3.580268144607544, "learning_rate": 7.744733581164808e-05, "loss": 0.0526, "step": 2450 }, { "epoch": 0.3097893432465923, "grad_norm": 3.7611606121063232, "learning_rate": 7.675891504887788e-05, "loss": 0.0432, "step": 2500 }, { "epoch": 0.3159851301115242, "grad_norm": 3.605032205581665, "learning_rate": 7.607049428610767e-05, "loss": 0.0513, "step": 2550 }, { "epoch": 0.322180916976456, "grad_norm": 0.8407074809074402, "learning_rate": 7.538207352333747e-05, "loss": 0.0478, "step": 2600 }, { "epoch": 0.32837670384138784, "grad_norm": 1.2463380098342896, "learning_rate": 7.469365276056726e-05, "loss": 0.041, "step": 2650 }, { "epoch": 0.3345724907063197, "grad_norm": 1.2079633474349976, "learning_rate": 7.400523199779705e-05, "loss": 0.0455, "step": 2700 }, { "epoch": 0.34076827757125155, "grad_norm": 1.7008190155029297, "learning_rate": 7.331681123502685e-05, "loss": 0.0515, "step": 2750 }, { "epoch": 0.3469640644361834, "grad_norm": 0.6003010272979736, "learning_rate": 7.262839047225664e-05, "loss": 0.0455, "step": 2800 }, { "epoch": 0.35315985130111527, "grad_norm": 0.7806116342544556, "learning_rate": 7.193996970948645e-05, "loss": 0.0429, "step": 2850 }, { "epoch": 0.3593556381660471, "grad_norm": 1.190233826637268, "learning_rate": 7.125154894671623e-05, "loss": 0.037, "step": 2900 }, { "epoch": 0.3655514250309789, "grad_norm": 0.49826720356941223, "learning_rate": 7.056312818394602e-05, "loss": 0.0329, "step": 2950 }, { "epoch": 0.37174721189591076, "grad_norm": 1.38163423538208, "learning_rate": 6.987470742117583e-05, "loss": 0.0381, "step": 3000 }, { "epoch": 0.37794299876084264, "grad_norm": 0.6930022835731506, "learning_rate": 6.918628665840562e-05, "loss": 0.0381, "step": 3050 }, { "epoch": 0.38413878562577447, "grad_norm": 0.3004520535469055, "learning_rate": 6.849786589563542e-05, "loss": 0.0373, "step": 3100 }, { "epoch": 0.3903345724907063, "grad_norm": 1.0762969255447388, "learning_rate": 6.78094451328652e-05, "loss": 0.0405, "step": 3150 }, { "epoch": 0.3965303593556382, "grad_norm": 0.9407225251197815, "learning_rate": 6.712102437009501e-05, "loss": 0.0344, "step": 3200 }, { "epoch": 0.40272614622057, "grad_norm": 1.1790508031845093, "learning_rate": 6.64326036073248e-05, "loss": 0.0345, "step": 3250 }, { "epoch": 0.40892193308550184, "grad_norm": 0.6774219274520874, "learning_rate": 6.574418284455459e-05, "loss": 0.0327, "step": 3300 }, { "epoch": 0.41511771995043373, "grad_norm": 0.5361367464065552, "learning_rate": 6.505576208178439e-05, "loss": 0.0271, "step": 3350 }, { "epoch": 0.42131350681536556, "grad_norm": 0.2760148048400879, "learning_rate": 6.436734131901418e-05, "loss": 0.0314, "step": 3400 }, { "epoch": 0.4275092936802974, "grad_norm": 0.7887628674507141, "learning_rate": 6.367892055624398e-05, "loss": 0.0352, "step": 3450 }, { "epoch": 0.43370508054522927, "grad_norm": 1.0329231023788452, "learning_rate": 6.299049979347378e-05, "loss": 0.0262, "step": 3500 }, { "epoch": 0.4399008674101611, "grad_norm": 0.9285022020339966, "learning_rate": 6.230207903070356e-05, "loss": 0.0295, "step": 3550 }, { "epoch": 0.44609665427509293, "grad_norm": 0.23095445334911346, "learning_rate": 6.161365826793336e-05, "loss": 0.0284, "step": 3600 }, { "epoch": 0.45229244114002476, "grad_norm": 0.42350977659225464, "learning_rate": 6.092523750516316e-05, "loss": 0.0248, "step": 3650 }, { "epoch": 0.45848822800495664, "grad_norm": 0.9503557085990906, "learning_rate": 6.023681674239295e-05, "loss": 0.0239, "step": 3700 }, { "epoch": 0.4646840148698885, "grad_norm": 1.4695123434066772, "learning_rate": 5.954839597962275e-05, "loss": 0.0272, "step": 3750 }, { "epoch": 0.4708798017348203, "grad_norm": 0.835435688495636, "learning_rate": 5.8859975216852544e-05, "loss": 0.0253, "step": 3800 }, { "epoch": 0.4770755885997522, "grad_norm": 0.8063173294067383, "learning_rate": 5.817155445408233e-05, "loss": 0.0288, "step": 3850 }, { "epoch": 0.483271375464684, "grad_norm": 0.5583626627922058, "learning_rate": 5.748313369131213e-05, "loss": 0.0267, "step": 3900 }, { "epoch": 0.48946716232961585, "grad_norm": 0.3900113105773926, "learning_rate": 5.6794712928541925e-05, "loss": 0.027, "step": 3950 }, { "epoch": 0.49566294919454773, "grad_norm": 0.684743344783783, "learning_rate": 5.610629216577172e-05, "loss": 0.0233, "step": 4000 }, { "epoch": 0.5018587360594795, "grad_norm": 1.3466298580169678, "learning_rate": 5.541787140300152e-05, "loss": 0.0237, "step": 4050 }, { "epoch": 0.5080545229244114, "grad_norm": 0.814662754535675, "learning_rate": 5.472945064023132e-05, "loss": 0.0224, "step": 4100 }, { "epoch": 0.5142503097893433, "grad_norm": 0.20325958728790283, "learning_rate": 5.40410298774611e-05, "loss": 0.0252, "step": 4150 }, { "epoch": 0.5204460966542751, "grad_norm": 0.9722644686698914, "learning_rate": 5.3352609114690896e-05, "loss": 0.0228, "step": 4200 }, { "epoch": 0.5266418835192069, "grad_norm": 0.8962277770042419, "learning_rate": 5.26641883519207e-05, "loss": 0.0243, "step": 4250 }, { "epoch": 0.5328376703841388, "grad_norm": 0.8505231738090515, "learning_rate": 5.1975767589150495e-05, "loss": 0.0269, "step": 4300 }, { "epoch": 0.5390334572490706, "grad_norm": 1.4101253747940063, "learning_rate": 5.128734682638029e-05, "loss": 0.0228, "step": 4350 }, { "epoch": 0.5452292441140025, "grad_norm": 1.0223798751831055, "learning_rate": 5.0598926063610086e-05, "loss": 0.0219, "step": 4400 }, { "epoch": 0.5514250309789344, "grad_norm": 0.3649487793445587, "learning_rate": 4.9910505300839875e-05, "loss": 0.0215, "step": 4450 }, { "epoch": 0.5576208178438662, "grad_norm": 0.467814564704895, "learning_rate": 4.922208453806967e-05, "loss": 0.0242, "step": 4500 }, { "epoch": 0.563816604708798, "grad_norm": 1.218153953552246, "learning_rate": 4.8533663775299466e-05, "loss": 0.0199, "step": 4550 }, { "epoch": 0.5700123915737298, "grad_norm": 0.2747304141521454, "learning_rate": 4.784524301252926e-05, "loss": 0.0196, "step": 4600 }, { "epoch": 0.5762081784386617, "grad_norm": 0.572721004486084, "learning_rate": 4.715682224975905e-05, "loss": 0.0171, "step": 4650 }, { "epoch": 0.5824039653035935, "grad_norm": 0.48146089911460876, "learning_rate": 4.646840148698885e-05, "loss": 0.0161, "step": 4700 }, { "epoch": 0.5885997521685254, "grad_norm": 0.3745860457420349, "learning_rate": 4.577998072421865e-05, "loss": 0.0182, "step": 4750 }, { "epoch": 0.5947955390334573, "grad_norm": 0.5441644191741943, "learning_rate": 4.509155996144844e-05, "loss": 0.019, "step": 4800 }, { "epoch": 0.6009913258983891, "grad_norm": 0.39052465558052063, "learning_rate": 4.4403139198678234e-05, "loss": 0.0192, "step": 4850 }, { "epoch": 0.6071871127633209, "grad_norm": 0.2999328672885895, "learning_rate": 4.371471843590803e-05, "loss": 0.018, "step": 4900 }, { "epoch": 0.6133828996282528, "grad_norm": 0.5317378044128418, "learning_rate": 4.3026297673137825e-05, "loss": 0.016, "step": 4950 }, { "epoch": 0.6195786864931846, "grad_norm": 1.1451416015625, "learning_rate": 4.233787691036762e-05, "loss": 0.0167, "step": 5000 }, { "epoch": 0.6257744733581165, "grad_norm": 1.846800684928894, "learning_rate": 4.164945614759742e-05, "loss": 0.0155, "step": 5050 }, { "epoch": 0.6319702602230484, "grad_norm": 0.2182386815547943, "learning_rate": 4.0961035384827206e-05, "loss": 0.0151, "step": 5100 }, { "epoch": 0.6381660470879802, "grad_norm": 0.6467189788818359, "learning_rate": 4.0272614622057e-05, "loss": 0.0169, "step": 5150 }, { "epoch": 0.644361833952912, "grad_norm": 1.209778070449829, "learning_rate": 3.95841938592868e-05, "loss": 0.0151, "step": 5200 }, { "epoch": 0.6505576208178439, "grad_norm": 0.8446183204650879, "learning_rate": 3.889577309651659e-05, "loss": 0.0152, "step": 5250 }, { "epoch": 0.6567534076827757, "grad_norm": 1.0092103481292725, "learning_rate": 3.820735233374639e-05, "loss": 0.0141, "step": 5300 }, { "epoch": 0.6629491945477075, "grad_norm": 0.4524877369403839, "learning_rate": 3.7518931570976184e-05, "loss": 0.0167, "step": 5350 }, { "epoch": 0.6691449814126395, "grad_norm": 0.09342479705810547, "learning_rate": 3.683051080820597e-05, "loss": 0.0159, "step": 5400 }, { "epoch": 0.6753407682775713, "grad_norm": 0.5114253759384155, "learning_rate": 3.6142090045435776e-05, "loss": 0.0163, "step": 5450 }, { "epoch": 0.6815365551425031, "grad_norm": 0.1552111655473709, "learning_rate": 3.545366928266557e-05, "loss": 0.0153, "step": 5500 }, { "epoch": 0.6877323420074349, "grad_norm": 0.7425574064254761, "learning_rate": 3.476524851989536e-05, "loss": 0.0122, "step": 5550 }, { "epoch": 0.6939281288723668, "grad_norm": 0.43186843395233154, "learning_rate": 3.4076827757125156e-05, "loss": 0.016, "step": 5600 }, { "epoch": 0.7001239157372986, "grad_norm": 0.7145938277244568, "learning_rate": 3.338840699435495e-05, "loss": 0.0169, "step": 5650 }, { "epoch": 0.7063197026022305, "grad_norm": 0.09214766323566437, "learning_rate": 3.269998623158475e-05, "loss": 0.0132, "step": 5700 }, { "epoch": 0.7125154894671624, "grad_norm": 1.138299584388733, "learning_rate": 3.201156546881454e-05, "loss": 0.0129, "step": 5750 }, { "epoch": 0.7187112763320942, "grad_norm": 1.3126403093338013, "learning_rate": 3.132314470604434e-05, "loss": 0.0128, "step": 5800 }, { "epoch": 0.724907063197026, "grad_norm": 1.0886186361312866, "learning_rate": 3.063472394327413e-05, "loss": 0.0119, "step": 5850 }, { "epoch": 0.7311028500619579, "grad_norm": 0.23192736506462097, "learning_rate": 2.9946303180503927e-05, "loss": 0.0118, "step": 5900 }, { "epoch": 0.7372986369268897, "grad_norm": 0.9914634227752686, "learning_rate": 2.9257882417733723e-05, "loss": 0.0138, "step": 5950 }, { "epoch": 0.7434944237918215, "grad_norm": 0.29283687472343445, "learning_rate": 2.8569461654963515e-05, "loss": 0.0123, "step": 6000 }, { "epoch": 0.7496902106567535, "grad_norm": 0.9173604249954224, "learning_rate": 2.788104089219331e-05, "loss": 0.0104, "step": 6050 }, { "epoch": 0.7558859975216853, "grad_norm": 0.35434839129447937, "learning_rate": 2.7192620129423103e-05, "loss": 0.0147, "step": 6100 }, { "epoch": 0.7620817843866171, "grad_norm": 0.18677830696105957, "learning_rate": 2.65041993666529e-05, "loss": 0.0102, "step": 6150 }, { "epoch": 0.7682775712515489, "grad_norm": 0.28843411803245544, "learning_rate": 2.5815778603882695e-05, "loss": 0.0125, "step": 6200 }, { "epoch": 0.7744733581164808, "grad_norm": 0.5970668792724609, "learning_rate": 2.5127357841112487e-05, "loss": 0.0133, "step": 6250 }, { "epoch": 0.7806691449814126, "grad_norm": 0.4063868224620819, "learning_rate": 2.4438937078342283e-05, "loss": 0.0134, "step": 6300 }, { "epoch": 0.7868649318463445, "grad_norm": 0.7077623605728149, "learning_rate": 2.375051631557208e-05, "loss": 0.0125, "step": 6350 }, { "epoch": 0.7930607187112764, "grad_norm": 0.056457001715898514, "learning_rate": 2.3062095552801874e-05, "loss": 0.0119, "step": 6400 }, { "epoch": 0.7992565055762082, "grad_norm": 0.0479193776845932, "learning_rate": 2.2373674790031666e-05, "loss": 0.0134, "step": 6450 }, { "epoch": 0.80545229244114, "grad_norm": 0.2201872020959854, "learning_rate": 2.1685254027261465e-05, "loss": 0.0144, "step": 6500 }, { "epoch": 0.8116480793060719, "grad_norm": 0.24738062918186188, "learning_rate": 2.0996833264491258e-05, "loss": 0.0119, "step": 6550 }, { "epoch": 0.8178438661710037, "grad_norm": 0.38614606857299805, "learning_rate": 2.0308412501721053e-05, "loss": 0.0109, "step": 6600 }, { "epoch": 0.8240396530359355, "grad_norm": 0.05887860804796219, "learning_rate": 1.9619991738950846e-05, "loss": 0.0098, "step": 6650 }, { "epoch": 0.8302354399008675, "grad_norm": 0.7075309753417969, "learning_rate": 1.893157097618064e-05, "loss": 0.0115, "step": 6700 }, { "epoch": 0.8364312267657993, "grad_norm": 0.39252975583076477, "learning_rate": 1.8243150213410437e-05, "loss": 0.0127, "step": 6750 }, { "epoch": 0.8426270136307311, "grad_norm": 0.4794786870479584, "learning_rate": 1.755472945064023e-05, "loss": 0.0102, "step": 6800 }, { "epoch": 0.8488228004956629, "grad_norm": 0.39937812089920044, "learning_rate": 1.686630868787003e-05, "loss": 0.011, "step": 6850 }, { "epoch": 0.8550185873605948, "grad_norm": 0.7926356792449951, "learning_rate": 1.617788792509982e-05, "loss": 0.0114, "step": 6900 }, { "epoch": 0.8612143742255266, "grad_norm": 1.0466240644454956, "learning_rate": 1.5489467162329617e-05, "loss": 0.0116, "step": 6950 }, { "epoch": 0.8674101610904585, "grad_norm": 0.6414132714271545, "learning_rate": 1.4801046399559412e-05, "loss": 0.0102, "step": 7000 }, { "epoch": 0.8736059479553904, "grad_norm": 0.8947381377220154, "learning_rate": 1.4112625636789206e-05, "loss": 0.0104, "step": 7050 }, { "epoch": 0.8798017348203222, "grad_norm": 0.8255850076675415, "learning_rate": 1.3424204874019e-05, "loss": 0.0098, "step": 7100 }, { "epoch": 0.885997521685254, "grad_norm": 0.16911354660987854, "learning_rate": 1.2735784111248796e-05, "loss": 0.01, "step": 7150 }, { "epoch": 0.8921933085501859, "grad_norm": 0.04949663579463959, "learning_rate": 1.204736334847859e-05, "loss": 0.01, "step": 7200 }, { "epoch": 0.8983890954151177, "grad_norm": 0.3117857277393341, "learning_rate": 1.1358942585708386e-05, "loss": 0.0085, "step": 7250 }, { "epoch": 0.9045848822800495, "grad_norm": 0.38259008526802063, "learning_rate": 1.067052182293818e-05, "loss": 0.0106, "step": 7300 }, { "epoch": 0.9107806691449815, "grad_norm": 0.423921138048172, "learning_rate": 9.982101060167976e-06, "loss": 0.0085, "step": 7350 }, { "epoch": 0.9169764560099133, "grad_norm": 0.2425990253686905, "learning_rate": 9.29368029739777e-06, "loss": 0.0092, "step": 7400 }, { "epoch": 0.9231722428748451, "grad_norm": 0.17992499470710754, "learning_rate": 8.605259534627565e-06, "loss": 0.0095, "step": 7450 }, { "epoch": 0.929368029739777, "grad_norm": 0.634684681892395, "learning_rate": 7.91683877185736e-06, "loss": 0.0099, "step": 7500 }, { "epoch": 0.9355638166047088, "grad_norm": 0.92573481798172, "learning_rate": 7.228418009087154e-06, "loss": 0.0113, "step": 7550 }, { "epoch": 0.9417596034696406, "grad_norm": 0.07547607272863388, "learning_rate": 6.539997246316949e-06, "loss": 0.0077, "step": 7600 }, { "epoch": 0.9479553903345725, "grad_norm": 0.8026949167251587, "learning_rate": 5.851576483546744e-06, "loss": 0.0095, "step": 7650 }, { "epoch": 0.9541511771995044, "grad_norm": 0.40003785490989685, "learning_rate": 5.163155720776539e-06, "loss": 0.0077, "step": 7700 }, { "epoch": 0.9603469640644362, "grad_norm": 0.1786423921585083, "learning_rate": 4.474734958006334e-06, "loss": 0.0089, "step": 7750 }, { "epoch": 0.966542750929368, "grad_norm": 1.094724416732788, "learning_rate": 3.7863141952361286e-06, "loss": 0.009, "step": 7800 }, { "epoch": 0.9727385377942999, "grad_norm": 0.7275987863540649, "learning_rate": 3.0978934324659235e-06, "loss": 0.0077, "step": 7850 }, { "epoch": 0.9789343246592317, "grad_norm": 0.17183449864387512, "learning_rate": 2.409472669695718e-06, "loss": 0.0101, "step": 7900 }, { "epoch": 0.9851301115241635, "grad_norm": 0.138813316822052, "learning_rate": 1.721051906925513e-06, "loss": 0.0088, "step": 7950 }, { "epoch": 0.9913258983890955, "grad_norm": 0.39123526215553284, "learning_rate": 1.0326311441553077e-06, "loss": 0.0098, "step": 8000 }, { "epoch": 0.9975216852540273, "grad_norm": 0.0659351795911789, "learning_rate": 3.442103813851026e-07, "loss": 0.0102, "step": 8050 } ], "logging_steps": 50, "max_steps": 8070, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }