{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500.0, "global_step": 36350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001375515818431912, "grad_norm": 0.06477122753858566, "learning_rate": 0.0001, "loss": 1.7497, "step": 50 }, { "epoch": 0.002751031636863824, "grad_norm": 0.08175177872180939, "learning_rate": 0.0001, "loss": 1.7218, "step": 100 }, { "epoch": 0.0041265474552957355, "grad_norm": 0.08908290416002274, "learning_rate": 0.0001, "loss": 1.7102, "step": 150 }, { "epoch": 0.005502063273727648, "grad_norm": 0.08645089715719223, "learning_rate": 0.0001, "loss": 1.6996, "step": 200 }, { "epoch": 0.0068775790921595595, "grad_norm": 0.07688608765602112, "learning_rate": 0.0001, "loss": 1.6968, "step": 250 }, { "epoch": 0.008253094910591471, "grad_norm": 0.08298292011022568, "learning_rate": 0.0001, "loss": 1.6922, "step": 300 }, { "epoch": 0.009628610729023384, "grad_norm": 0.07124519348144531, "learning_rate": 0.0001, "loss": 1.6874, "step": 350 }, { "epoch": 0.011004126547455296, "grad_norm": 0.0821714997291565, "learning_rate": 0.0001, "loss": 1.6838, "step": 400 }, { "epoch": 0.012379642365887207, "grad_norm": 0.11138464510440826, "learning_rate": 0.0001, "loss": 1.6807, "step": 450 }, { "epoch": 0.013755158184319119, "grad_norm": 0.09057251363992691, "learning_rate": 0.0001, "loss": 1.6772, "step": 500 }, { "epoch": 0.015130674002751032, "grad_norm": 0.10090968757867813, "learning_rate": 0.0001, "loss": 1.6718, "step": 550 }, { "epoch": 0.016506189821182942, "grad_norm": 0.08569780737161636, "learning_rate": 0.0001, "loss": 1.6702, "step": 600 }, { "epoch": 0.017881705639614855, "grad_norm": 0.07728252559900284, "learning_rate": 0.0001, "loss": 1.6671, "step": 650 }, { "epoch": 0.01925722145804677, "grad_norm": 0.08100250363349915, "learning_rate": 0.0001, "loss": 1.6641, "step": 700 }, { "epoch": 0.02063273727647868, "grad_norm": 0.09590116143226624, "learning_rate": 0.0001, "loss": 1.6616, "step": 750 }, { "epoch": 0.02200825309491059, "grad_norm": 0.10437134653329849, "learning_rate": 0.0001, "loss": 1.6607, "step": 800 }, { "epoch": 0.023383768913342505, "grad_norm": 0.08097755908966064, "learning_rate": 0.0001, "loss": 1.6578, "step": 850 }, { "epoch": 0.024759284731774415, "grad_norm": 0.08555827289819717, "learning_rate": 0.0001, "loss": 1.6548, "step": 900 }, { "epoch": 0.026134800550206328, "grad_norm": 0.10720808058977127, "learning_rate": 0.0001, "loss": 1.6528, "step": 950 }, { "epoch": 0.027510316368638238, "grad_norm": 0.11773797124624252, "learning_rate": 0.0001, "loss": 1.6511, "step": 1000 }, { "epoch": 0.02888583218707015, "grad_norm": 0.10159046947956085, "learning_rate": 0.0001, "loss": 1.6474, "step": 1050 }, { "epoch": 0.030261348005502064, "grad_norm": 0.08796145766973495, "learning_rate": 0.0001, "loss": 1.647, "step": 1100 }, { "epoch": 0.03163686382393398, "grad_norm": 0.08194500207901001, "learning_rate": 0.0001, "loss": 1.6459, "step": 1150 }, { "epoch": 0.033012379642365884, "grad_norm": 0.0940510481595993, "learning_rate": 0.0001, "loss": 1.6429, "step": 1200 }, { "epoch": 0.0343878954607978, "grad_norm": 0.08046701550483704, "learning_rate": 0.0001, "loss": 1.6407, "step": 1250 }, { "epoch": 0.03576341127922971, "grad_norm": 0.07953349500894547, "learning_rate": 0.0001, "loss": 1.6407, "step": 1300 }, { "epoch": 0.037138927097661624, "grad_norm": 0.0876886174082756, "learning_rate": 0.0001, "loss": 1.6378, "step": 1350 }, { "epoch": 0.03851444291609354, "grad_norm": 0.0870981439948082, "learning_rate": 0.0001, "loss": 1.6408, "step": 1400 }, { "epoch": 0.039889958734525444, "grad_norm": 0.09412265568971634, "learning_rate": 0.0001, "loss": 1.638, "step": 1450 }, { "epoch": 0.04126547455295736, "grad_norm": 0.08362641930580139, "learning_rate": 0.0001, "loss": 1.6344, "step": 1500 }, { "epoch": 0.04264099037138927, "grad_norm": 0.11198284476995468, "learning_rate": 0.0001, "loss": 1.6354, "step": 1550 }, { "epoch": 0.04401650618982118, "grad_norm": 0.09470899403095245, "learning_rate": 0.0001, "loss": 1.6337, "step": 1600 }, { "epoch": 0.0453920220082531, "grad_norm": 0.11157640069723129, "learning_rate": 0.0001, "loss": 1.6316, "step": 1650 }, { "epoch": 0.04676753782668501, "grad_norm": 0.08970475941896439, "learning_rate": 0.0001, "loss": 1.6324, "step": 1700 }, { "epoch": 0.048143053645116916, "grad_norm": 0.09438284486532211, "learning_rate": 0.0001, "loss": 1.6299, "step": 1750 }, { "epoch": 0.04951856946354883, "grad_norm": 0.09604686498641968, "learning_rate": 0.0001, "loss": 1.6278, "step": 1800 }, { "epoch": 0.05089408528198074, "grad_norm": 0.09955621510744095, "learning_rate": 0.0001, "loss": 1.6282, "step": 1850 }, { "epoch": 0.052269601100412656, "grad_norm": 0.10360520333051682, "learning_rate": 0.0001, "loss": 1.6265, "step": 1900 }, { "epoch": 0.05364511691884457, "grad_norm": 0.1229841411113739, "learning_rate": 0.0001, "loss": 1.6264, "step": 1950 }, { "epoch": 0.055020632737276476, "grad_norm": 0.09015832841396332, "learning_rate": 0.0001, "loss": 1.6248, "step": 2000 }, { "epoch": 0.05639614855570839, "grad_norm": 0.10285497456789017, "learning_rate": 0.0001, "loss": 1.6237, "step": 2050 }, { "epoch": 0.0577716643741403, "grad_norm": 0.07973627001047134, "learning_rate": 0.0001, "loss": 1.6262, "step": 2100 }, { "epoch": 0.059147180192572216, "grad_norm": 0.1072544977068901, "learning_rate": 0.0001, "loss": 1.6246, "step": 2150 }, { "epoch": 0.06052269601100413, "grad_norm": 0.11573298275470734, "learning_rate": 0.0001, "loss": 1.623, "step": 2200 }, { "epoch": 0.061898211829436035, "grad_norm": 0.1113864928483963, "learning_rate": 0.0001, "loss": 1.6189, "step": 2250 }, { "epoch": 0.06327372764786796, "grad_norm": 0.09252315014600754, "learning_rate": 0.0001, "loss": 1.6192, "step": 2300 }, { "epoch": 0.06464924346629987, "grad_norm": 0.09697891771793365, "learning_rate": 0.0001, "loss": 1.6191, "step": 2350 }, { "epoch": 0.06602475928473177, "grad_norm": 0.09384047985076904, "learning_rate": 0.0001, "loss": 1.6165, "step": 2400 }, { "epoch": 0.06740027510316368, "grad_norm": 0.10533461719751358, "learning_rate": 0.0001, "loss": 1.6202, "step": 2450 }, { "epoch": 0.0687757909215956, "grad_norm": 0.08703196048736572, "learning_rate": 0.0001, "loss": 1.618, "step": 2500 }, { "epoch": 0.07015130674002751, "grad_norm": 0.09502206742763519, "learning_rate": 0.0001, "loss": 1.6177, "step": 2550 }, { "epoch": 0.07152682255845942, "grad_norm": 0.09674184769392014, "learning_rate": 0.0001, "loss": 1.6143, "step": 2600 }, { "epoch": 0.07290233837689133, "grad_norm": 0.12614910304546356, "learning_rate": 0.0001, "loss": 1.6125, "step": 2650 }, { "epoch": 0.07427785419532325, "grad_norm": 0.10198106616735458, "learning_rate": 0.0001, "loss": 1.6158, "step": 2700 }, { "epoch": 0.07565337001375516, "grad_norm": 0.09061957895755768, "learning_rate": 0.0001, "loss": 1.6124, "step": 2750 }, { "epoch": 0.07702888583218707, "grad_norm": 0.08632820844650269, "learning_rate": 0.0001, "loss": 1.6113, "step": 2800 }, { "epoch": 0.07840440165061899, "grad_norm": 0.10429545491933823, "learning_rate": 0.0001, "loss": 1.6105, "step": 2850 }, { "epoch": 0.07977991746905089, "grad_norm": 0.104611337184906, "learning_rate": 0.0001, "loss": 1.6105, "step": 2900 }, { "epoch": 0.0811554332874828, "grad_norm": 0.11391541361808777, "learning_rate": 0.0001, "loss": 1.6078, "step": 2950 }, { "epoch": 0.08253094910591471, "grad_norm": 0.1170964241027832, "learning_rate": 0.0001, "loss": 1.6101, "step": 3000 }, { "epoch": 0.08390646492434663, "grad_norm": 0.10005070269107819, "learning_rate": 0.0001, "loss": 1.6096, "step": 3050 }, { "epoch": 0.08528198074277854, "grad_norm": 0.13063783943653107, "learning_rate": 0.0001, "loss": 1.6094, "step": 3100 }, { "epoch": 0.08665749656121045, "grad_norm": 0.10203906893730164, "learning_rate": 0.0001, "loss": 1.609, "step": 3150 }, { "epoch": 0.08803301237964237, "grad_norm": 0.11838550120592117, "learning_rate": 0.0001, "loss": 1.6068, "step": 3200 }, { "epoch": 0.08940852819807428, "grad_norm": 0.16624979674816132, "learning_rate": 0.0001, "loss": 1.6035, "step": 3250 }, { "epoch": 0.0907840440165062, "grad_norm": 0.11730783432722092, "learning_rate": 0.0001, "loss": 1.6074, "step": 3300 }, { "epoch": 0.0921595598349381, "grad_norm": 0.10523674637079239, "learning_rate": 0.0001, "loss": 1.6051, "step": 3350 }, { "epoch": 0.09353507565337002, "grad_norm": 0.10546988248825073, "learning_rate": 0.0001, "loss": 1.604, "step": 3400 }, { "epoch": 0.09491059147180192, "grad_norm": 0.13425269722938538, "learning_rate": 0.0001, "loss": 1.6044, "step": 3450 }, { "epoch": 0.09628610729023383, "grad_norm": 0.12492198497056961, "learning_rate": 0.0001, "loss": 1.6052, "step": 3500 }, { "epoch": 0.09766162310866575, "grad_norm": 0.09005106985569, "learning_rate": 0.0001, "loss": 1.603, "step": 3550 }, { "epoch": 0.09903713892709766, "grad_norm": 0.11914248019456863, "learning_rate": 0.0001, "loss": 1.6027, "step": 3600 }, { "epoch": 0.10041265474552957, "grad_norm": 0.12221172451972961, "learning_rate": 0.0001, "loss": 1.605, "step": 3650 }, { "epoch": 0.10178817056396149, "grad_norm": 0.13399210572242737, "learning_rate": 0.0001, "loss": 1.6039, "step": 3700 }, { "epoch": 0.1031636863823934, "grad_norm": 0.11565663665533066, "learning_rate": 0.0001, "loss": 1.6008, "step": 3750 }, { "epoch": 0.10453920220082531, "grad_norm": 0.12839622795581818, "learning_rate": 0.0001, "loss": 1.6004, "step": 3800 }, { "epoch": 0.10591471801925723, "grad_norm": 0.11184845864772797, "learning_rate": 0.0001, "loss": 1.5975, "step": 3850 }, { "epoch": 0.10729023383768914, "grad_norm": 0.11628763377666473, "learning_rate": 0.0001, "loss": 1.601, "step": 3900 }, { "epoch": 0.10866574965612105, "grad_norm": 0.11737735569477081, "learning_rate": 0.0001, "loss": 1.6011, "step": 3950 }, { "epoch": 0.11004126547455295, "grad_norm": 0.10090334713459015, "learning_rate": 0.0001, "loss": 1.5981, "step": 4000 }, { "epoch": 0.11141678129298486, "grad_norm": 0.11729908734560013, "learning_rate": 0.0001, "loss": 1.5972, "step": 4050 }, { "epoch": 0.11279229711141678, "grad_norm": 0.10134877264499664, "learning_rate": 0.0001, "loss": 1.5974, "step": 4100 }, { "epoch": 0.11416781292984869, "grad_norm": 0.150742307305336, "learning_rate": 0.0001, "loss": 1.5979, "step": 4150 }, { "epoch": 0.1155433287482806, "grad_norm": 0.1354828178882599, "learning_rate": 0.0001, "loss": 1.594, "step": 4200 }, { "epoch": 0.11691884456671252, "grad_norm": 0.10246012359857559, "learning_rate": 0.0001, "loss": 1.5944, "step": 4250 }, { "epoch": 0.11829436038514443, "grad_norm": 0.10707879811525345, "learning_rate": 0.0001, "loss": 1.5975, "step": 4300 }, { "epoch": 0.11966987620357634, "grad_norm": 0.09582670778036118, "learning_rate": 0.0001, "loss": 1.5931, "step": 4350 }, { "epoch": 0.12104539202200826, "grad_norm": 0.11471503973007202, "learning_rate": 0.0001, "loss": 1.5957, "step": 4400 }, { "epoch": 0.12242090784044017, "grad_norm": 0.14393934607505798, "learning_rate": 0.0001, "loss": 1.5947, "step": 4450 }, { "epoch": 0.12379642365887207, "grad_norm": 0.1267063319683075, "learning_rate": 0.0001, "loss": 1.5928, "step": 4500 }, { "epoch": 0.12517193947730398, "grad_norm": 0.10451563447713852, "learning_rate": 0.0001, "loss": 1.5944, "step": 4550 }, { "epoch": 0.1265474552957359, "grad_norm": 0.13244299590587616, "learning_rate": 0.0001, "loss": 1.5935, "step": 4600 }, { "epoch": 0.1279229711141678, "grad_norm": 0.14042487740516663, "learning_rate": 0.0001, "loss": 1.5929, "step": 4650 }, { "epoch": 0.12929848693259974, "grad_norm": 0.12199941277503967, "learning_rate": 0.0001, "loss": 1.5933, "step": 4700 }, { "epoch": 0.13067400275103164, "grad_norm": 0.13133960962295532, "learning_rate": 0.0001, "loss": 1.5904, "step": 4750 }, { "epoch": 0.13204951856946354, "grad_norm": 0.12281449884176254, "learning_rate": 0.0001, "loss": 1.5909, "step": 4800 }, { "epoch": 0.13342503438789546, "grad_norm": 0.1380591243505478, "learning_rate": 0.0001, "loss": 1.5899, "step": 4850 }, { "epoch": 0.13480055020632736, "grad_norm": 0.13320781290531158, "learning_rate": 0.0001, "loss": 1.5924, "step": 4900 }, { "epoch": 0.1361760660247593, "grad_norm": 0.10719151794910431, "learning_rate": 0.0001, "loss": 1.5909, "step": 4950 }, { "epoch": 0.1375515818431912, "grad_norm": 0.17885592579841614, "learning_rate": 0.0001, "loss": 1.591, "step": 5000 }, { "epoch": 0.13892709766162312, "grad_norm": 0.15455111861228943, "learning_rate": 0.0001, "loss": 1.587, "step": 5050 }, { "epoch": 0.14030261348005502, "grad_norm": 0.12887494266033173, "learning_rate": 0.0001, "loss": 1.5886, "step": 5100 }, { "epoch": 0.14167812929848694, "grad_norm": 0.13535436987876892, "learning_rate": 0.0001, "loss": 1.5901, "step": 5150 }, { "epoch": 0.14305364511691884, "grad_norm": 0.12412004172801971, "learning_rate": 0.0001, "loss": 1.5884, "step": 5200 }, { "epoch": 0.14442916093535077, "grad_norm": 0.1510736644268036, "learning_rate": 0.0001, "loss": 1.5879, "step": 5250 }, { "epoch": 0.14580467675378267, "grad_norm": 0.128033846616745, "learning_rate": 0.0001, "loss": 1.5868, "step": 5300 }, { "epoch": 0.14718019257221457, "grad_norm": 0.11286512017250061, "learning_rate": 0.0001, "loss": 1.5859, "step": 5350 }, { "epoch": 0.1485557083906465, "grad_norm": 0.11637207865715027, "learning_rate": 0.0001, "loss": 1.5837, "step": 5400 }, { "epoch": 0.1499312242090784, "grad_norm": 0.13789626955986023, "learning_rate": 0.0001, "loss": 1.5894, "step": 5450 }, { "epoch": 0.15130674002751032, "grad_norm": 0.12487693876028061, "learning_rate": 0.0001, "loss": 1.5851, "step": 5500 }, { "epoch": 0.15268225584594222, "grad_norm": 0.14437325298786163, "learning_rate": 0.0001, "loss": 1.5879, "step": 5550 }, { "epoch": 0.15405777166437415, "grad_norm": 0.10904733836650848, "learning_rate": 0.0001, "loss": 1.5838, "step": 5600 }, { "epoch": 0.15543328748280605, "grad_norm": 0.10461211949586868, "learning_rate": 0.0001, "loss": 1.5833, "step": 5650 }, { "epoch": 0.15680880330123798, "grad_norm": 0.1489093005657196, "learning_rate": 0.0001, "loss": 1.5823, "step": 5700 }, { "epoch": 0.15818431911966988, "grad_norm": 0.15630511939525604, "learning_rate": 0.0001, "loss": 1.5844, "step": 5750 }, { "epoch": 0.15955983493810177, "grad_norm": 0.15836940705776215, "learning_rate": 0.0001, "loss": 1.584, "step": 5800 }, { "epoch": 0.1609353507565337, "grad_norm": 0.12032505124807358, "learning_rate": 0.0001, "loss": 1.5848, "step": 5850 }, { "epoch": 0.1623108665749656, "grad_norm": 0.15543417632579803, "learning_rate": 0.0001, "loss": 1.5843, "step": 5900 }, { "epoch": 0.16368638239339753, "grad_norm": 0.11939691752195358, "learning_rate": 0.0001, "loss": 1.5818, "step": 5950 }, { "epoch": 0.16506189821182943, "grad_norm": 0.13943925499916077, "learning_rate": 0.0001, "loss": 1.5821, "step": 6000 }, { "epoch": 0.16643741403026135, "grad_norm": 0.1273224651813507, "learning_rate": 0.0001, "loss": 1.5807, "step": 6050 }, { "epoch": 0.16781292984869325, "grad_norm": 0.1731129139661789, "learning_rate": 0.0001, "loss": 1.5828, "step": 6100 }, { "epoch": 0.16918844566712518, "grad_norm": 0.11023139208555222, "learning_rate": 0.0001, "loss": 1.5806, "step": 6150 }, { "epoch": 0.17056396148555708, "grad_norm": 0.15180650353431702, "learning_rate": 0.0001, "loss": 1.5805, "step": 6200 }, { "epoch": 0.171939477303989, "grad_norm": 0.1235494539141655, "learning_rate": 0.0001, "loss": 1.5811, "step": 6250 }, { "epoch": 0.1733149931224209, "grad_norm": 0.12696652114391327, "learning_rate": 0.0001, "loss": 1.58, "step": 6300 }, { "epoch": 0.1746905089408528, "grad_norm": 0.1397417187690735, "learning_rate": 0.0001, "loss": 1.5806, "step": 6350 }, { "epoch": 0.17606602475928473, "grad_norm": 0.15651826560497284, "learning_rate": 0.0001, "loss": 1.5774, "step": 6400 }, { "epoch": 0.17744154057771663, "grad_norm": 0.10367725789546967, "learning_rate": 0.0001, "loss": 1.5793, "step": 6450 }, { "epoch": 0.17881705639614856, "grad_norm": 0.15408000349998474, "learning_rate": 0.0001, "loss": 1.5791, "step": 6500 }, { "epoch": 0.18019257221458046, "grad_norm": 0.10724977403879166, "learning_rate": 0.0001, "loss": 1.5799, "step": 6550 }, { "epoch": 0.1815680880330124, "grad_norm": 0.14652323722839355, "learning_rate": 0.0001, "loss": 1.5784, "step": 6600 }, { "epoch": 0.1829436038514443, "grad_norm": 0.11810048669576645, "learning_rate": 0.0001, "loss": 1.5783, "step": 6650 }, { "epoch": 0.1843191196698762, "grad_norm": 0.1892373412847519, "learning_rate": 0.0001, "loss": 1.5811, "step": 6700 }, { "epoch": 0.1856946354883081, "grad_norm": 0.1516016721725464, "learning_rate": 0.0001, "loss": 1.5781, "step": 6750 }, { "epoch": 0.18707015130674004, "grad_norm": 0.14342574775218964, "learning_rate": 0.0001, "loss": 1.5759, "step": 6800 }, { "epoch": 0.18844566712517194, "grad_norm": 0.1327650249004364, "learning_rate": 0.0001, "loss": 1.5779, "step": 6850 }, { "epoch": 0.18982118294360384, "grad_norm": 0.137595072388649, "learning_rate": 0.0001, "loss": 1.5761, "step": 6900 }, { "epoch": 0.19119669876203577, "grad_norm": 0.1387586146593094, "learning_rate": 0.0001, "loss": 1.5768, "step": 6950 }, { "epoch": 0.19257221458046767, "grad_norm": 0.1557263284921646, "learning_rate": 0.0001, "loss": 1.5775, "step": 7000 }, { "epoch": 0.1939477303988996, "grad_norm": 0.14735980331897736, "learning_rate": 0.0001, "loss": 1.5771, "step": 7050 }, { "epoch": 0.1953232462173315, "grad_norm": 0.18839861452579498, "learning_rate": 0.0001, "loss": 1.5748, "step": 7100 }, { "epoch": 0.19669876203576342, "grad_norm": 0.17223089933395386, "learning_rate": 0.0001, "loss": 1.5795, "step": 7150 }, { "epoch": 0.19807427785419532, "grad_norm": 0.11284028738737106, "learning_rate": 0.0001, "loss": 1.5745, "step": 7200 }, { "epoch": 0.19944979367262725, "grad_norm": 0.16285105049610138, "learning_rate": 0.0001, "loss": 1.5763, "step": 7250 }, { "epoch": 0.20082530949105915, "grad_norm": 0.15286004543304443, "learning_rate": 0.0001, "loss": 1.5734, "step": 7300 }, { "epoch": 0.20220082530949107, "grad_norm": 0.15827025473117828, "learning_rate": 0.0001, "loss": 1.5736, "step": 7350 }, { "epoch": 0.20357634112792297, "grad_norm": 0.13479341566562653, "learning_rate": 0.0001, "loss": 1.5755, "step": 7400 }, { "epoch": 0.20495185694635487, "grad_norm": 0.11652766913175583, "learning_rate": 0.0001, "loss": 1.5745, "step": 7450 }, { "epoch": 0.2063273727647868, "grad_norm": 0.1466943770647049, "learning_rate": 0.0001, "loss": 1.5748, "step": 7500 }, { "epoch": 0.2077028885832187, "grad_norm": 0.16038121283054352, "learning_rate": 0.0001, "loss": 1.572, "step": 7550 }, { "epoch": 0.20907840440165062, "grad_norm": 0.1869979202747345, "learning_rate": 0.0001, "loss": 1.5762, "step": 7600 }, { "epoch": 0.21045392022008252, "grad_norm": 0.14036841690540314, "learning_rate": 0.0001, "loss": 1.5754, "step": 7650 }, { "epoch": 0.21182943603851445, "grad_norm": 0.18491779267787933, "learning_rate": 0.0001, "loss": 1.5757, "step": 7700 }, { "epoch": 0.21320495185694635, "grad_norm": 0.13815288245677948, "learning_rate": 0.0001, "loss": 1.5754, "step": 7750 }, { "epoch": 0.21458046767537828, "grad_norm": 0.13334764540195465, "learning_rate": 0.0001, "loss": 1.5706, "step": 7800 }, { "epoch": 0.21595598349381018, "grad_norm": 0.15366512537002563, "learning_rate": 0.0001, "loss": 1.5731, "step": 7850 }, { "epoch": 0.2173314993122421, "grad_norm": 0.16366422176361084, "learning_rate": 0.0001, "loss": 1.5715, "step": 7900 }, { "epoch": 0.218707015130674, "grad_norm": 0.14637479186058044, "learning_rate": 0.0001, "loss": 1.5715, "step": 7950 }, { "epoch": 0.2200825309491059, "grad_norm": 0.1257038414478302, "learning_rate": 0.0001, "loss": 1.5712, "step": 8000 }, { "epoch": 0.22145804676753783, "grad_norm": 0.13014163076877594, "learning_rate": 0.0001, "loss": 1.5711, "step": 8050 }, { "epoch": 0.22283356258596973, "grad_norm": 0.13101409375667572, "learning_rate": 0.0001, "loss": 1.5734, "step": 8100 }, { "epoch": 0.22420907840440166, "grad_norm": 0.1509891152381897, "learning_rate": 0.0001, "loss": 1.5698, "step": 8150 }, { "epoch": 0.22558459422283356, "grad_norm": 0.16276001930236816, "learning_rate": 0.0001, "loss": 1.5714, "step": 8200 }, { "epoch": 0.22696011004126548, "grad_norm": 0.16040217876434326, "learning_rate": 0.0001, "loss": 1.5701, "step": 8250 }, { "epoch": 0.22833562585969738, "grad_norm": 0.160230815410614, "learning_rate": 0.0001, "loss": 1.5705, "step": 8300 }, { "epoch": 0.2297111416781293, "grad_norm": 0.18454241752624512, "learning_rate": 0.0001, "loss": 1.571, "step": 8350 }, { "epoch": 0.2310866574965612, "grad_norm": 0.17411856353282928, "learning_rate": 0.0001, "loss": 1.5679, "step": 8400 }, { "epoch": 0.2324621733149931, "grad_norm": 0.16710075736045837, "learning_rate": 0.0001, "loss": 1.5674, "step": 8450 }, { "epoch": 0.23383768913342504, "grad_norm": 0.12378160655498505, "learning_rate": 0.0001, "loss": 1.5671, "step": 8500 }, { "epoch": 0.23521320495185694, "grad_norm": 0.11550536751747131, "learning_rate": 0.0001, "loss": 1.5698, "step": 8550 }, { "epoch": 0.23658872077028886, "grad_norm": 0.17768432199954987, "learning_rate": 0.0001, "loss": 1.5699, "step": 8600 }, { "epoch": 0.23796423658872076, "grad_norm": 0.15126097202301025, "learning_rate": 0.0001, "loss": 1.5694, "step": 8650 }, { "epoch": 0.2393397524071527, "grad_norm": 0.1827315390110016, "learning_rate": 0.0001, "loss": 1.5671, "step": 8700 }, { "epoch": 0.2407152682255846, "grad_norm": 0.11432069540023804, "learning_rate": 0.0001, "loss": 1.5685, "step": 8750 }, { "epoch": 0.24209078404401652, "grad_norm": 0.14279188215732574, "learning_rate": 0.0001, "loss": 1.5677, "step": 8800 }, { "epoch": 0.24346629986244842, "grad_norm": 0.13771188259124756, "learning_rate": 0.0001, "loss": 1.5667, "step": 8850 }, { "epoch": 0.24484181568088034, "grad_norm": 0.12438327074050903, "learning_rate": 0.0001, "loss": 1.5649, "step": 8900 }, { "epoch": 0.24621733149931224, "grad_norm": 0.146587535738945, "learning_rate": 0.0001, "loss": 1.5689, "step": 8950 }, { "epoch": 0.24759284731774414, "grad_norm": 0.13684628903865814, "learning_rate": 0.0001, "loss": 1.5662, "step": 9000 }, { "epoch": 0.24896836313617607, "grad_norm": 0.1465720385313034, "learning_rate": 0.0001, "loss": 1.5666, "step": 9050 }, { "epoch": 0.25034387895460797, "grad_norm": 0.1553189605474472, "learning_rate": 0.0001, "loss": 1.5647, "step": 9100 }, { "epoch": 0.2517193947730399, "grad_norm": 0.12973164021968842, "learning_rate": 0.0001, "loss": 1.5647, "step": 9150 }, { "epoch": 0.2530949105914718, "grad_norm": 0.17071610689163208, "learning_rate": 0.0001, "loss": 1.5691, "step": 9200 }, { "epoch": 0.2544704264099037, "grad_norm": 0.1424863487482071, "learning_rate": 0.0001, "loss": 1.5654, "step": 9250 }, { "epoch": 0.2558459422283356, "grad_norm": 0.13117440044879913, "learning_rate": 0.0001, "loss": 1.5668, "step": 9300 }, { "epoch": 0.25722145804676755, "grad_norm": 0.14353643357753754, "learning_rate": 0.0001, "loss": 1.567, "step": 9350 }, { "epoch": 0.2585969738651995, "grad_norm": 0.18137438595294952, "learning_rate": 0.0001, "loss": 1.5648, "step": 9400 }, { "epoch": 0.25997248968363135, "grad_norm": 0.1453561782836914, "learning_rate": 0.0001, "loss": 1.5631, "step": 9450 }, { "epoch": 0.2613480055020633, "grad_norm": 0.13514567911624908, "learning_rate": 0.0001, "loss": 1.5633, "step": 9500 }, { "epoch": 0.2627235213204952, "grad_norm": 0.20019495487213135, "learning_rate": 0.0001, "loss": 1.5655, "step": 9550 }, { "epoch": 0.2640990371389271, "grad_norm": 0.18167296051979065, "learning_rate": 0.0001, "loss": 1.5634, "step": 9600 }, { "epoch": 0.265474552957359, "grad_norm": 0.1335984319448471, "learning_rate": 0.0001, "loss": 1.5609, "step": 9650 }, { "epoch": 0.2668500687757909, "grad_norm": 0.12064065039157867, "learning_rate": 0.0001, "loss": 1.5619, "step": 9700 }, { "epoch": 0.26822558459422285, "grad_norm": 0.16066288948059082, "learning_rate": 0.0001, "loss": 1.5639, "step": 9750 }, { "epoch": 0.2696011004126547, "grad_norm": 0.18084204196929932, "learning_rate": 0.0001, "loss": 1.5597, "step": 9800 }, { "epoch": 0.27097661623108665, "grad_norm": 0.14845338463783264, "learning_rate": 0.0001, "loss": 1.5626, "step": 9850 }, { "epoch": 0.2723521320495186, "grad_norm": 0.13293515145778656, "learning_rate": 0.0001, "loss": 1.5648, "step": 9900 }, { "epoch": 0.2737276478679505, "grad_norm": 0.14939668774604797, "learning_rate": 0.0001, "loss": 1.5612, "step": 9950 }, { "epoch": 0.2751031636863824, "grad_norm": 0.1553388386964798, "learning_rate": 0.0001, "loss": 1.5629, "step": 10000 }, { "epoch": 0.2764786795048143, "grad_norm": 0.22416375577449799, "learning_rate": 0.0001, "loss": 1.5621, "step": 10050 }, { "epoch": 0.27785419532324623, "grad_norm": 0.2197302132844925, "learning_rate": 0.0001, "loss": 1.5635, "step": 10100 }, { "epoch": 0.2792297111416781, "grad_norm": 0.17688524723052979, "learning_rate": 0.0001, "loss": 1.5616, "step": 10150 }, { "epoch": 0.28060522696011003, "grad_norm": 0.1495491862297058, "learning_rate": 0.0001, "loss": 1.5614, "step": 10200 }, { "epoch": 0.28198074277854196, "grad_norm": 0.15716291964054108, "learning_rate": 0.0001, "loss": 1.5592, "step": 10250 }, { "epoch": 0.2833562585969739, "grad_norm": 0.14116239547729492, "learning_rate": 0.0001, "loss": 1.5586, "step": 10300 }, { "epoch": 0.28473177441540576, "grad_norm": 0.11010037362575531, "learning_rate": 0.0001, "loss": 1.5603, "step": 10350 }, { "epoch": 0.2861072902338377, "grad_norm": 0.1838681697845459, "learning_rate": 0.0001, "loss": 1.561, "step": 10400 }, { "epoch": 0.2874828060522696, "grad_norm": 0.19001850485801697, "learning_rate": 0.0001, "loss": 1.5588, "step": 10450 }, { "epoch": 0.28885832187070154, "grad_norm": 0.20800583064556122, "learning_rate": 0.0001, "loss": 1.5607, "step": 10500 }, { "epoch": 0.2902338376891334, "grad_norm": 0.17948520183563232, "learning_rate": 0.0001, "loss": 1.56, "step": 10550 }, { "epoch": 0.29160935350756534, "grad_norm": 0.16178689897060394, "learning_rate": 0.0001, "loss": 1.5603, "step": 10600 }, { "epoch": 0.29298486932599727, "grad_norm": 0.1580880880355835, "learning_rate": 0.0001, "loss": 1.5606, "step": 10650 }, { "epoch": 0.29436038514442914, "grad_norm": 0.14434567093849182, "learning_rate": 0.0001, "loss": 1.5618, "step": 10700 }, { "epoch": 0.29573590096286106, "grad_norm": 0.17610964179039001, "learning_rate": 0.0001, "loss": 1.5613, "step": 10750 }, { "epoch": 0.297111416781293, "grad_norm": 0.15156705677509308, "learning_rate": 0.0001, "loss": 1.5563, "step": 10800 }, { "epoch": 0.2984869325997249, "grad_norm": 0.1466618925333023, "learning_rate": 0.0001, "loss": 1.5616, "step": 10850 }, { "epoch": 0.2998624484181568, "grad_norm": 0.1162666529417038, "learning_rate": 0.0001, "loss": 1.559, "step": 10900 }, { "epoch": 0.3012379642365887, "grad_norm": 0.15534426271915436, "learning_rate": 0.0001, "loss": 1.5594, "step": 10950 }, { "epoch": 0.30261348005502064, "grad_norm": 0.15940657258033752, "learning_rate": 0.0001, "loss": 1.5613, "step": 11000 }, { "epoch": 0.30398899587345257, "grad_norm": 0.1757323294878006, "learning_rate": 0.0001, "loss": 1.5588, "step": 11050 }, { "epoch": 0.30536451169188444, "grad_norm": 0.11815246194601059, "learning_rate": 0.0001, "loss": 1.5589, "step": 11100 }, { "epoch": 0.30674002751031637, "grad_norm": 0.2773960828781128, "learning_rate": 0.0001, "loss": 1.5584, "step": 11150 }, { "epoch": 0.3081155433287483, "grad_norm": 0.12601600587368011, "learning_rate": 0.0001, "loss": 1.5572, "step": 11200 }, { "epoch": 0.30949105914718017, "grad_norm": 0.1593768298625946, "learning_rate": 0.0001, "loss": 1.5575, "step": 11250 }, { "epoch": 0.3108665749656121, "grad_norm": 0.149438738822937, "learning_rate": 0.0001, "loss": 1.557, "step": 11300 }, { "epoch": 0.312242090784044, "grad_norm": 0.11111125349998474, "learning_rate": 0.0001, "loss": 1.5587, "step": 11350 }, { "epoch": 0.31361760660247595, "grad_norm": 0.1610383540391922, "learning_rate": 0.0001, "loss": 1.5572, "step": 11400 }, { "epoch": 0.3149931224209078, "grad_norm": 0.17420324683189392, "learning_rate": 0.0001, "loss": 1.5581, "step": 11450 }, { "epoch": 0.31636863823933975, "grad_norm": 0.16623131930828094, "learning_rate": 0.0001, "loss": 1.5561, "step": 11500 }, { "epoch": 0.3177441540577717, "grad_norm": 0.15828974545001984, "learning_rate": 0.0001, "loss": 1.5544, "step": 11550 }, { "epoch": 0.31911966987620355, "grad_norm": 0.15183350443840027, "learning_rate": 0.0001, "loss": 1.5555, "step": 11600 }, { "epoch": 0.3204951856946355, "grad_norm": 0.16378933191299438, "learning_rate": 0.0001, "loss": 1.5532, "step": 11650 }, { "epoch": 0.3218707015130674, "grad_norm": 0.15861773490905762, "learning_rate": 0.0001, "loss": 1.5568, "step": 11700 }, { "epoch": 0.32324621733149933, "grad_norm": 0.13385528326034546, "learning_rate": 0.0001, "loss": 1.5568, "step": 11750 }, { "epoch": 0.3246217331499312, "grad_norm": 0.16392391920089722, "learning_rate": 0.0001, "loss": 1.5548, "step": 11800 }, { "epoch": 0.32599724896836313, "grad_norm": 0.14662721753120422, "learning_rate": 0.0001, "loss": 1.5539, "step": 11850 }, { "epoch": 0.32737276478679506, "grad_norm": 0.13727930188179016, "learning_rate": 0.0001, "loss": 1.5552, "step": 11900 }, { "epoch": 0.328748280605227, "grad_norm": 0.15576840937137604, "learning_rate": 0.0001, "loss": 1.5552, "step": 11950 }, { "epoch": 0.33012379642365886, "grad_norm": 0.1717185378074646, "learning_rate": 0.0001, "loss": 1.5538, "step": 12000 }, { "epoch": 0.3314993122420908, "grad_norm": 0.16970685124397278, "learning_rate": 0.0001, "loss": 1.5556, "step": 12050 }, { "epoch": 0.3328748280605227, "grad_norm": 0.1489485800266266, "learning_rate": 0.0001, "loss": 1.5527, "step": 12100 }, { "epoch": 0.3342503438789546, "grad_norm": 0.1374077945947647, "learning_rate": 0.0001, "loss": 1.5528, "step": 12150 }, { "epoch": 0.3356258596973865, "grad_norm": 0.19402620196342468, "learning_rate": 0.0001, "loss": 1.5547, "step": 12200 }, { "epoch": 0.33700137551581844, "grad_norm": 0.1642199009656906, "learning_rate": 0.0001, "loss": 1.5538, "step": 12250 }, { "epoch": 0.33837689133425036, "grad_norm": 0.13107603788375854, "learning_rate": 0.0001, "loss": 1.5547, "step": 12300 }, { "epoch": 0.33975240715268223, "grad_norm": 0.1858353465795517, "learning_rate": 0.0001, "loss": 1.5526, "step": 12350 }, { "epoch": 0.34112792297111416, "grad_norm": 0.1422649323940277, "learning_rate": 0.0001, "loss": 1.5523, "step": 12400 }, { "epoch": 0.3425034387895461, "grad_norm": 0.16968269646167755, "learning_rate": 0.0001, "loss": 1.554, "step": 12450 }, { "epoch": 0.343878954607978, "grad_norm": 0.1434723138809204, "learning_rate": 0.0001, "loss": 1.5544, "step": 12500 }, { "epoch": 0.3452544704264099, "grad_norm": 0.18616297841072083, "learning_rate": 0.0001, "loss": 1.5506, "step": 12550 }, { "epoch": 0.3466299862448418, "grad_norm": 0.16946491599082947, "learning_rate": 0.0001, "loss": 1.5524, "step": 12600 }, { "epoch": 0.34800550206327374, "grad_norm": 0.17658023536205292, "learning_rate": 0.0001, "loss": 1.5536, "step": 12650 }, { "epoch": 0.3493810178817056, "grad_norm": 0.15203554928302765, "learning_rate": 0.0001, "loss": 1.5507, "step": 12700 }, { "epoch": 0.35075653370013754, "grad_norm": 0.13097505271434784, "learning_rate": 0.0001, "loss": 1.5542, "step": 12750 }, { "epoch": 0.35213204951856947, "grad_norm": 0.14317452907562256, "learning_rate": 0.0001, "loss": 1.5534, "step": 12800 }, { "epoch": 0.3535075653370014, "grad_norm": 0.12445474416017532, "learning_rate": 0.0001, "loss": 1.5535, "step": 12850 }, { "epoch": 0.35488308115543327, "grad_norm": 0.1327485293149948, "learning_rate": 0.0001, "loss": 1.5521, "step": 12900 }, { "epoch": 0.3562585969738652, "grad_norm": 0.15487389266490936, "learning_rate": 0.0001, "loss": 1.553, "step": 12950 }, { "epoch": 0.3576341127922971, "grad_norm": 0.23483023047447205, "learning_rate": 0.0001, "loss": 1.5502, "step": 13000 }, { "epoch": 0.35900962861072905, "grad_norm": 0.14994105696678162, "learning_rate": 0.0001, "loss": 1.5518, "step": 13050 }, { "epoch": 0.3603851444291609, "grad_norm": 0.12222074717283249, "learning_rate": 0.0001, "loss": 1.5508, "step": 13100 }, { "epoch": 0.36176066024759285, "grad_norm": 0.1246858537197113, "learning_rate": 0.0001, "loss": 1.552, "step": 13150 }, { "epoch": 0.3631361760660248, "grad_norm": 0.15825419127941132, "learning_rate": 0.0001, "loss": 1.5499, "step": 13200 }, { "epoch": 0.36451169188445665, "grad_norm": 0.17960667610168457, "learning_rate": 0.0001, "loss": 1.5551, "step": 13250 }, { "epoch": 0.3658872077028886, "grad_norm": 0.1628105491399765, "learning_rate": 0.0001, "loss": 1.5544, "step": 13300 }, { "epoch": 0.3672627235213205, "grad_norm": 0.15981099009513855, "learning_rate": 0.0001, "loss": 1.5527, "step": 13350 }, { "epoch": 0.3686382393397524, "grad_norm": 0.11882206797599792, "learning_rate": 0.0001, "loss": 1.5505, "step": 13400 }, { "epoch": 0.3700137551581843, "grad_norm": 0.1369376927614212, "learning_rate": 0.0001, "loss": 1.5487, "step": 13450 }, { "epoch": 0.3713892709766162, "grad_norm": 0.1341916173696518, "learning_rate": 0.0001, "loss": 1.5489, "step": 13500 }, { "epoch": 0.37276478679504815, "grad_norm": 0.1692420095205307, "learning_rate": 0.0001, "loss": 1.5486, "step": 13550 }, { "epoch": 0.3741403026134801, "grad_norm": 0.12764231860637665, "learning_rate": 0.0001, "loss": 1.5479, "step": 13600 }, { "epoch": 0.37551581843191195, "grad_norm": 0.1610202044248581, "learning_rate": 0.0001, "loss": 1.5493, "step": 13650 }, { "epoch": 0.3768913342503439, "grad_norm": 0.20008735358715057, "learning_rate": 0.0001, "loss": 1.5504, "step": 13700 }, { "epoch": 0.3782668500687758, "grad_norm": 0.14668354392051697, "learning_rate": 0.0001, "loss": 1.5459, "step": 13750 }, { "epoch": 0.3796423658872077, "grad_norm": 0.16147159039974213, "learning_rate": 0.0001, "loss": 1.5497, "step": 13800 }, { "epoch": 0.3810178817056396, "grad_norm": 0.2127738893032074, "learning_rate": 0.0001, "loss": 1.5496, "step": 13850 }, { "epoch": 0.38239339752407153, "grad_norm": 0.14936117827892303, "learning_rate": 0.0001, "loss": 1.5487, "step": 13900 }, { "epoch": 0.38376891334250346, "grad_norm": 0.1460547298192978, "learning_rate": 0.0001, "loss": 1.5513, "step": 13950 }, { "epoch": 0.38514442916093533, "grad_norm": 0.1418396234512329, "learning_rate": 0.0001, "loss": 1.5489, "step": 14000 }, { "epoch": 0.38651994497936726, "grad_norm": 0.12608648836612701, "learning_rate": 0.0001, "loss": 1.5478, "step": 14050 }, { "epoch": 0.3878954607977992, "grad_norm": 0.12352428585290909, "learning_rate": 0.0001, "loss": 1.5472, "step": 14100 }, { "epoch": 0.3892709766162311, "grad_norm": 0.140400692820549, "learning_rate": 0.0001, "loss": 1.5471, "step": 14150 }, { "epoch": 0.390646492434663, "grad_norm": 0.14015322923660278, "learning_rate": 0.0001, "loss": 1.5495, "step": 14200 }, { "epoch": 0.3920220082530949, "grad_norm": 0.13664819300174713, "learning_rate": 0.0001, "loss": 1.5515, "step": 14250 }, { "epoch": 0.39339752407152684, "grad_norm": 0.19558057188987732, "learning_rate": 0.0001, "loss": 1.5493, "step": 14300 }, { "epoch": 0.3947730398899587, "grad_norm": 0.14744845032691956, "learning_rate": 0.0001, "loss": 1.547, "step": 14350 }, { "epoch": 0.39614855570839064, "grad_norm": 0.13610410690307617, "learning_rate": 0.0001, "loss": 1.5499, "step": 14400 }, { "epoch": 0.39752407152682256, "grad_norm": 0.16850556433200836, "learning_rate": 0.0001, "loss": 1.5475, "step": 14450 }, { "epoch": 0.3988995873452545, "grad_norm": 0.11494544893503189, "learning_rate": 0.0001, "loss": 1.5441, "step": 14500 }, { "epoch": 0.40027510316368636, "grad_norm": 0.1311003863811493, "learning_rate": 0.0001, "loss": 1.5451, "step": 14550 }, { "epoch": 0.4016506189821183, "grad_norm": 0.16432379186153412, "learning_rate": 0.0001, "loss": 1.5483, "step": 14600 }, { "epoch": 0.4030261348005502, "grad_norm": 0.16200096905231476, "learning_rate": 0.0001, "loss": 1.5458, "step": 14650 }, { "epoch": 0.40440165061898214, "grad_norm": 0.15324008464813232, "learning_rate": 0.0001, "loss": 1.5486, "step": 14700 }, { "epoch": 0.405777166437414, "grad_norm": 0.2114071398973465, "learning_rate": 0.0001, "loss": 1.5463, "step": 14750 }, { "epoch": 0.40715268225584594, "grad_norm": 0.1691250056028366, "learning_rate": 0.0001, "loss": 1.5449, "step": 14800 }, { "epoch": 0.40852819807427787, "grad_norm": 0.15044333040714264, "learning_rate": 0.0001, "loss": 1.5454, "step": 14850 }, { "epoch": 0.40990371389270974, "grad_norm": 0.14457371830940247, "learning_rate": 0.0001, "loss": 1.5475, "step": 14900 }, { "epoch": 0.41127922971114167, "grad_norm": 0.15145525336265564, "learning_rate": 0.0001, "loss": 1.5474, "step": 14950 }, { "epoch": 0.4126547455295736, "grad_norm": 0.1273120492696762, "learning_rate": 0.0001, "loss": 1.5446, "step": 15000 }, { "epoch": 0.4140302613480055, "grad_norm": 0.1621488630771637, "learning_rate": 0.0001, "loss": 1.5464, "step": 15050 }, { "epoch": 0.4154057771664374, "grad_norm": 0.1621532440185547, "learning_rate": 0.0001, "loss": 1.5472, "step": 15100 }, { "epoch": 0.4167812929848693, "grad_norm": 0.13030585646629333, "learning_rate": 0.0001, "loss": 1.5416, "step": 15150 }, { "epoch": 0.41815680880330125, "grad_norm": 0.18759876489639282, "learning_rate": 0.0001, "loss": 1.5448, "step": 15200 }, { "epoch": 0.4195323246217332, "grad_norm": 0.12614044547080994, "learning_rate": 0.0001, "loss": 1.5459, "step": 15250 }, { "epoch": 0.42090784044016505, "grad_norm": 0.11533529311418533, "learning_rate": 0.0001, "loss": 1.5446, "step": 15300 }, { "epoch": 0.422283356258597, "grad_norm": 0.1886916160583496, "learning_rate": 0.0001, "loss": 1.5466, "step": 15350 }, { "epoch": 0.4236588720770289, "grad_norm": 0.2204965353012085, "learning_rate": 0.0001, "loss": 1.5436, "step": 15400 }, { "epoch": 0.4250343878954608, "grad_norm": 0.12042222172021866, "learning_rate": 0.0001, "loss": 1.5425, "step": 15450 }, { "epoch": 0.4264099037138927, "grad_norm": 0.135628342628479, "learning_rate": 0.0001, "loss": 1.5464, "step": 15500 }, { "epoch": 0.42778541953232463, "grad_norm": 0.15042053163051605, "learning_rate": 0.0001, "loss": 1.5441, "step": 15550 }, { "epoch": 0.42916093535075656, "grad_norm": 0.1294483244419098, "learning_rate": 0.0001, "loss": 1.5468, "step": 15600 }, { "epoch": 0.4305364511691884, "grad_norm": 0.153069868683815, "learning_rate": 0.0001, "loss": 1.5416, "step": 15650 }, { "epoch": 0.43191196698762035, "grad_norm": 0.129000723361969, "learning_rate": 0.0001, "loss": 1.5434, "step": 15700 }, { "epoch": 0.4332874828060523, "grad_norm": 0.1890910267829895, "learning_rate": 0.0001, "loss": 1.5426, "step": 15750 }, { "epoch": 0.4346629986244842, "grad_norm": 0.14907212555408478, "learning_rate": 0.0001, "loss": 1.5447, "step": 15800 }, { "epoch": 0.4360385144429161, "grad_norm": 0.1549520045518875, "learning_rate": 0.0001, "loss": 1.5438, "step": 15850 }, { "epoch": 0.437414030261348, "grad_norm": 0.1726304590702057, "learning_rate": 0.0001, "loss": 1.5431, "step": 15900 }, { "epoch": 0.43878954607977994, "grad_norm": 0.14929509162902832, "learning_rate": 0.0001, "loss": 1.5408, "step": 15950 }, { "epoch": 0.4401650618982118, "grad_norm": 0.1404862105846405, "learning_rate": 0.0001, "loss": 1.5431, "step": 16000 }, { "epoch": 0.44154057771664373, "grad_norm": 0.1365077942609787, "learning_rate": 0.0001, "loss": 1.5434, "step": 16050 }, { "epoch": 0.44291609353507566, "grad_norm": 0.16866528987884521, "learning_rate": 0.0001, "loss": 1.5425, "step": 16100 }, { "epoch": 0.4442916093535076, "grad_norm": 0.13150258362293243, "learning_rate": 0.0001, "loss": 1.5418, "step": 16150 }, { "epoch": 0.44566712517193946, "grad_norm": 0.17333872616291046, "learning_rate": 0.0001, "loss": 1.5415, "step": 16200 }, { "epoch": 0.4470426409903714, "grad_norm": 0.2110324501991272, "learning_rate": 0.0001, "loss": 1.5434, "step": 16250 }, { "epoch": 0.4484181568088033, "grad_norm": 0.19441699981689453, "learning_rate": 0.0001, "loss": 1.5408, "step": 16300 }, { "epoch": 0.4497936726272352, "grad_norm": 0.1581384241580963, "learning_rate": 0.0001, "loss": 1.5428, "step": 16350 }, { "epoch": 0.4511691884456671, "grad_norm": 0.14479832351207733, "learning_rate": 0.0001, "loss": 1.5444, "step": 16400 }, { "epoch": 0.45254470426409904, "grad_norm": 0.16739803552627563, "learning_rate": 0.0001, "loss": 1.541, "step": 16450 }, { "epoch": 0.45392022008253097, "grad_norm": 0.14801441133022308, "learning_rate": 0.0001, "loss": 1.54, "step": 16500 }, { "epoch": 0.45529573590096284, "grad_norm": 0.13265211880207062, "learning_rate": 0.0001, "loss": 1.5417, "step": 16550 }, { "epoch": 0.45667125171939477, "grad_norm": 0.1164972111582756, "learning_rate": 0.0001, "loss": 1.5411, "step": 16600 }, { "epoch": 0.4580467675378267, "grad_norm": 0.1256764531135559, "learning_rate": 0.0001, "loss": 1.538, "step": 16650 }, { "epoch": 0.4594222833562586, "grad_norm": 0.13301979005336761, "learning_rate": 0.0001, "loss": 1.5409, "step": 16700 }, { "epoch": 0.4607977991746905, "grad_norm": 0.1520063877105713, "learning_rate": 0.0001, "loss": 1.5406, "step": 16750 }, { "epoch": 0.4621733149931224, "grad_norm": 0.12742547690868378, "learning_rate": 0.0001, "loss": 1.5405, "step": 16800 }, { "epoch": 0.46354883081155435, "grad_norm": 0.17311689257621765, "learning_rate": 0.0001, "loss": 1.5416, "step": 16850 }, { "epoch": 0.4649243466299862, "grad_norm": 0.14269371330738068, "learning_rate": 0.0001, "loss": 1.5413, "step": 16900 }, { "epoch": 0.46629986244841815, "grad_norm": 0.14457383751869202, "learning_rate": 0.0001, "loss": 1.5415, "step": 16950 }, { "epoch": 0.4676753782668501, "grad_norm": 0.13189777731895447, "learning_rate": 0.0001, "loss": 1.5388, "step": 17000 }, { "epoch": 0.469050894085282, "grad_norm": 0.16488979756832123, "learning_rate": 0.0001, "loss": 1.5398, "step": 17050 }, { "epoch": 0.47042640990371387, "grad_norm": 0.15953794121742249, "learning_rate": 0.0001, "loss": 1.5387, "step": 17100 }, { "epoch": 0.4718019257221458, "grad_norm": 0.11922045797109604, "learning_rate": 0.0001, "loss": 1.5389, "step": 17150 }, { "epoch": 0.4731774415405777, "grad_norm": 0.13724352419376373, "learning_rate": 0.0001, "loss": 1.5399, "step": 17200 }, { "epoch": 0.47455295735900965, "grad_norm": 0.14968377351760864, "learning_rate": 0.0001, "loss": 1.5419, "step": 17250 }, { "epoch": 0.4759284731774415, "grad_norm": 0.17267867922782898, "learning_rate": 0.0001, "loss": 1.5395, "step": 17300 }, { "epoch": 0.47730398899587345, "grad_norm": 0.14226895570755005, "learning_rate": 0.0001, "loss": 1.5386, "step": 17350 }, { "epoch": 0.4786795048143054, "grad_norm": 0.15129058063030243, "learning_rate": 0.0001, "loss": 1.5424, "step": 17400 }, { "epoch": 0.48005502063273725, "grad_norm": 0.2448931634426117, "learning_rate": 0.0001, "loss": 1.5396, "step": 17450 }, { "epoch": 0.4814305364511692, "grad_norm": 0.2225511074066162, "learning_rate": 0.0001, "loss": 1.5404, "step": 17500 }, { "epoch": 0.4828060522696011, "grad_norm": 0.1891157031059265, "learning_rate": 0.0001, "loss": 1.5394, "step": 17550 }, { "epoch": 0.48418156808803303, "grad_norm": 0.1472170352935791, "learning_rate": 0.0001, "loss": 1.5417, "step": 17600 }, { "epoch": 0.4855570839064649, "grad_norm": 0.1682361215353012, "learning_rate": 0.0001, "loss": 1.5377, "step": 17650 }, { "epoch": 0.48693259972489683, "grad_norm": 0.18433457612991333, "learning_rate": 0.0001, "loss": 1.5396, "step": 17700 }, { "epoch": 0.48830811554332876, "grad_norm": 0.15077999234199524, "learning_rate": 0.0001, "loss": 1.5392, "step": 17750 }, { "epoch": 0.4896836313617607, "grad_norm": 0.16640494763851166, "learning_rate": 0.0001, "loss": 1.5381, "step": 17800 }, { "epoch": 0.49105914718019256, "grad_norm": 0.1587841510772705, "learning_rate": 0.0001, "loss": 1.5386, "step": 17850 }, { "epoch": 0.4924346629986245, "grad_norm": 0.15444575250148773, "learning_rate": 0.0001, "loss": 1.5389, "step": 17900 }, { "epoch": 0.4938101788170564, "grad_norm": 0.18525558710098267, "learning_rate": 0.0001, "loss": 1.5404, "step": 17950 }, { "epoch": 0.4951856946354883, "grad_norm": 0.12790025770664215, "learning_rate": 0.0001, "loss": 1.5394, "step": 18000 }, { "epoch": 0.4965612104539202, "grad_norm": 0.12284336239099503, "learning_rate": 0.0001, "loss": 1.5389, "step": 18050 }, { "epoch": 0.49793672627235214, "grad_norm": 0.12023458629846573, "learning_rate": 0.0001, "loss": 1.5345, "step": 18100 }, { "epoch": 0.49931224209078406, "grad_norm": 0.220647931098938, "learning_rate": 0.0001, "loss": 1.5398, "step": 18150 }, { "epoch": 0.5006877579092159, "grad_norm": 0.1563023179769516, "learning_rate": 0.0001, "loss": 1.5361, "step": 18200 }, { "epoch": 0.5020632737276479, "grad_norm": 0.15485098958015442, "learning_rate": 0.0001, "loss": 1.539, "step": 18250 }, { "epoch": 0.5034387895460798, "grad_norm": 0.21312743425369263, "learning_rate": 0.0001, "loss": 1.5378, "step": 18300 }, { "epoch": 0.5048143053645117, "grad_norm": 0.1381313055753708, "learning_rate": 0.0001, "loss": 1.5396, "step": 18350 }, { "epoch": 0.5061898211829436, "grad_norm": 0.1357322335243225, "learning_rate": 0.0001, "loss": 1.5398, "step": 18400 }, { "epoch": 0.5075653370013755, "grad_norm": 0.16733530163764954, "learning_rate": 0.0001, "loss": 1.5381, "step": 18450 }, { "epoch": 0.5089408528198074, "grad_norm": 0.12985962629318237, "learning_rate": 0.0001, "loss": 1.5391, "step": 18500 }, { "epoch": 0.5103163686382394, "grad_norm": 0.17726540565490723, "learning_rate": 0.0001, "loss": 1.5406, "step": 18550 }, { "epoch": 0.5116918844566712, "grad_norm": 0.1869622766971588, "learning_rate": 0.0001, "loss": 1.5379, "step": 18600 }, { "epoch": 0.5130674002751031, "grad_norm": 0.19111870229244232, "learning_rate": 0.0001, "loss": 1.5373, "step": 18650 }, { "epoch": 0.5144429160935351, "grad_norm": 0.16479162871837616, "learning_rate": 0.0001, "loss": 1.5346, "step": 18700 }, { "epoch": 0.515818431911967, "grad_norm": 0.17092610895633698, "learning_rate": 0.0001, "loss": 1.5387, "step": 18750 }, { "epoch": 0.517193947730399, "grad_norm": 0.1678820550441742, "learning_rate": 0.0001, "loss": 1.5376, "step": 18800 }, { "epoch": 0.5185694635488308, "grad_norm": 0.14618681371212006, "learning_rate": 0.0001, "loss": 1.5353, "step": 18850 }, { "epoch": 0.5199449793672627, "grad_norm": 0.192416712641716, "learning_rate": 0.0001, "loss": 1.54, "step": 18900 }, { "epoch": 0.5213204951856947, "grad_norm": 0.17582687735557556, "learning_rate": 0.0001, "loss": 1.5346, "step": 18950 }, { "epoch": 0.5226960110041265, "grad_norm": 0.19511322677135468, "learning_rate": 0.0001, "loss": 1.5371, "step": 19000 }, { "epoch": 0.5240715268225584, "grad_norm": 0.15874715149402618, "learning_rate": 0.0001, "loss": 1.5362, "step": 19050 }, { "epoch": 0.5254470426409904, "grad_norm": 0.17555968463420868, "learning_rate": 0.0001, "loss": 1.5342, "step": 19100 }, { "epoch": 0.5268225584594223, "grad_norm": 0.17204701900482178, "learning_rate": 0.0001, "loss": 1.5356, "step": 19150 }, { "epoch": 0.5281980742778541, "grad_norm": 0.1334696263074875, "learning_rate": 0.0001, "loss": 1.5378, "step": 19200 }, { "epoch": 0.5295735900962861, "grad_norm": 0.12202008068561554, "learning_rate": 0.0001, "loss": 1.536, "step": 19250 }, { "epoch": 0.530949105914718, "grad_norm": 0.1914770007133484, "learning_rate": 0.0001, "loss": 1.5361, "step": 19300 }, { "epoch": 0.53232462173315, "grad_norm": 0.18114732205867767, "learning_rate": 0.0001, "loss": 1.5391, "step": 19350 }, { "epoch": 0.5337001375515819, "grad_norm": 0.13230808079242706, "learning_rate": 0.0001, "loss": 1.5398, "step": 19400 }, { "epoch": 0.5350756533700137, "grad_norm": 0.24269579350948334, "learning_rate": 0.0001, "loss": 1.535, "step": 19450 }, { "epoch": 0.5364511691884457, "grad_norm": 0.14454102516174316, "learning_rate": 0.0001, "loss": 1.5339, "step": 19500 }, { "epoch": 0.5378266850068776, "grad_norm": 0.17638514935970306, "learning_rate": 0.0001, "loss": 1.5385, "step": 19550 }, { "epoch": 0.5392022008253095, "grad_norm": 0.1496788114309311, "learning_rate": 0.0001, "loss": 1.5389, "step": 19600 }, { "epoch": 0.5405777166437414, "grad_norm": 0.1927812695503235, "learning_rate": 0.0001, "loss": 1.5357, "step": 19650 }, { "epoch": 0.5419532324621733, "grad_norm": 0.1372377574443817, "learning_rate": 0.0001, "loss": 1.5363, "step": 19700 }, { "epoch": 0.5433287482806052, "grad_norm": 0.15738138556480408, "learning_rate": 0.0001, "loss": 1.5358, "step": 19750 }, { "epoch": 0.5447042640990372, "grad_norm": 0.13599953055381775, "learning_rate": 0.0001, "loss": 1.5357, "step": 19800 }, { "epoch": 0.546079779917469, "grad_norm": 0.16571839153766632, "learning_rate": 0.0001, "loss": 1.5343, "step": 19850 }, { "epoch": 0.547455295735901, "grad_norm": 0.14264202117919922, "learning_rate": 0.0001, "loss": 1.5315, "step": 19900 }, { "epoch": 0.5488308115543329, "grad_norm": 0.15331332385540009, "learning_rate": 0.0001, "loss": 1.5344, "step": 19950 }, { "epoch": 0.5502063273727648, "grad_norm": 0.1380966752767563, "learning_rate": 0.0001, "loss": 1.5357, "step": 20000 }, { "epoch": 0.5515818431911967, "grad_norm": 0.198713481426239, "learning_rate": 0.0001, "loss": 1.5323, "step": 20050 }, { "epoch": 0.5529573590096286, "grad_norm": 0.12092329561710358, "learning_rate": 0.0001, "loss": 1.5328, "step": 20100 }, { "epoch": 0.5543328748280605, "grad_norm": 0.13770416378974915, "learning_rate": 0.0001, "loss": 1.5346, "step": 20150 }, { "epoch": 0.5557083906464925, "grad_norm": 0.12443804740905762, "learning_rate": 0.0001, "loss": 1.5312, "step": 20200 }, { "epoch": 0.5570839064649243, "grad_norm": 0.15430398285388947, "learning_rate": 0.0001, "loss": 1.5322, "step": 20250 }, { "epoch": 0.5584594222833562, "grad_norm": 0.1415732502937317, "learning_rate": 0.0001, "loss": 1.5338, "step": 20300 }, { "epoch": 0.5598349381017882, "grad_norm": 0.2753756642341614, "learning_rate": 0.0001, "loss": 1.5329, "step": 20350 }, { "epoch": 0.5612104539202201, "grad_norm": 0.1666756421327591, "learning_rate": 0.0001, "loss": 1.5337, "step": 20400 }, { "epoch": 0.562585969738652, "grad_norm": 0.17720907926559448, "learning_rate": 0.0001, "loss": 1.5312, "step": 20450 }, { "epoch": 0.5639614855570839, "grad_norm": 0.18275785446166992, "learning_rate": 0.0001, "loss": 1.5333, "step": 20500 }, { "epoch": 0.5653370013755158, "grad_norm": 0.20009452104568481, "learning_rate": 0.0001, "loss": 1.5301, "step": 20550 }, { "epoch": 0.5667125171939478, "grad_norm": 0.18812476098537445, "learning_rate": 0.0001, "loss": 1.5332, "step": 20600 }, { "epoch": 0.5680880330123796, "grad_norm": 0.15448282659053802, "learning_rate": 0.0001, "loss": 1.5323, "step": 20650 }, { "epoch": 0.5694635488308115, "grad_norm": 0.1646738499403, "learning_rate": 0.0001, "loss": 1.5335, "step": 20700 }, { "epoch": 0.5708390646492435, "grad_norm": 0.15908415615558624, "learning_rate": 0.0001, "loss": 1.5319, "step": 20750 }, { "epoch": 0.5722145804676754, "grad_norm": 0.15112848579883575, "learning_rate": 0.0001, "loss": 1.5342, "step": 20800 }, { "epoch": 0.5735900962861072, "grad_norm": 0.3316288888454437, "learning_rate": 0.0001, "loss": 1.5344, "step": 20850 }, { "epoch": 0.5749656121045392, "grad_norm": 0.13579101860523224, "learning_rate": 0.0001, "loss": 1.5321, "step": 20900 }, { "epoch": 0.5763411279229711, "grad_norm": 0.2203134000301361, "learning_rate": 0.0001, "loss": 1.5324, "step": 20950 }, { "epoch": 0.5777166437414031, "grad_norm": 0.1271039992570877, "learning_rate": 0.0001, "loss": 1.5328, "step": 21000 }, { "epoch": 0.579092159559835, "grad_norm": 0.3165966272354126, "learning_rate": 0.0001, "loss": 1.5349, "step": 21050 }, { "epoch": 0.5804676753782668, "grad_norm": 0.1456591635942459, "learning_rate": 0.0001, "loss": 1.5343, "step": 21100 }, { "epoch": 0.5818431911966988, "grad_norm": 0.16555163264274597, "learning_rate": 0.0001, "loss": 1.5349, "step": 21150 }, { "epoch": 0.5832187070151307, "grad_norm": 0.22577494382858276, "learning_rate": 0.0001, "loss": 1.5342, "step": 21200 }, { "epoch": 0.5845942228335625, "grad_norm": 0.23455490171909332, "learning_rate": 0.0001, "loss": 1.5346, "step": 21250 }, { "epoch": 0.5859697386519945, "grad_norm": 0.2247081696987152, "learning_rate": 0.0001, "loss": 1.5316, "step": 21300 }, { "epoch": 0.5873452544704264, "grad_norm": 0.15159213542938232, "learning_rate": 0.0001, "loss": 1.534, "step": 21350 }, { "epoch": 0.5887207702888583, "grad_norm": 0.20483700931072235, "learning_rate": 0.0001, "loss": 1.5295, "step": 21400 }, { "epoch": 0.5900962861072903, "grad_norm": 0.16780568659305573, "learning_rate": 0.0001, "loss": 1.5341, "step": 21450 }, { "epoch": 0.5914718019257221, "grad_norm": 0.15840616822242737, "learning_rate": 0.0001, "loss": 1.5339, "step": 21500 }, { "epoch": 0.5928473177441541, "grad_norm": 0.1488318294286728, "learning_rate": 0.0001, "loss": 1.5341, "step": 21550 }, { "epoch": 0.594222833562586, "grad_norm": 0.13899248838424683, "learning_rate": 0.0001, "loss": 1.5323, "step": 21600 }, { "epoch": 0.5955983493810179, "grad_norm": 0.15024836361408234, "learning_rate": 0.0001, "loss": 1.5318, "step": 21650 }, { "epoch": 0.5969738651994498, "grad_norm": 0.19209244847297668, "learning_rate": 0.0001, "loss": 1.5325, "step": 21700 }, { "epoch": 0.5983493810178817, "grad_norm": 0.20580926537513733, "learning_rate": 0.0001, "loss": 1.5324, "step": 21750 }, { "epoch": 0.5997248968363136, "grad_norm": 0.2091200202703476, "learning_rate": 0.0001, "loss": 1.5282, "step": 21800 }, { "epoch": 0.6011004126547456, "grad_norm": 0.1571815311908722, "learning_rate": 0.0001, "loss": 1.532, "step": 21850 }, { "epoch": 0.6024759284731774, "grad_norm": 0.17794279754161835, "learning_rate": 0.0001, "loss": 1.5326, "step": 21900 }, { "epoch": 0.6038514442916093, "grad_norm": 0.1439165472984314, "learning_rate": 0.0001, "loss": 1.5325, "step": 21950 }, { "epoch": 0.6052269601100413, "grad_norm": 0.15884612500667572, "learning_rate": 0.0001, "loss": 1.5329, "step": 22000 }, { "epoch": 0.6066024759284732, "grad_norm": 0.26263782382011414, "learning_rate": 0.0001, "loss": 1.5315, "step": 22050 }, { "epoch": 0.6079779917469051, "grad_norm": 0.19535377621650696, "learning_rate": 0.0001, "loss": 1.5308, "step": 22100 }, { "epoch": 0.609353507565337, "grad_norm": 0.14018963277339935, "learning_rate": 0.0001, "loss": 1.5332, "step": 22150 }, { "epoch": 0.6107290233837689, "grad_norm": 0.15927653014659882, "learning_rate": 0.0001, "loss": 1.5299, "step": 22200 }, { "epoch": 0.6121045392022009, "grad_norm": 0.143597811460495, "learning_rate": 0.0001, "loss": 1.532, "step": 22250 }, { "epoch": 0.6134800550206327, "grad_norm": 0.15887697041034698, "learning_rate": 0.0001, "loss": 1.5313, "step": 22300 }, { "epoch": 0.6148555708390646, "grad_norm": 0.1907578855752945, "learning_rate": 0.0001, "loss": 1.5323, "step": 22350 }, { "epoch": 0.6162310866574966, "grad_norm": 0.189689502120018, "learning_rate": 0.0001, "loss": 1.5319, "step": 22400 }, { "epoch": 0.6176066024759285, "grad_norm": 0.15399134159088135, "learning_rate": 0.0001, "loss": 1.5291, "step": 22450 }, { "epoch": 0.6189821182943603, "grad_norm": 0.16801948845386505, "learning_rate": 0.0001, "loss": 1.5319, "step": 22500 }, { "epoch": 0.6203576341127923, "grad_norm": 0.21341322362422943, "learning_rate": 0.0001, "loss": 1.5311, "step": 22550 }, { "epoch": 0.6217331499312242, "grad_norm": 0.19961433112621307, "learning_rate": 0.0001, "loss": 1.529, "step": 22600 }, { "epoch": 0.6231086657496562, "grad_norm": 0.1254952847957611, "learning_rate": 0.0001, "loss": 1.528, "step": 22650 }, { "epoch": 0.624484181568088, "grad_norm": 0.21346162259578705, "learning_rate": 0.0001, "loss": 1.5323, "step": 22700 }, { "epoch": 0.6258596973865199, "grad_norm": 0.1551300436258316, "learning_rate": 0.0001, "loss": 1.5302, "step": 22750 }, { "epoch": 0.6272352132049519, "grad_norm": 0.1974526047706604, "learning_rate": 0.0001, "loss": 1.5294, "step": 22800 }, { "epoch": 0.6286107290233838, "grad_norm": 0.130974680185318, "learning_rate": 0.0001, "loss": 1.5303, "step": 22850 }, { "epoch": 0.6299862448418156, "grad_norm": 0.17787273228168488, "learning_rate": 0.0001, "loss": 1.5299, "step": 22900 }, { "epoch": 0.6313617606602476, "grad_norm": 0.19317127764225006, "learning_rate": 0.0001, "loss": 1.5295, "step": 22950 }, { "epoch": 0.6327372764786795, "grad_norm": 0.2229757010936737, "learning_rate": 0.0001, "loss": 1.5307, "step": 23000 }, { "epoch": 0.6341127922971114, "grad_norm": 0.17582648992538452, "learning_rate": 0.0001, "loss": 1.5294, "step": 23050 }, { "epoch": 0.6354883081155434, "grad_norm": 0.17122450470924377, "learning_rate": 0.0001, "loss": 1.5291, "step": 23100 }, { "epoch": 0.6368638239339752, "grad_norm": 0.16124916076660156, "learning_rate": 0.0001, "loss": 1.5268, "step": 23150 }, { "epoch": 0.6382393397524071, "grad_norm": 0.18122687935829163, "learning_rate": 0.0001, "loss": 1.5274, "step": 23200 }, { "epoch": 0.6396148555708391, "grad_norm": 0.17480894923210144, "learning_rate": 0.0001, "loss": 1.5276, "step": 23250 }, { "epoch": 0.640990371389271, "grad_norm": 0.1798102855682373, "learning_rate": 0.0001, "loss": 1.5267, "step": 23300 }, { "epoch": 0.6423658872077029, "grad_norm": 0.19186878204345703, "learning_rate": 0.0001, "loss": 1.5294, "step": 23350 }, { "epoch": 0.6437414030261348, "grad_norm": 0.1212744414806366, "learning_rate": 0.0001, "loss": 1.527, "step": 23400 }, { "epoch": 0.6451169188445667, "grad_norm": 0.16844585537910461, "learning_rate": 0.0001, "loss": 1.5265, "step": 23450 }, { "epoch": 0.6464924346629987, "grad_norm": 0.16216999292373657, "learning_rate": 0.0001, "loss": 1.5288, "step": 23500 }, { "epoch": 0.6478679504814305, "grad_norm": 0.157547265291214, "learning_rate": 0.0001, "loss": 1.5298, "step": 23550 }, { "epoch": 0.6492434662998624, "grad_norm": 0.20760610699653625, "learning_rate": 0.0001, "loss": 1.5264, "step": 23600 }, { "epoch": 0.6506189821182944, "grad_norm": 0.19178840517997742, "learning_rate": 0.0001, "loss": 1.5251, "step": 23650 }, { "epoch": 0.6519944979367263, "grad_norm": 0.17904846370220184, "learning_rate": 0.0001, "loss": 1.5293, "step": 23700 }, { "epoch": 0.6533700137551581, "grad_norm": 0.14902061223983765, "learning_rate": 0.0001, "loss": 1.5278, "step": 23750 }, { "epoch": 0.6547455295735901, "grad_norm": 0.1306075155735016, "learning_rate": 0.0001, "loss": 1.5274, "step": 23800 }, { "epoch": 0.656121045392022, "grad_norm": 0.14361289143562317, "learning_rate": 0.0001, "loss": 1.5259, "step": 23850 }, { "epoch": 0.657496561210454, "grad_norm": 0.23775485157966614, "learning_rate": 0.0001, "loss": 1.528, "step": 23900 }, { "epoch": 0.6588720770288858, "grad_norm": 0.12788158655166626, "learning_rate": 0.0001, "loss": 1.5285, "step": 23950 }, { "epoch": 0.6602475928473177, "grad_norm": 0.11719505488872528, "learning_rate": 0.0001, "loss": 1.5275, "step": 24000 }, { "epoch": 0.6616231086657497, "grad_norm": 0.2011108100414276, "learning_rate": 0.0001, "loss": 1.5276, "step": 24050 }, { "epoch": 0.6629986244841816, "grad_norm": 0.16335125267505646, "learning_rate": 0.0001, "loss": 1.5305, "step": 24100 }, { "epoch": 0.6643741403026134, "grad_norm": 0.15488557517528534, "learning_rate": 0.0001, "loss": 1.5259, "step": 24150 }, { "epoch": 0.6657496561210454, "grad_norm": 0.2333500236272812, "learning_rate": 0.0001, "loss": 1.5269, "step": 24200 }, { "epoch": 0.6671251719394773, "grad_norm": 0.14059284329414368, "learning_rate": 0.0001, "loss": 1.5298, "step": 24250 }, { "epoch": 0.6685006877579092, "grad_norm": 0.24036471545696259, "learning_rate": 0.0001, "loss": 1.5274, "step": 24300 }, { "epoch": 0.6698762035763411, "grad_norm": 0.13437625765800476, "learning_rate": 0.0001, "loss": 1.529, "step": 24350 }, { "epoch": 0.671251719394773, "grad_norm": 0.25569766759872437, "learning_rate": 0.0001, "loss": 1.5259, "step": 24400 }, { "epoch": 0.672627235213205, "grad_norm": 0.14324542880058289, "learning_rate": 0.0001, "loss": 1.5286, "step": 24450 }, { "epoch": 0.6740027510316369, "grad_norm": 0.2062855213880539, "learning_rate": 0.0001, "loss": 1.5259, "step": 24500 }, { "epoch": 0.6753782668500687, "grad_norm": 0.18274646997451782, "learning_rate": 0.0001, "loss": 1.5293, "step": 24550 }, { "epoch": 0.6767537826685007, "grad_norm": 0.16611768305301666, "learning_rate": 0.0001, "loss": 1.5283, "step": 24600 }, { "epoch": 0.6781292984869326, "grad_norm": 0.2058711051940918, "learning_rate": 0.0001, "loss": 1.5253, "step": 24650 }, { "epoch": 0.6795048143053645, "grad_norm": 0.16299676895141602, "learning_rate": 0.0001, "loss": 1.5281, "step": 24700 }, { "epoch": 0.6808803301237965, "grad_norm": 0.17875225841999054, "learning_rate": 0.0001, "loss": 1.5266, "step": 24750 }, { "epoch": 0.6822558459422283, "grad_norm": 0.18055297434329987, "learning_rate": 0.0001, "loss": 1.5269, "step": 24800 }, { "epoch": 0.6836313617606602, "grad_norm": 0.22491872310638428, "learning_rate": 0.0001, "loss": 1.5236, "step": 24850 }, { "epoch": 0.6850068775790922, "grad_norm": 0.17760007083415985, "learning_rate": 0.0001, "loss": 1.5249, "step": 24900 }, { "epoch": 0.686382393397524, "grad_norm": 0.19768892228603363, "learning_rate": 0.0001, "loss": 1.5254, "step": 24950 }, { "epoch": 0.687757909215956, "grad_norm": 0.16851931810379028, "learning_rate": 0.0001, "loss": 1.5284, "step": 25000 }, { "epoch": 0.6891334250343879, "grad_norm": 0.16162404417991638, "learning_rate": 0.0001, "loss": 1.5278, "step": 25050 }, { "epoch": 0.6905089408528198, "grad_norm": 0.1808663010597229, "learning_rate": 0.0001, "loss": 1.5239, "step": 25100 }, { "epoch": 0.6918844566712518, "grad_norm": 0.15550534427165985, "learning_rate": 0.0001, "loss": 1.5266, "step": 25150 }, { "epoch": 0.6932599724896836, "grad_norm": 0.22426332533359528, "learning_rate": 0.0001, "loss": 1.5226, "step": 25200 }, { "epoch": 0.6946354883081155, "grad_norm": 0.11868047714233398, "learning_rate": 0.0001, "loss": 1.5256, "step": 25250 }, { "epoch": 0.6960110041265475, "grad_norm": 0.21659235656261444, "learning_rate": 0.0001, "loss": 1.5284, "step": 25300 }, { "epoch": 0.6973865199449794, "grad_norm": 0.1800456941127777, "learning_rate": 0.0001, "loss": 1.5235, "step": 25350 }, { "epoch": 0.6987620357634112, "grad_norm": 0.21043701469898224, "learning_rate": 0.0001, "loss": 1.5275, "step": 25400 }, { "epoch": 0.7001375515818432, "grad_norm": 0.18925617635250092, "learning_rate": 0.0001, "loss": 1.5279, "step": 25450 }, { "epoch": 0.7015130674002751, "grad_norm": 0.1537819653749466, "learning_rate": 0.0001, "loss": 1.5243, "step": 25500 }, { "epoch": 0.7028885832187071, "grad_norm": 0.1832038164138794, "learning_rate": 0.0001, "loss": 1.5255, "step": 25550 }, { "epoch": 0.7042640990371389, "grad_norm": 0.186794713139534, "learning_rate": 0.0001, "loss": 1.5261, "step": 25600 }, { "epoch": 0.7056396148555708, "grad_norm": 0.12374402582645416, "learning_rate": 0.0001, "loss": 1.526, "step": 25650 }, { "epoch": 0.7070151306740028, "grad_norm": 0.16702401638031006, "learning_rate": 0.0001, "loss": 1.5245, "step": 25700 }, { "epoch": 0.7083906464924347, "grad_norm": 0.1393430233001709, "learning_rate": 0.0001, "loss": 1.5254, "step": 25750 }, { "epoch": 0.7097661623108665, "grad_norm": 0.1630173921585083, "learning_rate": 0.0001, "loss": 1.5251, "step": 25800 }, { "epoch": 0.7111416781292985, "grad_norm": 0.1440727412700653, "learning_rate": 0.0001, "loss": 1.5282, "step": 25850 }, { "epoch": 0.7125171939477304, "grad_norm": 0.17978446185588837, "learning_rate": 0.0001, "loss": 1.5262, "step": 25900 }, { "epoch": 0.7138927097661623, "grad_norm": 0.151292085647583, "learning_rate": 0.0001, "loss": 1.527, "step": 25950 }, { "epoch": 0.7152682255845942, "grad_norm": 0.24109718203544617, "learning_rate": 0.0001, "loss": 1.5235, "step": 26000 }, { "epoch": 0.7166437414030261, "grad_norm": 0.15700335800647736, "learning_rate": 0.0001, "loss": 1.5245, "step": 26050 }, { "epoch": 0.7180192572214581, "grad_norm": 0.14807374775409698, "learning_rate": 0.0001, "loss": 1.5224, "step": 26100 }, { "epoch": 0.71939477303989, "grad_norm": 0.13032929599285126, "learning_rate": 0.0001, "loss": 1.5221, "step": 26150 }, { "epoch": 0.7207702888583218, "grad_norm": 0.1900160163640976, "learning_rate": 0.0001, "loss": 1.5259, "step": 26200 }, { "epoch": 0.7221458046767538, "grad_norm": 0.20619365572929382, "learning_rate": 0.0001, "loss": 1.5261, "step": 26250 }, { "epoch": 0.7235213204951857, "grad_norm": 0.17259658873081207, "learning_rate": 0.0001, "loss": 1.5272, "step": 26300 }, { "epoch": 0.7248968363136176, "grad_norm": 0.1594364494085312, "learning_rate": 0.0001, "loss": 1.5242, "step": 26350 }, { "epoch": 0.7262723521320495, "grad_norm": 0.16156145930290222, "learning_rate": 0.0001, "loss": 1.5263, "step": 26400 }, { "epoch": 0.7276478679504814, "grad_norm": 0.15612217783927917, "learning_rate": 0.0001, "loss": 1.5232, "step": 26450 }, { "epoch": 0.7290233837689133, "grad_norm": 0.2097177803516388, "learning_rate": 0.0001, "loss": 1.5265, "step": 26500 }, { "epoch": 0.7303988995873453, "grad_norm": 0.18174001574516296, "learning_rate": 0.0001, "loss": 1.5235, "step": 26550 }, { "epoch": 0.7317744154057771, "grad_norm": 0.15661188960075378, "learning_rate": 0.0001, "loss": 1.5239, "step": 26600 }, { "epoch": 0.7331499312242091, "grad_norm": 0.17666810750961304, "learning_rate": 0.0001, "loss": 1.5244, "step": 26650 }, { "epoch": 0.734525447042641, "grad_norm": 0.135247141122818, "learning_rate": 0.0001, "loss": 1.5228, "step": 26700 }, { "epoch": 0.7359009628610729, "grad_norm": 0.17839883267879486, "learning_rate": 0.0001, "loss": 1.522, "step": 26750 }, { "epoch": 0.7372764786795049, "grad_norm": 0.1601705551147461, "learning_rate": 0.0001, "loss": 1.5258, "step": 26800 }, { "epoch": 0.7386519944979367, "grad_norm": 0.21927671134471893, "learning_rate": 0.0001, "loss": 1.5234, "step": 26850 }, { "epoch": 0.7400275103163686, "grad_norm": 0.18870490789413452, "learning_rate": 0.0001, "loss": 1.5222, "step": 26900 }, { "epoch": 0.7414030261348006, "grad_norm": 0.17285650968551636, "learning_rate": 0.0001, "loss": 1.5243, "step": 26950 }, { "epoch": 0.7427785419532325, "grad_norm": 0.14226007461547852, "learning_rate": 0.0001, "loss": 1.5265, "step": 27000 }, { "epoch": 0.7441540577716643, "grad_norm": 0.17631758749485016, "learning_rate": 0.0001, "loss": 1.5209, "step": 27050 }, { "epoch": 0.7455295735900963, "grad_norm": 0.22787536680698395, "learning_rate": 0.0001, "loss": 1.5233, "step": 27100 }, { "epoch": 0.7469050894085282, "grad_norm": 0.14378662407398224, "learning_rate": 0.0001, "loss": 1.5214, "step": 27150 }, { "epoch": 0.7482806052269602, "grad_norm": 0.21862713992595673, "learning_rate": 0.0001, "loss": 1.5211, "step": 27200 }, { "epoch": 0.749656121045392, "grad_norm": 0.15041618049144745, "learning_rate": 0.0001, "loss": 1.5233, "step": 27250 }, { "epoch": 0.7510316368638239, "grad_norm": 0.15543252229690552, "learning_rate": 0.0001, "loss": 1.5216, "step": 27300 }, { "epoch": 0.7524071526822559, "grad_norm": 0.1488107591867447, "learning_rate": 0.0001, "loss": 1.5237, "step": 27350 }, { "epoch": 0.7537826685006878, "grad_norm": 0.2412855178117752, "learning_rate": 0.0001, "loss": 1.5236, "step": 27400 }, { "epoch": 0.7551581843191196, "grad_norm": 0.21001331508159637, "learning_rate": 0.0001, "loss": 1.5227, "step": 27450 }, { "epoch": 0.7565337001375516, "grad_norm": 0.16884082555770874, "learning_rate": 0.0001, "loss": 1.523, "step": 27500 }, { "epoch": 0.7579092159559835, "grad_norm": 0.1195225790143013, "learning_rate": 0.0001, "loss": 1.5223, "step": 27550 }, { "epoch": 0.7592847317744154, "grad_norm": 0.2539023160934448, "learning_rate": 0.0001, "loss": 1.5223, "step": 27600 }, { "epoch": 0.7606602475928473, "grad_norm": 0.17333871126174927, "learning_rate": 0.0001, "loss": 1.5207, "step": 27650 }, { "epoch": 0.7620357634112792, "grad_norm": 0.14636480808258057, "learning_rate": 0.0001, "loss": 1.5241, "step": 27700 }, { "epoch": 0.7634112792297112, "grad_norm": 0.13305403292179108, "learning_rate": 0.0001, "loss": 1.5224, "step": 27750 }, { "epoch": 0.7647867950481431, "grad_norm": 0.18532030284404755, "learning_rate": 0.0001, "loss": 1.5234, "step": 27800 }, { "epoch": 0.7661623108665749, "grad_norm": 0.1548730880022049, "learning_rate": 0.0001, "loss": 1.5224, "step": 27850 }, { "epoch": 0.7675378266850069, "grad_norm": 0.20586071908473969, "learning_rate": 0.0001, "loss": 1.5219, "step": 27900 }, { "epoch": 0.7689133425034388, "grad_norm": 0.13693679869174957, "learning_rate": 0.0001, "loss": 1.5226, "step": 27950 }, { "epoch": 0.7702888583218707, "grad_norm": 0.17651352286338806, "learning_rate": 0.0001, "loss": 1.5198, "step": 28000 }, { "epoch": 0.7716643741403026, "grad_norm": 0.19794145226478577, "learning_rate": 0.0001, "loss": 1.5243, "step": 28050 }, { "epoch": 0.7730398899587345, "grad_norm": 0.14593897759914398, "learning_rate": 0.0001, "loss": 1.5203, "step": 28100 }, { "epoch": 0.7744154057771664, "grad_norm": 0.18138128519058228, "learning_rate": 0.0001, "loss": 1.5189, "step": 28150 }, { "epoch": 0.7757909215955984, "grad_norm": 0.15987426042556763, "learning_rate": 0.0001, "loss": 1.5209, "step": 28200 }, { "epoch": 0.7771664374140302, "grad_norm": 0.15444040298461914, "learning_rate": 0.0001, "loss": 1.5187, "step": 28250 }, { "epoch": 0.7785419532324622, "grad_norm": 0.22651028633117676, "learning_rate": 0.0001, "loss": 1.5201, "step": 28300 }, { "epoch": 0.7799174690508941, "grad_norm": 0.1889326423406601, "learning_rate": 0.0001, "loss": 1.522, "step": 28350 }, { "epoch": 0.781292984869326, "grad_norm": 0.1659088283777237, "learning_rate": 0.0001, "loss": 1.5211, "step": 28400 }, { "epoch": 0.782668500687758, "grad_norm": 0.20580235123634338, "learning_rate": 0.0001, "loss": 1.5215, "step": 28450 }, { "epoch": 0.7840440165061898, "grad_norm": 0.1748579442501068, "learning_rate": 0.0001, "loss": 1.5199, "step": 28500 }, { "epoch": 0.7854195323246217, "grad_norm": 0.20172914862632751, "learning_rate": 0.0001, "loss": 1.5228, "step": 28550 }, { "epoch": 0.7867950481430537, "grad_norm": 0.1552000194787979, "learning_rate": 0.0001, "loss": 1.5205, "step": 28600 }, { "epoch": 0.7881705639614855, "grad_norm": 0.18557365238666534, "learning_rate": 0.0001, "loss": 1.5234, "step": 28650 }, { "epoch": 0.7895460797799174, "grad_norm": 0.17085815966129303, "learning_rate": 0.0001, "loss": 1.522, "step": 28700 }, { "epoch": 0.7909215955983494, "grad_norm": 0.19171683490276337, "learning_rate": 0.0001, "loss": 1.5187, "step": 28750 }, { "epoch": 0.7922971114167813, "grad_norm": 0.3197721838951111, "learning_rate": 0.0001, "loss": 1.5228, "step": 28800 }, { "epoch": 0.7936726272352133, "grad_norm": 0.21279697120189667, "learning_rate": 0.0001, "loss": 1.5181, "step": 28850 }, { "epoch": 0.7950481430536451, "grad_norm": 0.2184215933084488, "learning_rate": 0.0001, "loss": 1.5214, "step": 28900 }, { "epoch": 0.796423658872077, "grad_norm": 0.21635691821575165, "learning_rate": 0.0001, "loss": 1.523, "step": 28950 }, { "epoch": 0.797799174690509, "grad_norm": 0.15319493412971497, "learning_rate": 0.0001, "loss": 1.5197, "step": 29000 }, { "epoch": 0.7991746905089409, "grad_norm": 0.22083012759685516, "learning_rate": 0.0001, "loss": 1.5219, "step": 29050 }, { "epoch": 0.8005502063273727, "grad_norm": 0.15193097293376923, "learning_rate": 0.0001, "loss": 1.5195, "step": 29100 }, { "epoch": 0.8019257221458047, "grad_norm": 0.19553427398204803, "learning_rate": 0.0001, "loss": 1.5205, "step": 29150 }, { "epoch": 0.8033012379642366, "grad_norm": 0.2117278128862381, "learning_rate": 0.0001, "loss": 1.5203, "step": 29200 }, { "epoch": 0.8046767537826685, "grad_norm": 0.15601006150245667, "learning_rate": 0.0001, "loss": 1.5199, "step": 29250 }, { "epoch": 0.8060522696011004, "grad_norm": 0.15379014611244202, "learning_rate": 0.0001, "loss": 1.5222, "step": 29300 }, { "epoch": 0.8074277854195323, "grad_norm": 0.1712176352739334, "learning_rate": 0.0001, "loss": 1.5204, "step": 29350 }, { "epoch": 0.8088033012379643, "grad_norm": 0.19847099483013153, "learning_rate": 0.0001, "loss": 1.5203, "step": 29400 }, { "epoch": 0.8101788170563962, "grad_norm": 0.15735092759132385, "learning_rate": 0.0001, "loss": 1.5181, "step": 29450 }, { "epoch": 0.811554332874828, "grad_norm": 0.2128709852695465, "learning_rate": 0.0001, "loss": 1.52, "step": 29500 }, { "epoch": 0.81292984869326, "grad_norm": 0.23607073724269867, "learning_rate": 0.0001, "loss": 1.5222, "step": 29550 }, { "epoch": 0.8143053645116919, "grad_norm": 0.15351270139217377, "learning_rate": 0.0001, "loss": 1.5186, "step": 29600 }, { "epoch": 0.8156808803301238, "grad_norm": 0.18421980738639832, "learning_rate": 0.0001, "loss": 1.5189, "step": 29650 }, { "epoch": 0.8170563961485557, "grad_norm": 0.15863709151744843, "learning_rate": 0.0001, "loss": 1.5191, "step": 29700 }, { "epoch": 0.8184319119669876, "grad_norm": 0.1642359048128128, "learning_rate": 0.0001, "loss": 1.5188, "step": 29750 }, { "epoch": 0.8198074277854195, "grad_norm": 0.2115437388420105, "learning_rate": 0.0001, "loss": 1.5193, "step": 29800 }, { "epoch": 0.8211829436038515, "grad_norm": 0.1653752475976944, "learning_rate": 0.0001, "loss": 1.5196, "step": 29850 }, { "epoch": 0.8225584594222833, "grad_norm": 0.25687387585639954, "learning_rate": 0.0001, "loss": 1.5193, "step": 29900 }, { "epoch": 0.8239339752407153, "grad_norm": 0.22497384250164032, "learning_rate": 0.0001, "loss": 1.519, "step": 29950 }, { "epoch": 0.8253094910591472, "grad_norm": 0.16616137325763702, "learning_rate": 0.0001, "loss": 1.5204, "step": 30000 }, { "epoch": 0.8266850068775791, "grad_norm": 0.14630819857120514, "learning_rate": 0.0001, "loss": 1.5208, "step": 30050 }, { "epoch": 0.828060522696011, "grad_norm": 0.19977807998657227, "learning_rate": 0.0001, "loss": 1.5187, "step": 30100 }, { "epoch": 0.8294360385144429, "grad_norm": 0.21963287889957428, "learning_rate": 0.0001, "loss": 1.5181, "step": 30150 }, { "epoch": 0.8308115543328748, "grad_norm": 0.2047349214553833, "learning_rate": 0.0001, "loss": 1.5184, "step": 30200 }, { "epoch": 0.8321870701513068, "grad_norm": 0.1430223435163498, "learning_rate": 0.0001, "loss": 1.5187, "step": 30250 }, { "epoch": 0.8335625859697386, "grad_norm": 0.2075473666191101, "learning_rate": 0.0001, "loss": 1.5185, "step": 30300 }, { "epoch": 0.8349381017881705, "grad_norm": 0.22520440816879272, "learning_rate": 0.0001, "loss": 1.5207, "step": 30350 }, { "epoch": 0.8363136176066025, "grad_norm": 0.2137775719165802, "learning_rate": 0.0001, "loss": 1.5174, "step": 30400 }, { "epoch": 0.8376891334250344, "grad_norm": 0.1777603179216385, "learning_rate": 0.0001, "loss": 1.5189, "step": 30450 }, { "epoch": 0.8390646492434664, "grad_norm": 0.13343022763729095, "learning_rate": 0.0001, "loss": 1.5196, "step": 30500 }, { "epoch": 0.8404401650618982, "grad_norm": 0.223526269197464, "learning_rate": 0.0001, "loss": 1.5201, "step": 30550 }, { "epoch": 0.8418156808803301, "grad_norm": 0.2005707323551178, "learning_rate": 0.0001, "loss": 1.5182, "step": 30600 }, { "epoch": 0.8431911966987621, "grad_norm": 0.1620023101568222, "learning_rate": 0.0001, "loss": 1.5194, "step": 30650 }, { "epoch": 0.844566712517194, "grad_norm": 0.1359826922416687, "learning_rate": 0.0001, "loss": 1.5186, "step": 30700 }, { "epoch": 0.8459422283356258, "grad_norm": 0.23660969734191895, "learning_rate": 0.0001, "loss": 1.5208, "step": 30750 }, { "epoch": 0.8473177441540578, "grad_norm": 0.22223958373069763, "learning_rate": 0.0001, "loss": 1.5167, "step": 30800 }, { "epoch": 0.8486932599724897, "grad_norm": 0.22506959736347198, "learning_rate": 0.0001, "loss": 1.5166, "step": 30850 }, { "epoch": 0.8500687757909215, "grad_norm": 0.20386451482772827, "learning_rate": 0.0001, "loss": 1.5181, "step": 30900 }, { "epoch": 0.8514442916093535, "grad_norm": 0.21547478437423706, "learning_rate": 0.0001, "loss": 1.5184, "step": 30950 }, { "epoch": 0.8528198074277854, "grad_norm": 0.2500711977481842, "learning_rate": 0.0001, "loss": 1.5188, "step": 31000 }, { "epoch": 0.8541953232462174, "grad_norm": 0.17289701104164124, "learning_rate": 0.0001, "loss": 1.5182, "step": 31050 }, { "epoch": 0.8555708390646493, "grad_norm": 0.24792905151844025, "learning_rate": 0.0001, "loss": 1.5201, "step": 31100 }, { "epoch": 0.8569463548830811, "grad_norm": 0.16410884261131287, "learning_rate": 0.0001, "loss": 1.5191, "step": 31150 }, { "epoch": 0.8583218707015131, "grad_norm": 0.20413684844970703, "learning_rate": 0.0001, "loss": 1.5207, "step": 31200 }, { "epoch": 0.859697386519945, "grad_norm": 0.1622382253408432, "learning_rate": 0.0001, "loss": 1.5191, "step": 31250 }, { "epoch": 0.8610729023383769, "grad_norm": 0.19682924449443817, "learning_rate": 0.0001, "loss": 1.5195, "step": 31300 }, { "epoch": 0.8624484181568088, "grad_norm": 0.17585939168930054, "learning_rate": 0.0001, "loss": 1.5182, "step": 31350 }, { "epoch": 0.8638239339752407, "grad_norm": 0.3021407127380371, "learning_rate": 0.0001, "loss": 1.5177, "step": 31400 }, { "epoch": 0.8651994497936726, "grad_norm": 0.25355300307273865, "learning_rate": 0.0001, "loss": 1.5179, "step": 31450 }, { "epoch": 0.8665749656121046, "grad_norm": 0.19390764832496643, "learning_rate": 0.0001, "loss": 1.5146, "step": 31500 }, { "epoch": 0.8679504814305364, "grad_norm": 0.14198362827301025, "learning_rate": 0.0001, "loss": 1.5194, "step": 31550 }, { "epoch": 0.8693259972489684, "grad_norm": 0.21591129899024963, "learning_rate": 0.0001, "loss": 1.516, "step": 31600 }, { "epoch": 0.8707015130674003, "grad_norm": 0.142410010099411, "learning_rate": 0.0001, "loss": 1.5164, "step": 31650 }, { "epoch": 0.8720770288858322, "grad_norm": 0.14241962134838104, "learning_rate": 0.0001, "loss": 1.5144, "step": 31700 }, { "epoch": 0.8734525447042641, "grad_norm": 0.1909308135509491, "learning_rate": 0.0001, "loss": 1.5182, "step": 31750 }, { "epoch": 0.874828060522696, "grad_norm": 0.1649756282567978, "learning_rate": 0.0001, "loss": 1.5145, "step": 31800 }, { "epoch": 0.8762035763411279, "grad_norm": 0.26334628462791443, "learning_rate": 0.0001, "loss": 1.5157, "step": 31850 }, { "epoch": 0.8775790921595599, "grad_norm": 0.1725001484155655, "learning_rate": 0.0001, "loss": 1.5191, "step": 31900 }, { "epoch": 0.8789546079779917, "grad_norm": 0.18799418210983276, "learning_rate": 0.0001, "loss": 1.5171, "step": 31950 }, { "epoch": 0.8803301237964236, "grad_norm": 0.15485192835330963, "learning_rate": 0.0001, "loss": 1.5147, "step": 32000 }, { "epoch": 0.8817056396148556, "grad_norm": 0.13494554162025452, "learning_rate": 0.0001, "loss": 1.5147, "step": 32050 }, { "epoch": 0.8830811554332875, "grad_norm": 0.22909484803676605, "learning_rate": 0.0001, "loss": 1.5154, "step": 32100 }, { "epoch": 0.8844566712517193, "grad_norm": 0.2062431126832962, "learning_rate": 0.0001, "loss": 1.5135, "step": 32150 }, { "epoch": 0.8858321870701513, "grad_norm": 0.17063121497631073, "learning_rate": 0.0001, "loss": 1.517, "step": 32200 }, { "epoch": 0.8872077028885832, "grad_norm": 0.1380726397037506, "learning_rate": 0.0001, "loss": 1.5134, "step": 32250 }, { "epoch": 0.8885832187070152, "grad_norm": 0.18543638288974762, "learning_rate": 0.0001, "loss": 1.5186, "step": 32300 }, { "epoch": 0.889958734525447, "grad_norm": 0.28441041707992554, "learning_rate": 0.0001, "loss": 1.5179, "step": 32350 }, { "epoch": 0.8913342503438789, "grad_norm": 0.2097078114748001, "learning_rate": 0.0001, "loss": 1.518, "step": 32400 }, { "epoch": 0.8927097661623109, "grad_norm": 0.16976235806941986, "learning_rate": 0.0001, "loss": 1.5147, "step": 32450 }, { "epoch": 0.8940852819807428, "grad_norm": 0.20023608207702637, "learning_rate": 0.0001, "loss": 1.5209, "step": 32500 }, { "epoch": 0.8954607977991746, "grad_norm": 0.1981000006198883, "learning_rate": 0.0001, "loss": 1.5161, "step": 32550 }, { "epoch": 0.8968363136176066, "grad_norm": 0.24770237505435944, "learning_rate": 0.0001, "loss": 1.5145, "step": 32600 }, { "epoch": 0.8982118294360385, "grad_norm": 0.27108198404312134, "learning_rate": 0.0001, "loss": 1.5157, "step": 32650 }, { "epoch": 0.8995873452544704, "grad_norm": 0.21742689609527588, "learning_rate": 0.0001, "loss": 1.5176, "step": 32700 }, { "epoch": 0.9009628610729024, "grad_norm": 0.18256455659866333, "learning_rate": 0.0001, "loss": 1.5153, "step": 32750 }, { "epoch": 0.9023383768913342, "grad_norm": 0.1812065690755844, "learning_rate": 0.0001, "loss": 1.517, "step": 32800 }, { "epoch": 0.9037138927097662, "grad_norm": 0.1624094694852829, "learning_rate": 0.0001, "loss": 1.5184, "step": 32850 }, { "epoch": 0.9050894085281981, "grad_norm": 0.12931875884532928, "learning_rate": 0.0001, "loss": 1.5187, "step": 32900 }, { "epoch": 0.90646492434663, "grad_norm": 0.15731951594352722, "learning_rate": 0.0001, "loss": 1.515, "step": 32950 }, { "epoch": 0.9078404401650619, "grad_norm": 0.2222890406847, "learning_rate": 0.0001, "loss": 1.5167, "step": 33000 }, { "epoch": 0.9092159559834938, "grad_norm": 0.33150213956832886, "learning_rate": 0.0001, "loss": 1.5166, "step": 33050 }, { "epoch": 0.9105914718019257, "grad_norm": 0.27547687292099, "learning_rate": 0.0001, "loss": 1.5151, "step": 33100 }, { "epoch": 0.9119669876203577, "grad_norm": 0.1873897761106491, "learning_rate": 0.0001, "loss": 1.5132, "step": 33150 }, { "epoch": 0.9133425034387895, "grad_norm": 0.1707950383424759, "learning_rate": 0.0001, "loss": 1.5149, "step": 33200 }, { "epoch": 0.9147180192572214, "grad_norm": 0.1721598356962204, "learning_rate": 0.0001, "loss": 1.5135, "step": 33250 }, { "epoch": 0.9160935350756534, "grad_norm": 0.31545665860176086, "learning_rate": 0.0001, "loss": 1.5142, "step": 33300 }, { "epoch": 0.9174690508940853, "grad_norm": 0.19677673280239105, "learning_rate": 0.0001, "loss": 1.5114, "step": 33350 }, { "epoch": 0.9188445667125172, "grad_norm": 0.19303210079669952, "learning_rate": 0.0001, "loss": 1.5126, "step": 33400 }, { "epoch": 0.9202200825309491, "grad_norm": 0.14599211513996124, "learning_rate": 0.0001, "loss": 1.5149, "step": 33450 }, { "epoch": 0.921595598349381, "grad_norm": 0.2020881623029709, "learning_rate": 0.0001, "loss": 1.5169, "step": 33500 }, { "epoch": 0.922971114167813, "grad_norm": 0.1755484640598297, "learning_rate": 0.0001, "loss": 1.5146, "step": 33550 }, { "epoch": 0.9243466299862448, "grad_norm": 0.15174026787281036, "learning_rate": 0.0001, "loss": 1.5164, "step": 33600 }, { "epoch": 0.9257221458046767, "grad_norm": 0.21369625627994537, "learning_rate": 0.0001, "loss": 1.5161, "step": 33650 }, { "epoch": 0.9270976616231087, "grad_norm": 0.23643817007541656, "learning_rate": 0.0001, "loss": 1.5129, "step": 33700 }, { "epoch": 0.9284731774415406, "grad_norm": 0.22748377919197083, "learning_rate": 0.0001, "loss": 1.5169, "step": 33750 }, { "epoch": 0.9298486932599724, "grad_norm": 0.24398982524871826, "learning_rate": 0.0001, "loss": 1.5137, "step": 33800 }, { "epoch": 0.9312242090784044, "grad_norm": 0.16090893745422363, "learning_rate": 0.0001, "loss": 1.5126, "step": 33850 }, { "epoch": 0.9325997248968363, "grad_norm": 0.1766052097082138, "learning_rate": 0.0001, "loss": 1.5149, "step": 33900 }, { "epoch": 0.9339752407152683, "grad_norm": 0.15594764053821564, "learning_rate": 0.0001, "loss": 1.5139, "step": 33950 }, { "epoch": 0.9353507565337001, "grad_norm": 0.22842876613140106, "learning_rate": 0.0001, "loss": 1.5152, "step": 34000 }, { "epoch": 0.936726272352132, "grad_norm": 0.17382940649986267, "learning_rate": 0.0001, "loss": 1.5138, "step": 34050 }, { "epoch": 0.938101788170564, "grad_norm": 0.19100262224674225, "learning_rate": 0.0001, "loss": 1.5136, "step": 34100 }, { "epoch": 0.9394773039889959, "grad_norm": 0.13861484825611115, "learning_rate": 0.0001, "loss": 1.5118, "step": 34150 }, { "epoch": 0.9408528198074277, "grad_norm": 0.22483597695827484, "learning_rate": 0.0001, "loss": 1.5119, "step": 34200 }, { "epoch": 0.9422283356258597, "grad_norm": 0.20615430176258087, "learning_rate": 0.0001, "loss": 1.512, "step": 34250 }, { "epoch": 0.9436038514442916, "grad_norm": 0.18101869523525238, "learning_rate": 0.0001, "loss": 1.5142, "step": 34300 }, { "epoch": 0.9449793672627235, "grad_norm": 0.19411496818065643, "learning_rate": 0.0001, "loss": 1.512, "step": 34350 }, { "epoch": 0.9463548830811555, "grad_norm": 0.2966468334197998, "learning_rate": 0.0001, "loss": 1.5121, "step": 34400 }, { "epoch": 0.9477303988995873, "grad_norm": 0.2614442706108093, "learning_rate": 0.0001, "loss": 1.5127, "step": 34450 }, { "epoch": 0.9491059147180193, "grad_norm": 0.3327767252922058, "learning_rate": 0.0001, "loss": 1.5136, "step": 34500 }, { "epoch": 0.9504814305364512, "grad_norm": 0.1958717554807663, "learning_rate": 0.0001, "loss": 1.5133, "step": 34550 }, { "epoch": 0.951856946354883, "grad_norm": 0.15711049735546112, "learning_rate": 0.0001, "loss": 1.5121, "step": 34600 }, { "epoch": 0.953232462173315, "grad_norm": 0.2362435758113861, "learning_rate": 0.0001, "loss": 1.514, "step": 34650 }, { "epoch": 0.9546079779917469, "grad_norm": 0.17552147805690765, "learning_rate": 0.0001, "loss": 1.5115, "step": 34700 }, { "epoch": 0.9559834938101788, "grad_norm": 0.16898372769355774, "learning_rate": 0.0001, "loss": 1.5131, "step": 34750 }, { "epoch": 0.9573590096286108, "grad_norm": 0.18677185475826263, "learning_rate": 0.0001, "loss": 1.5146, "step": 34800 }, { "epoch": 0.9587345254470426, "grad_norm": 0.1758512556552887, "learning_rate": 0.0001, "loss": 1.5141, "step": 34850 }, { "epoch": 0.9601100412654745, "grad_norm": 0.18687918782234192, "learning_rate": 0.0001, "loss": 1.5134, "step": 34900 }, { "epoch": 0.9614855570839065, "grad_norm": 0.2375195175409317, "learning_rate": 0.0001, "loss": 1.5129, "step": 34950 }, { "epoch": 0.9628610729023384, "grad_norm": 0.24082688987255096, "learning_rate": 0.0001, "loss": 1.514, "step": 35000 }, { "epoch": 0.9642365887207703, "grad_norm": 0.2279283106327057, "learning_rate": 0.0001, "loss": 1.5129, "step": 35050 }, { "epoch": 0.9656121045392022, "grad_norm": 0.267251193523407, "learning_rate": 0.0001, "loss": 1.5139, "step": 35100 }, { "epoch": 0.9669876203576341, "grad_norm": 0.1902667135000229, "learning_rate": 0.0001, "loss": 1.5127, "step": 35150 }, { "epoch": 0.9683631361760661, "grad_norm": 0.20134538412094116, "learning_rate": 0.0001, "loss": 1.5137, "step": 35200 }, { "epoch": 0.9697386519944979, "grad_norm": 0.21791616082191467, "learning_rate": 0.0001, "loss": 1.5148, "step": 35250 }, { "epoch": 0.9711141678129298, "grad_norm": 0.2014089673757553, "learning_rate": 0.0001, "loss": 1.5135, "step": 35300 }, { "epoch": 0.9724896836313618, "grad_norm": 0.1704970896244049, "learning_rate": 0.0001, "loss": 1.5148, "step": 35350 }, { "epoch": 0.9738651994497937, "grad_norm": 0.15112122893333435, "learning_rate": 0.0001, "loss": 1.512, "step": 35400 }, { "epoch": 0.9752407152682255, "grad_norm": 0.1649782657623291, "learning_rate": 0.0001, "loss": 1.5107, "step": 35450 }, { "epoch": 0.9766162310866575, "grad_norm": 0.2087404876947403, "learning_rate": 0.0001, "loss": 1.5149, "step": 35500 }, { "epoch": 0.9779917469050894, "grad_norm": 0.2056160867214203, "learning_rate": 0.0001, "loss": 1.511, "step": 35550 }, { "epoch": 0.9793672627235214, "grad_norm": 0.2275388538837433, "learning_rate": 0.0001, "loss": 1.5147, "step": 35600 }, { "epoch": 0.9807427785419532, "grad_norm": 0.24389615654945374, "learning_rate": 0.0001, "loss": 1.5122, "step": 35650 }, { "epoch": 0.9821182943603851, "grad_norm": 0.21413607895374298, "learning_rate": 0.0001, "loss": 1.5119, "step": 35700 }, { "epoch": 0.9834938101788171, "grad_norm": 0.19716958701610565, "learning_rate": 0.0001, "loss": 1.5127, "step": 35750 }, { "epoch": 0.984869325997249, "grad_norm": 0.22444148361682892, "learning_rate": 0.0001, "loss": 1.5128, "step": 35800 }, { "epoch": 0.9862448418156808, "grad_norm": 0.15065211057662964, "learning_rate": 0.0001, "loss": 1.512, "step": 35850 }, { "epoch": 0.9876203576341128, "grad_norm": 0.3378779888153076, "learning_rate": 0.0001, "loss": 1.5108, "step": 35900 }, { "epoch": 0.9889958734525447, "grad_norm": 0.17586860060691833, "learning_rate": 0.0001, "loss": 1.5144, "step": 35950 }, { "epoch": 0.9903713892709766, "grad_norm": 0.270921915769577, "learning_rate": 0.0001, "loss": 1.5142, "step": 36000 }, { "epoch": 0.9917469050894085, "grad_norm": 0.18357771635055542, "learning_rate": 0.0001, "loss": 1.513, "step": 36050 }, { "epoch": 0.9931224209078404, "grad_norm": 0.33356377482414246, "learning_rate": 0.0001, "loss": 1.5129, "step": 36100 }, { "epoch": 0.9944979367262724, "grad_norm": 0.19254672527313232, "learning_rate": 0.0001, "loss": 1.511, "step": 36150 }, { "epoch": 0.9958734525447043, "grad_norm": 0.2596052289009094, "learning_rate": 0.0001, "loss": 1.5113, "step": 36200 }, { "epoch": 0.9972489683631361, "grad_norm": 0.3195280432701111, "learning_rate": 0.0001, "loss": 1.514, "step": 36250 }, { "epoch": 0.9986244841815681, "grad_norm": 0.2321728765964508, "learning_rate": 0.0001, "loss": 1.5121, "step": 36300 }, { "epoch": 1.0, "grad_norm": 0.2551921010017395, "learning_rate": 0.0001, "loss": 1.5127, "step": 36350 } ], "logging_steps": 50, "max_steps": 36350, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3730675995865063e+22, "train_batch_size": 2, "trial_name": null, "trial_params": null }