{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 292, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00684931506849315, "grad_norm": 0.030539813126927277, "learning_rate": 3.3333333333333333e-06, "loss": 0.1076, "step": 1 }, { "epoch": 0.0136986301369863, "grad_norm": 0.03492658058904307, "learning_rate": 6.666666666666667e-06, "loss": 0.1246, "step": 2 }, { "epoch": 0.02054794520547945, "grad_norm": 0.04865725802068095, "learning_rate": 1e-05, "loss": 0.1586, "step": 3 }, { "epoch": 0.0273972602739726, "grad_norm": 0.05113567210987629, "learning_rate": 1.3333333333333333e-05, "loss": 0.1514, "step": 4 }, { "epoch": 0.03424657534246575, "grad_norm": 0.04332482337715696, "learning_rate": 1.6666666666666667e-05, "loss": 0.1334, "step": 5 }, { "epoch": 0.0410958904109589, "grad_norm": 0.056501517819200174, "learning_rate": 2e-05, "loss": 0.162, "step": 6 }, { "epoch": 0.04794520547945205, "grad_norm": 0.02608028374746435, "learning_rate": 2.3333333333333336e-05, "loss": 0.1031, "step": 7 }, { "epoch": 0.0547945205479452, "grad_norm": 0.03653546691009632, "learning_rate": 2.6666666666666667e-05, "loss": 0.1047, "step": 8 }, { "epoch": 0.06164383561643835, "grad_norm": 0.036654425448601695, "learning_rate": 3e-05, "loss": 0.1063, "step": 9 }, { "epoch": 0.0684931506849315, "grad_norm": 0.07120435117331021, "learning_rate": 3.3333333333333335e-05, "loss": 0.1899, "step": 10 }, { "epoch": 0.07534246575342465, "grad_norm": 0.052020978283898525, "learning_rate": 3.6666666666666666e-05, "loss": 0.1369, "step": 11 }, { "epoch": 0.0821917808219178, "grad_norm": 0.05373108737583589, "learning_rate": 4e-05, "loss": 0.1232, "step": 12 }, { "epoch": 0.08904109589041095, "grad_norm": 0.04869530777109541, "learning_rate": 4.3333333333333334e-05, "loss": 0.1137, "step": 13 }, { "epoch": 0.0958904109589041, "grad_norm": 0.05909792067471564, "learning_rate": 4.666666666666667e-05, "loss": 0.1211, "step": 14 }, { "epoch": 0.10273972602739725, "grad_norm": 0.07782073062487518, "learning_rate": 5e-05, "loss": 0.1419, "step": 15 }, { "epoch": 0.1095890410958904, "grad_norm": 0.10068121599027634, "learning_rate": 5.333333333333333e-05, "loss": 0.1573, "step": 16 }, { "epoch": 0.11643835616438356, "grad_norm": 0.0794018071448985, "learning_rate": 5.666666666666667e-05, "loss": 0.1368, "step": 17 }, { "epoch": 0.1232876712328767, "grad_norm": 0.03925546794880312, "learning_rate": 6e-05, "loss": 0.0936, "step": 18 }, { "epoch": 0.13013698630136986, "grad_norm": 0.09398334893562084, "learning_rate": 6.333333333333333e-05, "loss": 0.1405, "step": 19 }, { "epoch": 0.136986301369863, "grad_norm": 0.03366472291998534, "learning_rate": 6.666666666666667e-05, "loss": 0.0939, "step": 20 }, { "epoch": 0.14383561643835616, "grad_norm": 0.03002583884373123, "learning_rate": 7e-05, "loss": 0.0857, "step": 21 }, { "epoch": 0.1506849315068493, "grad_norm": 0.04526244411827793, "learning_rate": 7.333333333333333e-05, "loss": 0.0932, "step": 22 }, { "epoch": 0.15753424657534246, "grad_norm": 0.0271467232481723, "learning_rate": 7.666666666666667e-05, "loss": 0.0926, "step": 23 }, { "epoch": 0.1643835616438356, "grad_norm": 0.036006533907184064, "learning_rate": 8e-05, "loss": 0.1092, "step": 24 }, { "epoch": 0.17123287671232876, "grad_norm": 0.03819263334586657, "learning_rate": 8.333333333333334e-05, "loss": 0.0967, "step": 25 }, { "epoch": 0.1780821917808219, "grad_norm": 0.04732574961540268, "learning_rate": 8.666666666666667e-05, "loss": 0.0933, "step": 26 }, { "epoch": 0.18493150684931506, "grad_norm": 0.04933244373512878, "learning_rate": 9e-05, "loss": 0.0964, "step": 27 }, { "epoch": 0.1917808219178082, "grad_norm": 0.05164570324383495, "learning_rate": 9.333333333333334e-05, "loss": 0.0893, "step": 28 }, { "epoch": 0.19863013698630136, "grad_norm": 0.06506455363157941, "learning_rate": 9.666666666666667e-05, "loss": 0.1099, "step": 29 }, { "epoch": 0.2054794520547945, "grad_norm": 0.05004738114863798, "learning_rate": 0.0001, "loss": 0.093, "step": 30 }, { "epoch": 0.21232876712328766, "grad_norm": 0.058372907031022096, "learning_rate": 9.999640555396404e-05, "loss": 0.0859, "step": 31 }, { "epoch": 0.2191780821917808, "grad_norm": 0.04214183460112646, "learning_rate": 9.998562273265785e-05, "loss": 0.0757, "step": 32 }, { "epoch": 0.22602739726027396, "grad_norm": 0.045590856190952045, "learning_rate": 9.996765308641218e-05, "loss": 0.0909, "step": 33 }, { "epoch": 0.2328767123287671, "grad_norm": 0.03615919142026033, "learning_rate": 9.994249919886402e-05, "loss": 0.0773, "step": 34 }, { "epoch": 0.23972602739726026, "grad_norm": 0.053734428778710715, "learning_rate": 9.991016468658499e-05, "loss": 0.0804, "step": 35 }, { "epoch": 0.2465753424657534, "grad_norm": 0.03350548900671173, "learning_rate": 9.98706541985615e-05, "loss": 0.0949, "step": 36 }, { "epoch": 0.2534246575342466, "grad_norm": 0.027043847680669336, "learning_rate": 9.98239734155262e-05, "loss": 0.0689, "step": 37 }, { "epoch": 0.2602739726027397, "grad_norm": 0.03089668501975604, "learning_rate": 9.977012904914133e-05, "loss": 0.0896, "step": 38 }, { "epoch": 0.2671232876712329, "grad_norm": 0.029975461306757812, "learning_rate": 9.970912884103364e-05, "loss": 0.0818, "step": 39 }, { "epoch": 0.273972602739726, "grad_norm": 0.029112303333024562, "learning_rate": 9.964098156168142e-05, "loss": 0.0764, "step": 40 }, { "epoch": 0.2808219178082192, "grad_norm": 0.03609528093820637, "learning_rate": 9.956569700915337e-05, "loss": 0.0896, "step": 41 }, { "epoch": 0.2876712328767123, "grad_norm": 0.02923285599907434, "learning_rate": 9.948328600769995e-05, "loss": 0.0687, "step": 42 }, { "epoch": 0.2945205479452055, "grad_norm": 0.03722889238353647, "learning_rate": 9.939376040619705e-05, "loss": 0.0904, "step": 43 }, { "epoch": 0.3013698630136986, "grad_norm": 0.032059757320644165, "learning_rate": 9.929713307644244e-05, "loss": 0.0834, "step": 44 }, { "epoch": 0.3082191780821918, "grad_norm": 0.04817854262074185, "learning_rate": 9.919341791130496e-05, "loss": 0.0913, "step": 45 }, { "epoch": 0.3150684931506849, "grad_norm": 0.027208193309115385, "learning_rate": 9.908262982272724e-05, "loss": 0.0783, "step": 46 }, { "epoch": 0.3219178082191781, "grad_norm": 0.03396529679519977, "learning_rate": 9.896478473958146e-05, "loss": 0.0788, "step": 47 }, { "epoch": 0.3287671232876712, "grad_norm": 0.03703807434511364, "learning_rate": 9.883989960537933e-05, "loss": 0.0737, "step": 48 }, { "epoch": 0.3356164383561644, "grad_norm": 0.037750701903354035, "learning_rate": 9.870799237583587e-05, "loss": 0.0879, "step": 49 }, { "epoch": 0.3424657534246575, "grad_norm": 0.0421005656051737, "learning_rate": 9.85690820162878e-05, "loss": 0.0762, "step": 50 }, { "epoch": 0.3493150684931507, "grad_norm": 0.02733701424368454, "learning_rate": 9.842318849896679e-05, "loss": 0.0716, "step": 51 }, { "epoch": 0.3561643835616438, "grad_norm": 0.027664399030788704, "learning_rate": 9.827033280012783e-05, "loss": 0.0783, "step": 52 }, { "epoch": 0.363013698630137, "grad_norm": 0.030867201947854212, "learning_rate": 9.811053689703334e-05, "loss": 0.0895, "step": 53 }, { "epoch": 0.3698630136986301, "grad_norm": 0.02445456343322928, "learning_rate": 9.794382376479334e-05, "loss": 0.0669, "step": 54 }, { "epoch": 0.3767123287671233, "grad_norm": 0.0355725180623145, "learning_rate": 9.777021737306214e-05, "loss": 0.0758, "step": 55 }, { "epoch": 0.3835616438356164, "grad_norm": 0.02787477968018966, "learning_rate": 9.7589742682592e-05, "loss": 0.0671, "step": 56 }, { "epoch": 0.3904109589041096, "grad_norm": 0.02917250086551862, "learning_rate": 9.740242564164434e-05, "loss": 0.0777, "step": 57 }, { "epoch": 0.3972602739726027, "grad_norm": 0.027367394927186293, "learning_rate": 9.720829318225897e-05, "loss": 0.0784, "step": 58 }, { "epoch": 0.4041095890410959, "grad_norm": 0.03404049628582292, "learning_rate": 9.700737321638185e-05, "loss": 0.0831, "step": 59 }, { "epoch": 0.410958904109589, "grad_norm": 0.03007296844683444, "learning_rate": 9.6799694631852e-05, "loss": 0.0666, "step": 60 }, { "epoch": 0.4178082191780822, "grad_norm": 0.02540452547965842, "learning_rate": 9.6585287288248e-05, "loss": 0.0661, "step": 61 }, { "epoch": 0.4246575342465753, "grad_norm": 0.026781782354160222, "learning_rate": 9.63641820125949e-05, "loss": 0.074, "step": 62 }, { "epoch": 0.4315068493150685, "grad_norm": 0.026783892217477907, "learning_rate": 9.613641059493197e-05, "loss": 0.066, "step": 63 }, { "epoch": 0.4383561643835616, "grad_norm": 0.03124681883629973, "learning_rate": 9.590200578374198e-05, "loss": 0.0704, "step": 64 }, { "epoch": 0.4452054794520548, "grad_norm": 0.03417198666061255, "learning_rate": 9.56610012812427e-05, "loss": 0.0809, "step": 65 }, { "epoch": 0.4520547945205479, "grad_norm": 0.03459134201897623, "learning_rate": 9.541343173854127e-05, "loss": 0.0775, "step": 66 }, { "epoch": 0.4589041095890411, "grad_norm": 0.028846468217507328, "learning_rate": 9.515933275065219e-05, "loss": 0.0683, "step": 67 }, { "epoch": 0.4657534246575342, "grad_norm": 0.02841488216076331, "learning_rate": 9.48987408513794e-05, "loss": 0.0682, "step": 68 }, { "epoch": 0.4726027397260274, "grad_norm": 0.038470702709475786, "learning_rate": 9.463169350806369e-05, "loss": 0.1041, "step": 69 }, { "epoch": 0.4794520547945205, "grad_norm": 0.05495636003850682, "learning_rate": 9.435822911619564e-05, "loss": 0.097, "step": 70 }, { "epoch": 0.4863013698630137, "grad_norm": 0.03142706228563173, "learning_rate": 9.407838699389524e-05, "loss": 0.0844, "step": 71 }, { "epoch": 0.4931506849315068, "grad_norm": 0.02769595486440407, "learning_rate": 9.379220737625877e-05, "loss": 0.0648, "step": 72 }, { "epoch": 0.5, "grad_norm": 0.029889370715744733, "learning_rate": 9.34997314095739e-05, "loss": 0.0668, "step": 73 }, { "epoch": 0.5068493150684932, "grad_norm": 0.036383935533411724, "learning_rate": 9.320100114540382e-05, "loss": 0.0676, "step": 74 }, { "epoch": 0.5136986301369864, "grad_norm": 0.05144060226911856, "learning_rate": 9.289605953454107e-05, "loss": 0.0834, "step": 75 }, { "epoch": 0.5205479452054794, "grad_norm": 0.029231837833789662, "learning_rate": 9.258495042083221e-05, "loss": 0.067, "step": 76 }, { "epoch": 0.5273972602739726, "grad_norm": 0.030385743404608083, "learning_rate": 9.22677185348741e-05, "loss": 0.0809, "step": 77 }, { "epoch": 0.5342465753424658, "grad_norm": 0.031195055928310497, "learning_rate": 9.19444094875825e-05, "loss": 0.0735, "step": 78 }, { "epoch": 0.541095890410959, "grad_norm": 0.029330760873632865, "learning_rate": 9.161506976363437e-05, "loss": 0.0736, "step": 79 }, { "epoch": 0.547945205479452, "grad_norm": 0.0332541385460945, "learning_rate": 9.127974671478432e-05, "loss": 0.0812, "step": 80 }, { "epoch": 0.5547945205479452, "grad_norm": 0.03305866284735081, "learning_rate": 9.093848855305649e-05, "loss": 0.0822, "step": 81 }, { "epoch": 0.5616438356164384, "grad_norm": 0.03432894975247522, "learning_rate": 9.059134434381273e-05, "loss": 0.0783, "step": 82 }, { "epoch": 0.5684931506849316, "grad_norm": 0.039182983364365015, "learning_rate": 9.023836399869814e-05, "loss": 0.0815, "step": 83 }, { "epoch": 0.5753424657534246, "grad_norm": 0.02912164254908111, "learning_rate": 8.98795982684648e-05, "loss": 0.0715, "step": 84 }, { "epoch": 0.5821917808219178, "grad_norm": 0.029959446654787666, "learning_rate": 8.951509873567499e-05, "loss": 0.0754, "step": 85 }, { "epoch": 0.589041095890411, "grad_norm": 0.02955747281592669, "learning_rate": 8.914491780728471e-05, "loss": 0.0683, "step": 86 }, { "epoch": 0.5958904109589042, "grad_norm": 0.03207481946898706, "learning_rate": 8.876910870710884e-05, "loss": 0.0725, "step": 87 }, { "epoch": 0.6027397260273972, "grad_norm": 0.024590915698935577, "learning_rate": 8.838772546816856e-05, "loss": 0.0569, "step": 88 }, { "epoch": 0.6095890410958904, "grad_norm": 0.030750251732919592, "learning_rate": 8.800082292492273e-05, "loss": 0.0732, "step": 89 }, { "epoch": 0.6164383561643836, "grad_norm": 0.032945297540624904, "learning_rate": 8.760845670538387e-05, "loss": 0.0787, "step": 90 }, { "epoch": 0.6232876712328768, "grad_norm": 0.03778017209196359, "learning_rate": 8.721068322312007e-05, "loss": 0.0835, "step": 91 }, { "epoch": 0.6301369863013698, "grad_norm": 0.03507952393071719, "learning_rate": 8.680755966914401e-05, "loss": 0.0785, "step": 92 }, { "epoch": 0.636986301369863, "grad_norm": 0.03248195299567112, "learning_rate": 8.639914400369009e-05, "loss": 0.0769, "step": 93 }, { "epoch": 0.6438356164383562, "grad_norm": 0.03731653007913649, "learning_rate": 8.598549494788111e-05, "loss": 0.0863, "step": 94 }, { "epoch": 0.6506849315068494, "grad_norm": 0.03180106946934432, "learning_rate": 8.556667197528543e-05, "loss": 0.0601, "step": 95 }, { "epoch": 0.6575342465753424, "grad_norm": 0.03375769439746087, "learning_rate": 8.5142735303366e-05, "loss": 0.0833, "step": 96 }, { "epoch": 0.6643835616438356, "grad_norm": 0.03122748644136327, "learning_rate": 8.47137458848224e-05, "loss": 0.0653, "step": 97 }, { "epoch": 0.6712328767123288, "grad_norm": 0.030566834172214723, "learning_rate": 8.427976539882724e-05, "loss": 0.0794, "step": 98 }, { "epoch": 0.678082191780822, "grad_norm": 0.041027373975817134, "learning_rate": 8.384085624215801e-05, "loss": 0.094, "step": 99 }, { "epoch": 0.684931506849315, "grad_norm": 0.02698160042089123, "learning_rate": 8.339708152022585e-05, "loss": 0.06, "step": 100 }, { "epoch": 0.6917808219178082, "grad_norm": 0.026358548855895562, "learning_rate": 8.294850503800238e-05, "loss": 0.0586, "step": 101 }, { "epoch": 0.6986301369863014, "grad_norm": 0.029315218243026478, "learning_rate": 8.24951912908459e-05, "loss": 0.0659, "step": 102 }, { "epoch": 0.7054794520547946, "grad_norm": 0.03730243641514126, "learning_rate": 8.203720545522853e-05, "loss": 0.0768, "step": 103 }, { "epoch": 0.7123287671232876, "grad_norm": 0.028421475360968528, "learning_rate": 8.157461337936506e-05, "loss": 0.0674, "step": 104 }, { "epoch": 0.7191780821917808, "grad_norm": 0.036896447275521424, "learning_rate": 8.110748157374565e-05, "loss": 0.0813, "step": 105 }, { "epoch": 0.726027397260274, "grad_norm": 0.03194136933101962, "learning_rate": 8.063587720157298e-05, "loss": 0.0661, "step": 106 }, { "epoch": 0.7328767123287672, "grad_norm": 0.047606415740070754, "learning_rate": 8.01598680691057e-05, "loss": 0.0922, "step": 107 }, { "epoch": 0.7397260273972602, "grad_norm": 0.03982021001322296, "learning_rate": 7.967952261590935e-05, "loss": 0.0677, "step": 108 }, { "epoch": 0.7465753424657534, "grad_norm": 0.030844190311457055, "learning_rate": 7.919490990501636e-05, "loss": 0.0693, "step": 109 }, { "epoch": 0.7534246575342466, "grad_norm": 0.03128080580199763, "learning_rate": 7.870609961299627e-05, "loss": 0.055, "step": 110 }, { "epoch": 0.7602739726027398, "grad_norm": 0.03042131745977429, "learning_rate": 7.821316201993767e-05, "loss": 0.0696, "step": 111 }, { "epoch": 0.7671232876712328, "grad_norm": 0.03086792896956372, "learning_rate": 7.771616799934371e-05, "loss": 0.0717, "step": 112 }, { "epoch": 0.773972602739726, "grad_norm": 0.03186550616403287, "learning_rate": 7.721518900794185e-05, "loss": 0.0721, "step": 113 }, { "epoch": 0.7808219178082192, "grad_norm": 0.029517902212737826, "learning_rate": 7.67102970754101e-05, "loss": 0.068, "step": 114 }, { "epoch": 0.7876712328767124, "grad_norm": 0.03470530857876834, "learning_rate": 7.620156479402066e-05, "loss": 0.0687, "step": 115 }, { "epoch": 0.7945205479452054, "grad_norm": 0.032497402299353856, "learning_rate": 7.568906530820282e-05, "loss": 0.0749, "step": 116 }, { "epoch": 0.8013698630136986, "grad_norm": 0.03286496346308627, "learning_rate": 7.517287230402639e-05, "loss": 0.0769, "step": 117 }, { "epoch": 0.8082191780821918, "grad_norm": 0.029927285174734974, "learning_rate": 7.465305999860728e-05, "loss": 0.0681, "step": 118 }, { "epoch": 0.815068493150685, "grad_norm": 0.03427786976795219, "learning_rate": 7.412970312943671e-05, "loss": 0.0777, "step": 119 }, { "epoch": 0.821917808219178, "grad_norm": 0.031851819549739036, "learning_rate": 7.360287694363566e-05, "loss": 0.0653, "step": 120 }, { "epoch": 0.8287671232876712, "grad_norm": 0.03126182717380993, "learning_rate": 7.30726571871359e-05, "loss": 0.0741, "step": 121 }, { "epoch": 0.8356164383561644, "grad_norm": 0.03232123010558018, "learning_rate": 7.253912009378953e-05, "loss": 0.0622, "step": 122 }, { "epoch": 0.8424657534246576, "grad_norm": 0.031930228246767285, "learning_rate": 7.200234237440815e-05, "loss": 0.0711, "step": 123 }, { "epoch": 0.8493150684931506, "grad_norm": 0.03869729803850467, "learning_rate": 7.146240120573358e-05, "loss": 0.0748, "step": 124 }, { "epoch": 0.8561643835616438, "grad_norm": 0.04668355562414557, "learning_rate": 7.091937421934157e-05, "loss": 0.0719, "step": 125 }, { "epoch": 0.863013698630137, "grad_norm": 0.045196133927728935, "learning_rate": 7.037333949048005e-05, "loss": 0.0801, "step": 126 }, { "epoch": 0.8698630136986302, "grad_norm": 0.03140632320538256, "learning_rate": 6.98243755268437e-05, "loss": 0.0626, "step": 127 }, { "epoch": 0.8767123287671232, "grad_norm": 0.052496609882496145, "learning_rate": 6.927256125728624e-05, "loss": 0.0777, "step": 128 }, { "epoch": 0.8835616438356164, "grad_norm": 0.03868077359004108, "learning_rate": 6.87179760204722e-05, "loss": 0.0791, "step": 129 }, { "epoch": 0.8904109589041096, "grad_norm": 0.0348172699895961, "learning_rate": 6.816069955346985e-05, "loss": 0.0743, "step": 130 }, { "epoch": 0.8972602739726028, "grad_norm": 0.029728370506506084, "learning_rate": 6.760081198028671e-05, "loss": 0.062, "step": 131 }, { "epoch": 0.9041095890410958, "grad_norm": 0.033304716743838475, "learning_rate": 6.703839380034946e-05, "loss": 0.0742, "step": 132 }, { "epoch": 0.910958904109589, "grad_norm": 0.03402220621737934, "learning_rate": 6.647352587693001e-05, "loss": 0.074, "step": 133 }, { "epoch": 0.9178082191780822, "grad_norm": 0.03029297413232278, "learning_rate": 6.590628942551909e-05, "loss": 0.0706, "step": 134 }, { "epoch": 0.9246575342465754, "grad_norm": 0.03726620583459937, "learning_rate": 6.533676600214928e-05, "loss": 0.0711, "step": 135 }, { "epoch": 0.9315068493150684, "grad_norm": 0.030791726162161644, "learning_rate": 6.476503749166904e-05, "loss": 0.0844, "step": 136 }, { "epoch": 0.9383561643835616, "grad_norm": 0.03101593164912545, "learning_rate": 6.419118609596948e-05, "loss": 0.0743, "step": 137 }, { "epoch": 0.9452054794520548, "grad_norm": 0.037483812819514725, "learning_rate": 6.361529432216559e-05, "loss": 0.0888, "step": 138 }, { "epoch": 0.952054794520548, "grad_norm": 0.034654276627226706, "learning_rate": 6.303744497073352e-05, "loss": 0.0662, "step": 139 }, { "epoch": 0.958904109589041, "grad_norm": 0.10049031991880263, "learning_rate": 6.245772112360568e-05, "loss": 0.0817, "step": 140 }, { "epoch": 0.9657534246575342, "grad_norm": 0.03537251059587243, "learning_rate": 6.187620613222544e-05, "loss": 0.0768, "step": 141 }, { "epoch": 0.9726027397260274, "grad_norm": 0.04616032239868132, "learning_rate": 6.129298360556304e-05, "loss": 0.0802, "step": 142 }, { "epoch": 0.9794520547945206, "grad_norm": 0.03412880356569236, "learning_rate": 6.070813739809442e-05, "loss": 0.0574, "step": 143 }, { "epoch": 0.9863013698630136, "grad_norm": 0.02909505969006464, "learning_rate": 6.012175159774488e-05, "loss": 0.0604, "step": 144 }, { "epoch": 0.9931506849315068, "grad_norm": 0.03935910262269513, "learning_rate": 5.953391051379904e-05, "loss": 0.0584, "step": 145 }, { "epoch": 1.0, "grad_norm": 0.03397983912672022, "learning_rate": 5.894469866477905e-05, "loss": 0.0577, "step": 146 }, { "epoch": 1.0068493150684932, "grad_norm": 0.0311452791456704, "learning_rate": 5.8354200766292734e-05, "loss": 0.079, "step": 147 }, { "epoch": 1.0136986301369864, "grad_norm": 0.033811357449414096, "learning_rate": 5.776250171885329e-05, "loss": 0.0656, "step": 148 }, { "epoch": 1.0205479452054795, "grad_norm": 0.040640212988149904, "learning_rate": 5.716968659567256e-05, "loss": 0.0728, "step": 149 }, { "epoch": 1.0273972602739727, "grad_norm": 0.03057953268248259, "learning_rate": 5.6575840630429286e-05, "loss": 0.0733, "step": 150 }, { "epoch": 1.0342465753424657, "grad_norm": 0.032703562266556005, "learning_rate": 5.5981049205014546e-05, "loss": 0.0712, "step": 151 }, { "epoch": 1.0410958904109588, "grad_norm": 0.03210664170957404, "learning_rate": 5.5385397837255556e-05, "loss": 0.0691, "step": 152 }, { "epoch": 1.047945205479452, "grad_norm": 0.03471378475737469, "learning_rate": 5.4788972168620255e-05, "loss": 0.0757, "step": 153 }, { "epoch": 1.0547945205479452, "grad_norm": 0.03165599022030969, "learning_rate": 5.4191857951903826e-05, "loss": 0.0597, "step": 154 }, { "epoch": 1.0616438356164384, "grad_norm": 0.05451608731514093, "learning_rate": 5.359414103889947e-05, "loss": 0.0894, "step": 155 }, { "epoch": 1.0684931506849316, "grad_norm": 0.04358798536487321, "learning_rate": 5.29959073680547e-05, "loss": 0.0782, "step": 156 }, { "epoch": 1.0753424657534247, "grad_norm": 0.032277506655898194, "learning_rate": 5.239724295211541e-05, "loss": 0.0704, "step": 157 }, { "epoch": 1.0821917808219177, "grad_norm": 0.028670588244485373, "learning_rate": 5.179823386575907e-05, "loss": 0.0507, "step": 158 }, { "epoch": 1.0890410958904109, "grad_norm": 0.03036746716980511, "learning_rate": 5.119896623321909e-05, "loss": 0.0527, "step": 159 }, { "epoch": 1.095890410958904, "grad_norm": 0.03329805529241328, "learning_rate": 5.059952621590216e-05, "loss": 0.0623, "step": 160 }, { "epoch": 1.1027397260273972, "grad_norm": 0.037603460102786174, "learning_rate": 5e-05, "loss": 0.0695, "step": 161 }, { "epoch": 1.1095890410958904, "grad_norm": 0.04120763652112312, "learning_rate": 4.940047378409786e-05, "loss": 0.0711, "step": 162 }, { "epoch": 1.1164383561643836, "grad_norm": 0.0415019727955413, "learning_rate": 4.880103376678092e-05, "loss": 0.0638, "step": 163 }, { "epoch": 1.1232876712328768, "grad_norm": 0.032850408356261955, "learning_rate": 4.820176613424095e-05, "loss": 0.0502, "step": 164 }, { "epoch": 1.13013698630137, "grad_norm": 0.051682446059570064, "learning_rate": 4.7602757047884595e-05, "loss": 0.0658, "step": 165 }, { "epoch": 1.1369863013698631, "grad_norm": 0.03126729469626143, "learning_rate": 4.700409263194531e-05, "loss": 0.0506, "step": 166 }, { "epoch": 1.143835616438356, "grad_norm": 0.03940644676794397, "learning_rate": 4.640585896110054e-05, "loss": 0.0632, "step": 167 }, { "epoch": 1.1506849315068493, "grad_norm": 0.04126874600149052, "learning_rate": 4.580814204809618e-05, "loss": 0.0813, "step": 168 }, { "epoch": 1.1575342465753424, "grad_norm": 0.04156579483287442, "learning_rate": 4.5211027831379757e-05, "loss": 0.0652, "step": 169 }, { "epoch": 1.1643835616438356, "grad_norm": 0.03391355375732456, "learning_rate": 4.461460216274445e-05, "loss": 0.057, "step": 170 }, { "epoch": 1.1712328767123288, "grad_norm": 0.038632199419011234, "learning_rate": 4.401895079498547e-05, "loss": 0.0661, "step": 171 }, { "epoch": 1.178082191780822, "grad_norm": 0.03708167445939923, "learning_rate": 4.3424159369570725e-05, "loss": 0.0624, "step": 172 }, { "epoch": 1.1849315068493151, "grad_norm": 0.03753946698450887, "learning_rate": 4.283031340432747e-05, "loss": 0.0612, "step": 173 }, { "epoch": 1.191780821917808, "grad_norm": 0.03537905440706239, "learning_rate": 4.223749828114672e-05, "loss": 0.0673, "step": 174 }, { "epoch": 1.1986301369863013, "grad_norm": 0.03940327075341433, "learning_rate": 4.1645799233707284e-05, "loss": 0.0607, "step": 175 }, { "epoch": 1.2054794520547945, "grad_norm": 0.041138452676237995, "learning_rate": 4.1055301335220955e-05, "loss": 0.0757, "step": 176 }, { "epoch": 1.2123287671232876, "grad_norm": 0.03578703324326037, "learning_rate": 4.0466089486200976e-05, "loss": 0.0707, "step": 177 }, { "epoch": 1.2191780821917808, "grad_norm": 0.039832365736742495, "learning_rate": 3.987824840225512e-05, "loss": 0.071, "step": 178 }, { "epoch": 1.226027397260274, "grad_norm": 0.038723411367421474, "learning_rate": 3.9291862601905595e-05, "loss": 0.0728, "step": 179 }, { "epoch": 1.2328767123287672, "grad_norm": 0.04498308725984581, "learning_rate": 3.870701639443698e-05, "loss": 0.0703, "step": 180 }, { "epoch": 1.2397260273972603, "grad_norm": 0.031865141682931065, "learning_rate": 3.812379386777457e-05, "loss": 0.0523, "step": 181 }, { "epoch": 1.2465753424657535, "grad_norm": 0.038277951507247616, "learning_rate": 3.7542278876394336e-05, "loss": 0.06, "step": 182 }, { "epoch": 1.2534246575342465, "grad_norm": 0.0650327431598633, "learning_rate": 3.696255502926649e-05, "loss": 0.0586, "step": 183 }, { "epoch": 1.2602739726027397, "grad_norm": 0.041491341553849696, "learning_rate": 3.638470567783442e-05, "loss": 0.0741, "step": 184 }, { "epoch": 1.2671232876712328, "grad_norm": 0.03898025654685055, "learning_rate": 3.580881390403052e-05, "loss": 0.07, "step": 185 }, { "epoch": 1.273972602739726, "grad_norm": 0.03741545061342083, "learning_rate": 3.5234962508330974e-05, "loss": 0.0648, "step": 186 }, { "epoch": 1.2808219178082192, "grad_norm": 0.03579074437335611, "learning_rate": 3.466323399785072e-05, "loss": 0.0519, "step": 187 }, { "epoch": 1.2876712328767124, "grad_norm": 0.03622522476818763, "learning_rate": 3.409371057448092e-05, "loss": 0.0581, "step": 188 }, { "epoch": 1.2945205479452055, "grad_norm": 0.05446559861941158, "learning_rate": 3.352647412307002e-05, "loss": 0.0714, "step": 189 }, { "epoch": 1.3013698630136985, "grad_norm": 0.03832700455471586, "learning_rate": 3.296160619965056e-05, "loss": 0.055, "step": 190 }, { "epoch": 1.308219178082192, "grad_norm": 0.03896050037792126, "learning_rate": 3.239918801971332e-05, "loss": 0.062, "step": 191 }, { "epoch": 1.3150684931506849, "grad_norm": 0.03267778522774074, "learning_rate": 3.183930044653014e-05, "loss": 0.0546, "step": 192 }, { "epoch": 1.321917808219178, "grad_norm": 0.04339482297121572, "learning_rate": 3.1282023979527805e-05, "loss": 0.0676, "step": 193 }, { "epoch": 1.3287671232876712, "grad_norm": 0.04149269813272275, "learning_rate": 3.072743874271376e-05, "loss": 0.0576, "step": 194 }, { "epoch": 1.3356164383561644, "grad_norm": 0.05669408827237824, "learning_rate": 3.0175624473156316e-05, "loss": 0.0559, "step": 195 }, { "epoch": 1.3424657534246576, "grad_norm": 0.04316637345791664, "learning_rate": 2.962666050951997e-05, "loss": 0.067, "step": 196 }, { "epoch": 1.3493150684931507, "grad_norm": 0.04179017639942075, "learning_rate": 2.9080625780658455e-05, "loss": 0.0585, "step": 197 }, { "epoch": 1.356164383561644, "grad_norm": 0.04086829863785544, "learning_rate": 2.853759879426644e-05, "loss": 0.0637, "step": 198 }, { "epoch": 1.3630136986301369, "grad_norm": 0.045447600350027706, "learning_rate": 2.7997657625591867e-05, "loss": 0.0578, "step": 199 }, { "epoch": 1.36986301369863, "grad_norm": 0.04063023817685305, "learning_rate": 2.7460879906210487e-05, "loss": 0.0704, "step": 200 }, { "epoch": 1.36986301369863, "eval_loss": 0.07383698970079422, "eval_runtime": 6.4837, "eval_samples_per_second": 0.925, "eval_steps_per_second": 0.308, "step": 200 }, { "epoch": 1.3767123287671232, "grad_norm": 0.043841411312286985, "learning_rate": 2.6927342812864116e-05, "loss": 0.0667, "step": 201 }, { "epoch": 1.3835616438356164, "grad_norm": 0.04301017602851657, "learning_rate": 2.6397123056364365e-05, "loss": 0.0726, "step": 202 }, { "epoch": 1.3904109589041096, "grad_norm": 0.042839496066386186, "learning_rate": 2.5870296870563286e-05, "loss": 0.0602, "step": 203 }, { "epoch": 1.3972602739726028, "grad_norm": 0.03829564126132437, "learning_rate": 2.5346940001392728e-05, "loss": 0.0645, "step": 204 }, { "epoch": 1.404109589041096, "grad_norm": 0.03507625071131985, "learning_rate": 2.482712769597363e-05, "loss": 0.0543, "step": 205 }, { "epoch": 1.410958904109589, "grad_norm": 0.03743875489488463, "learning_rate": 2.4310934691797203e-05, "loss": 0.0602, "step": 206 }, { "epoch": 1.4178082191780823, "grad_norm": 0.037325448164000144, "learning_rate": 2.379843520597937e-05, "loss": 0.056, "step": 207 }, { "epoch": 1.4246575342465753, "grad_norm": 0.04519550162385289, "learning_rate": 2.3289702924589914e-05, "loss": 0.0823, "step": 208 }, { "epoch": 1.4315068493150684, "grad_norm": 0.041415718432946946, "learning_rate": 2.2784810992058154e-05, "loss": 0.069, "step": 209 }, { "epoch": 1.4383561643835616, "grad_norm": 0.04615134339463282, "learning_rate": 2.22838320006563e-05, "loss": 0.0709, "step": 210 }, { "epoch": 1.4452054794520548, "grad_norm": 0.03508689289745425, "learning_rate": 2.1786837980062342e-05, "loss": 0.0562, "step": 211 }, { "epoch": 1.452054794520548, "grad_norm": 0.042082412191430475, "learning_rate": 2.129390038700374e-05, "loss": 0.0711, "step": 212 }, { "epoch": 1.4589041095890412, "grad_norm": 0.04921460988289041, "learning_rate": 2.0805090094983636e-05, "loss": 0.066, "step": 213 }, { "epoch": 1.4657534246575343, "grad_norm": 0.03476195821739949, "learning_rate": 2.0320477384090665e-05, "loss": 0.0516, "step": 214 }, { "epoch": 1.4726027397260273, "grad_norm": 0.03974475045265387, "learning_rate": 1.9840131930894333e-05, "loss": 0.0507, "step": 215 }, { "epoch": 1.4794520547945205, "grad_norm": 0.04495458260995571, "learning_rate": 1.936412279842705e-05, "loss": 0.071, "step": 216 }, { "epoch": 1.4863013698630136, "grad_norm": 0.032618789951219244, "learning_rate": 1.8892518426254364e-05, "loss": 0.0487, "step": 217 }, { "epoch": 1.4931506849315068, "grad_norm": 0.05233338159738159, "learning_rate": 1.842538662063496e-05, "loss": 0.0779, "step": 218 }, { "epoch": 1.5, "grad_norm": 0.043093246770756745, "learning_rate": 1.7962794544771477e-05, "loss": 0.0731, "step": 219 }, { "epoch": 1.5068493150684932, "grad_norm": 0.04778965621357637, "learning_rate": 1.7504808709154104e-05, "loss": 0.0767, "step": 220 }, { "epoch": 1.5136986301369864, "grad_norm": 0.04225455780801827, "learning_rate": 1.705149496199762e-05, "loss": 0.065, "step": 221 }, { "epoch": 1.5205479452054793, "grad_norm": 0.03777201213330916, "learning_rate": 1.6602918479774148e-05, "loss": 0.0587, "step": 222 }, { "epoch": 1.5273972602739727, "grad_norm": 0.050670841355552755, "learning_rate": 1.6159143757842004e-05, "loss": 0.0694, "step": 223 }, { "epoch": 1.5342465753424657, "grad_norm": 0.04169608777354125, "learning_rate": 1.5720234601172766e-05, "loss": 0.0622, "step": 224 }, { "epoch": 1.541095890410959, "grad_norm": 0.0399325959403793, "learning_rate": 1.5286254115177623e-05, "loss": 0.0559, "step": 225 }, { "epoch": 1.547945205479452, "grad_norm": 0.04347771997206968, "learning_rate": 1.485726469663401e-05, "loss": 0.0625, "step": 226 }, { "epoch": 1.5547945205479452, "grad_norm": 0.04329191462577287, "learning_rate": 1.4433328024714581e-05, "loss": 0.0682, "step": 227 }, { "epoch": 1.5616438356164384, "grad_norm": 0.045935945373833685, "learning_rate": 1.4014505052118892e-05, "loss": 0.063, "step": 228 }, { "epoch": 1.5684931506849316, "grad_norm": 0.03953171472704261, "learning_rate": 1.3600855996309935e-05, "loss": 0.0621, "step": 229 }, { "epoch": 1.5753424657534247, "grad_norm": 0.042842743444326405, "learning_rate": 1.3192440330856004e-05, "loss": 0.0584, "step": 230 }, { "epoch": 1.5821917808219177, "grad_norm": 0.04147136350311881, "learning_rate": 1.2789316776879939e-05, "loss": 0.0571, "step": 231 }, { "epoch": 1.589041095890411, "grad_norm": 0.041096032631508496, "learning_rate": 1.2391543294616147e-05, "loss": 0.0569, "step": 232 }, { "epoch": 1.595890410958904, "grad_norm": 0.04826360135309353, "learning_rate": 1.1999177075077278e-05, "loss": 0.0749, "step": 233 }, { "epoch": 1.6027397260273972, "grad_norm": 0.034935721211031806, "learning_rate": 1.1612274531831463e-05, "loss": 0.0514, "step": 234 }, { "epoch": 1.6095890410958904, "grad_norm": 0.04282701283898458, "learning_rate": 1.123089129289117e-05, "loss": 0.0671, "step": 235 }, { "epoch": 1.6164383561643836, "grad_norm": 0.05961503389381486, "learning_rate": 1.0855082192715294e-05, "loss": 0.0665, "step": 236 }, { "epoch": 1.6232876712328768, "grad_norm": 0.06432557444245549, "learning_rate": 1.0484901264325025e-05, "loss": 0.0755, "step": 237 }, { "epoch": 1.6301369863013697, "grad_norm": 0.04253716073284135, "learning_rate": 1.0120401731535212e-05, "loss": 0.0733, "step": 238 }, { "epoch": 1.6369863013698631, "grad_norm": 0.0774475760401046, "learning_rate": 9.761636001301871e-06, "loss": 0.065, "step": 239 }, { "epoch": 1.643835616438356, "grad_norm": 0.03996033149580056, "learning_rate": 9.408655656187282e-06, "loss": 0.065, "step": 240 }, { "epoch": 1.6506849315068495, "grad_norm": 0.03785205141208416, "learning_rate": 9.061511446943533e-06, "loss": 0.0548, "step": 241 }, { "epoch": 1.6575342465753424, "grad_norm": 0.03850363195978434, "learning_rate": 8.720253285215685e-06, "loss": 0.0587, "step": 242 }, { "epoch": 1.6643835616438356, "grad_norm": 0.0469269115044975, "learning_rate": 8.384930236365629e-06, "loss": 0.0634, "step": 243 }, { "epoch": 1.6712328767123288, "grad_norm": 0.0498361427852067, "learning_rate": 8.0555905124175e-06, "loss": 0.051, "step": 244 }, { "epoch": 1.678082191780822, "grad_norm": 0.04702037163211539, "learning_rate": 7.732281465125907e-06, "loss": 0.0677, "step": 245 }, { "epoch": 1.6849315068493151, "grad_norm": 0.04724045679820981, "learning_rate": 7.415049579167782e-06, "loss": 0.0731, "step": 246 }, { "epoch": 1.691780821917808, "grad_norm": 0.04960546509274232, "learning_rate": 7.103940465458936e-06, "loss": 0.0777, "step": 247 }, { "epoch": 1.6986301369863015, "grad_norm": 0.04749987160293405, "learning_rate": 6.798998854596189e-06, "loss": 0.0531, "step": 248 }, { "epoch": 1.7054794520547945, "grad_norm": 0.04656027534681338, "learning_rate": 6.500268590426106e-06, "loss": 0.0547, "step": 249 }, { "epoch": 1.7123287671232876, "grad_norm": 0.043116663189771226, "learning_rate": 6.207792623741249e-06, "loss": 0.0677, "step": 250 }, { "epoch": 1.7191780821917808, "grad_norm": 0.04714594190332252, "learning_rate": 5.9216130061047646e-06, "loss": 0.068, "step": 251 }, { "epoch": 1.726027397260274, "grad_norm": 0.0442482453639658, "learning_rate": 5.641770883804365e-06, "loss": 0.0602, "step": 252 }, { "epoch": 1.7328767123287672, "grad_norm": 0.044456632129314805, "learning_rate": 5.368306491936325e-06, "loss": 0.0661, "step": 253 }, { "epoch": 1.7397260273972601, "grad_norm": 0.04864641601498424, "learning_rate": 5.101259148620619e-06, "loss": 0.0676, "step": 254 }, { "epoch": 1.7465753424657535, "grad_norm": 0.043784015189862524, "learning_rate": 4.840667249347824e-06, "loss": 0.0669, "step": 255 }, { "epoch": 1.7534246575342465, "grad_norm": 0.043216466288645235, "learning_rate": 4.586568261458729e-06, "loss": 0.0644, "step": 256 }, { "epoch": 1.7602739726027399, "grad_norm": 0.042340118838538034, "learning_rate": 4.3389987187573145e-06, "loss": 0.053, "step": 257 }, { "epoch": 1.7671232876712328, "grad_norm": 0.03965234038836836, "learning_rate": 4.097994216258039e-06, "loss": 0.0528, "step": 258 }, { "epoch": 1.773972602739726, "grad_norm": 0.04330912545003313, "learning_rate": 3.8635894050680466e-06, "loss": 0.062, "step": 259 }, { "epoch": 1.7808219178082192, "grad_norm": 0.04245993591175113, "learning_rate": 3.63581798740511e-06, "loss": 0.0682, "step": 260 }, { "epoch": 1.7876712328767124, "grad_norm": 0.046454610509872485, "learning_rate": 3.4147127117520104e-06, "loss": 0.0773, "step": 261 }, { "epoch": 1.7945205479452055, "grad_norm": 0.061061675638647096, "learning_rate": 3.2003053681480098e-06, "loss": 0.0628, "step": 262 }, { "epoch": 1.8013698630136985, "grad_norm": 0.04251403620622773, "learning_rate": 2.992626783618152e-06, "loss": 0.0506, "step": 263 }, { "epoch": 1.808219178082192, "grad_norm": 0.04783592253659306, "learning_rate": 2.791706817741041e-06, "loss": 0.0737, "step": 264 }, { "epoch": 1.8150684931506849, "grad_norm": 0.04359969004920878, "learning_rate": 2.59757435835567e-06, "loss": 0.0599, "step": 265 }, { "epoch": 1.821917808219178, "grad_norm": 0.044487631888004406, "learning_rate": 2.41025731740801e-06, "loss": 0.0692, "step": 266 }, { "epoch": 1.8287671232876712, "grad_norm": 0.04510155943713286, "learning_rate": 2.229782626937865e-06, "loss": 0.0633, "step": 267 }, { "epoch": 1.8356164383561644, "grad_norm": 0.04035438478741115, "learning_rate": 2.056176235206664e-06, "loss": 0.0601, "step": 268 }, { "epoch": 1.8424657534246576, "grad_norm": 0.046645570169377086, "learning_rate": 1.889463102966671e-06, "loss": 0.0671, "step": 269 }, { "epoch": 1.8493150684931505, "grad_norm": 0.045925921341422184, "learning_rate": 1.729667199872187e-06, "loss": 0.0796, "step": 270 }, { "epoch": 1.856164383561644, "grad_norm": 0.047745127307766096, "learning_rate": 1.5768115010332208e-06, "loss": 0.0681, "step": 271 }, { "epoch": 1.8630136986301369, "grad_norm": 0.04828726659252653, "learning_rate": 1.4309179837122044e-06, "loss": 0.0637, "step": 272 }, { "epoch": 1.8698630136986303, "grad_norm": 0.03831444871977773, "learning_rate": 1.2920076241641376e-06, "loss": 0.0537, "step": 273 }, { "epoch": 1.8767123287671232, "grad_norm": 0.043143594829470666, "learning_rate": 1.1601003946206724e-06, "loss": 0.0645, "step": 274 }, { "epoch": 1.8835616438356164, "grad_norm": 0.04108818302281154, "learning_rate": 1.0352152604185428e-06, "loss": 0.0596, "step": 275 }, { "epoch": 1.8904109589041096, "grad_norm": 0.04419177931878111, "learning_rate": 9.17370177272775e-07, "loss": 0.0681, "step": 276 }, { "epoch": 1.8972602739726028, "grad_norm": 0.04544816624943878, "learning_rate": 8.065820886950404e-07, "loss": 0.0665, "step": 277 }, { "epoch": 1.904109589041096, "grad_norm": 0.044560046311441934, "learning_rate": 7.028669235575714e-07, "loss": 0.0665, "step": 278 }, { "epoch": 1.910958904109589, "grad_norm": 0.046336508445476945, "learning_rate": 6.062395938029485e-07, "loss": 0.0715, "step": 279 }, { "epoch": 1.9178082191780823, "grad_norm": 0.046783520939667735, "learning_rate": 5.167139923000553e-07, "loss": 0.0671, "step": 280 }, { "epoch": 1.9246575342465753, "grad_norm": 0.045134549211076444, "learning_rate": 4.343029908466301e-07, "loss": 0.0651, "step": 281 }, { "epoch": 1.9315068493150684, "grad_norm": 0.04620102733800356, "learning_rate": 3.5901843831857576e-07, "loss": 0.0611, "step": 282 }, { "epoch": 1.9383561643835616, "grad_norm": 0.04290459317190587, "learning_rate": 2.908711589663549e-07, "loss": 0.0688, "step": 283 }, { "epoch": 1.9452054794520548, "grad_norm": 0.0459065259141636, "learning_rate": 2.2987095085867937e-07, "loss": 0.0627, "step": 284 }, { "epoch": 1.952054794520548, "grad_norm": 0.062357302229075894, "learning_rate": 1.760265844738096e-07, "loss": 0.0792, "step": 285 }, { "epoch": 1.958904109589041, "grad_norm": 0.044164802827650715, "learning_rate": 1.2934580143851295e-07, "loss": 0.0601, "step": 286 }, { "epoch": 1.9657534246575343, "grad_norm": 0.03615207612143871, "learning_rate": 8.983531341500983e-08, "loss": 0.0512, "step": 287 }, { "epoch": 1.9726027397260273, "grad_norm": 0.04362072520783342, "learning_rate": 5.750080113598455e-08, "loss": 0.0616, "step": 288 }, { "epoch": 1.9794520547945207, "grad_norm": 0.04831491776296162, "learning_rate": 3.2346913587816275e-08, "loss": 0.0731, "step": 289 }, { "epoch": 1.9863013698630136, "grad_norm": 0.050700245606893124, "learning_rate": 1.4377267342158274e-08, "loss": 0.0733, "step": 290 }, { "epoch": 1.9931506849315068, "grad_norm": 0.04317508790841958, "learning_rate": 3.594446035964927e-09, "loss": 0.054, "step": 291 }, { "epoch": 2.0, "grad_norm": 0.05236597882946408, "learning_rate": 0.0, "loss": 0.0588, "step": 292 }, { "epoch": 2.0, "step": 292, "total_flos": 739456868941824.0, "train_loss": 0.07426239744032899, "train_runtime": 2224.6089, "train_samples_per_second": 0.523, "train_steps_per_second": 0.131 } ], "logging_steps": 1, "max_steps": 292, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 739456868941824.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }