{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6062647356012125, "eval_steps": 100, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006736274840013472, "grad_norm": 3.42309308052063, "learning_rate": 1.0067114093959731e-07, "loss": 0.4257, "step": 1 }, { "epoch": 0.0013472549680026945, "grad_norm": 3.701201915740967, "learning_rate": 2.0134228187919462e-07, "loss": 0.4285, "step": 2 }, { "epoch": 0.0020208824520040417, "grad_norm": 4.045602321624756, "learning_rate": 3.0201342281879193e-07, "loss": 0.4232, "step": 3 }, { "epoch": 0.002694509936005389, "grad_norm": 3.859919786453247, "learning_rate": 4.0268456375838924e-07, "loss": 0.4029, "step": 4 }, { "epoch": 0.003368137420006736, "grad_norm": 4.171447277069092, "learning_rate": 5.033557046979866e-07, "loss": 0.4158, "step": 5 }, { "epoch": 0.0040417649040080834, "grad_norm": 3.556626796722412, "learning_rate": 6.040268456375839e-07, "loss": 0.3945, "step": 6 }, { "epoch": 0.004715392388009431, "grad_norm": 3.78082537651062, "learning_rate": 7.046979865771813e-07, "loss": 0.372, "step": 7 }, { "epoch": 0.005389019872010778, "grad_norm": 3.459005355834961, "learning_rate": 8.053691275167785e-07, "loss": 0.392, "step": 8 }, { "epoch": 0.006062647356012125, "grad_norm": 3.6338694095611572, "learning_rate": 9.060402684563759e-07, "loss": 0.4221, "step": 9 }, { "epoch": 0.006736274840013472, "grad_norm": 3.6951706409454346, "learning_rate": 1.006711409395973e-06, "loss": 0.4504, "step": 10 }, { "epoch": 0.00740990232401482, "grad_norm": 2.708463668823242, "learning_rate": 1.1073825503355705e-06, "loss": 0.3848, "step": 11 }, { "epoch": 0.008083529808016167, "grad_norm": 2.9015040397644043, "learning_rate": 1.2080536912751677e-06, "loss": 0.3936, "step": 12 }, { "epoch": 0.008757157292017514, "grad_norm": 3.133338212966919, "learning_rate": 1.3087248322147651e-06, "loss": 0.3844, "step": 13 }, { "epoch": 0.009430784776018861, "grad_norm": 2.1770880222320557, "learning_rate": 1.4093959731543626e-06, "loss": 0.3474, "step": 14 }, { "epoch": 0.010104412260020209, "grad_norm": 1.66121244430542, "learning_rate": 1.5100671140939598e-06, "loss": 0.3646, "step": 15 }, { "epoch": 0.010778039744021556, "grad_norm": 1.725306510925293, "learning_rate": 1.610738255033557e-06, "loss": 0.3739, "step": 16 }, { "epoch": 0.011451667228022903, "grad_norm": 1.5393097400665283, "learning_rate": 1.7114093959731544e-06, "loss": 0.2967, "step": 17 }, { "epoch": 0.01212529471202425, "grad_norm": 1.6653029918670654, "learning_rate": 1.8120805369127518e-06, "loss": 0.3491, "step": 18 }, { "epoch": 0.012798922196025598, "grad_norm": 1.4329285621643066, "learning_rate": 1.912751677852349e-06, "loss": 0.3438, "step": 19 }, { "epoch": 0.013472549680026945, "grad_norm": 1.1590880155563354, "learning_rate": 2.013422818791946e-06, "loss": 0.2907, "step": 20 }, { "epoch": 0.014146177164028292, "grad_norm": 1.4018336534500122, "learning_rate": 2.1140939597315434e-06, "loss": 0.3504, "step": 21 }, { "epoch": 0.01481980464802964, "grad_norm": 1.201278805732727, "learning_rate": 2.214765100671141e-06, "loss": 0.3176, "step": 22 }, { "epoch": 0.015493432132030987, "grad_norm": 1.14249849319458, "learning_rate": 2.3154362416107382e-06, "loss": 0.3079, "step": 23 }, { "epoch": 0.016167059616032334, "grad_norm": 1.0337632894515991, "learning_rate": 2.4161073825503354e-06, "loss": 0.3039, "step": 24 }, { "epoch": 0.016840687100033683, "grad_norm": 0.9944117665290833, "learning_rate": 2.516778523489933e-06, "loss": 0.297, "step": 25 }, { "epoch": 0.017514314584035028, "grad_norm": 0.946663498878479, "learning_rate": 2.6174496644295303e-06, "loss": 0.3315, "step": 26 }, { "epoch": 0.018187942068036377, "grad_norm": 1.056069254875183, "learning_rate": 2.7181208053691275e-06, "loss": 0.3274, "step": 27 }, { "epoch": 0.018861569552037723, "grad_norm": 0.9784092903137207, "learning_rate": 2.818791946308725e-06, "loss": 0.3399, "step": 28 }, { "epoch": 0.01953519703603907, "grad_norm": 1.07163667678833, "learning_rate": 2.9194630872483223e-06, "loss": 0.3361, "step": 29 }, { "epoch": 0.020208824520040417, "grad_norm": 0.9870592951774597, "learning_rate": 3.0201342281879195e-06, "loss": 0.3026, "step": 30 }, { "epoch": 0.020882452004041766, "grad_norm": 0.9180539846420288, "learning_rate": 3.120805369127517e-06, "loss": 0.2716, "step": 31 }, { "epoch": 0.02155607948804311, "grad_norm": 0.8827613592147827, "learning_rate": 3.221476510067114e-06, "loss": 0.2623, "step": 32 }, { "epoch": 0.02222970697204446, "grad_norm": 0.8390945196151733, "learning_rate": 3.3221476510067116e-06, "loss": 0.2792, "step": 33 }, { "epoch": 0.022903334456045806, "grad_norm": 0.8577262163162231, "learning_rate": 3.4228187919463088e-06, "loss": 0.2906, "step": 34 }, { "epoch": 0.023576961940047155, "grad_norm": 0.7939577102661133, "learning_rate": 3.523489932885906e-06, "loss": 0.2569, "step": 35 }, { "epoch": 0.0242505894240485, "grad_norm": 0.8591914772987366, "learning_rate": 3.6241610738255036e-06, "loss": 0.3072, "step": 36 }, { "epoch": 0.02492421690804985, "grad_norm": 0.8437011241912842, "learning_rate": 3.724832214765101e-06, "loss": 0.3002, "step": 37 }, { "epoch": 0.025597844392051195, "grad_norm": 0.8370192646980286, "learning_rate": 3.825503355704698e-06, "loss": 0.2693, "step": 38 }, { "epoch": 0.026271471876052544, "grad_norm": 0.7814688086509705, "learning_rate": 3.926174496644295e-06, "loss": 0.2816, "step": 39 }, { "epoch": 0.02694509936005389, "grad_norm": 0.8348170518875122, "learning_rate": 4.026845637583892e-06, "loss": 0.251, "step": 40 }, { "epoch": 0.02761872684405524, "grad_norm": 0.7987892627716064, "learning_rate": 4.12751677852349e-06, "loss": 0.2637, "step": 41 }, { "epoch": 0.028292354328056584, "grad_norm": 0.8840119242668152, "learning_rate": 4.228187919463087e-06, "loss": 0.315, "step": 42 }, { "epoch": 0.028965981812057933, "grad_norm": 0.7633718848228455, "learning_rate": 4.328859060402685e-06, "loss": 0.2988, "step": 43 }, { "epoch": 0.02963960929605928, "grad_norm": 0.7476988434791565, "learning_rate": 4.429530201342282e-06, "loss": 0.2856, "step": 44 }, { "epoch": 0.030313236780060628, "grad_norm": 0.7812101244926453, "learning_rate": 4.530201342281879e-06, "loss": 0.2622, "step": 45 }, { "epoch": 0.030986864264061973, "grad_norm": 0.7842202186584473, "learning_rate": 4.6308724832214765e-06, "loss": 0.3366, "step": 46 }, { "epoch": 0.03166049174806332, "grad_norm": 0.7462322115898132, "learning_rate": 4.731543624161074e-06, "loss": 0.2825, "step": 47 }, { "epoch": 0.03233411923206467, "grad_norm": 0.7542632818222046, "learning_rate": 4.832214765100671e-06, "loss": 0.2549, "step": 48 }, { "epoch": 0.033007746716066017, "grad_norm": 0.7200729250907898, "learning_rate": 4.932885906040269e-06, "loss": 0.2593, "step": 49 }, { "epoch": 0.033681374200067365, "grad_norm": 0.8686407208442688, "learning_rate": 5.033557046979866e-06, "loss": 0.2828, "step": 50 }, { "epoch": 0.03435500168406871, "grad_norm": 0.734254002571106, "learning_rate": 5.134228187919463e-06, "loss": 0.2841, "step": 51 }, { "epoch": 0.035028629168070056, "grad_norm": 0.7483602166175842, "learning_rate": 5.2348993288590606e-06, "loss": 0.2956, "step": 52 }, { "epoch": 0.035702256652071405, "grad_norm": 0.7722125053405762, "learning_rate": 5.335570469798658e-06, "loss": 0.2817, "step": 53 }, { "epoch": 0.036375884136072754, "grad_norm": 0.7247833609580994, "learning_rate": 5.436241610738255e-06, "loss": 0.2589, "step": 54 }, { "epoch": 0.037049511620074096, "grad_norm": 0.8258161544799805, "learning_rate": 5.536912751677853e-06, "loss": 0.2929, "step": 55 }, { "epoch": 0.037723139104075445, "grad_norm": 0.784130334854126, "learning_rate": 5.63758389261745e-06, "loss": 0.2639, "step": 56 }, { "epoch": 0.038396766588076794, "grad_norm": 0.8519976735115051, "learning_rate": 5.738255033557047e-06, "loss": 0.2611, "step": 57 }, { "epoch": 0.03907039407207814, "grad_norm": 0.7617088556289673, "learning_rate": 5.838926174496645e-06, "loss": 0.3038, "step": 58 }, { "epoch": 0.039744021556079485, "grad_norm": 0.7174592018127441, "learning_rate": 5.939597315436242e-06, "loss": 0.2451, "step": 59 }, { "epoch": 0.040417649040080834, "grad_norm": 0.7933776378631592, "learning_rate": 6.040268456375839e-06, "loss": 0.2979, "step": 60 }, { "epoch": 0.04109127652408218, "grad_norm": 0.7308351993560791, "learning_rate": 6.140939597315437e-06, "loss": 0.2547, "step": 61 }, { "epoch": 0.04176490400808353, "grad_norm": 0.8782221674919128, "learning_rate": 6.241610738255034e-06, "loss": 0.2948, "step": 62 }, { "epoch": 0.042438531492084874, "grad_norm": 0.7220450043678284, "learning_rate": 6.342281879194631e-06, "loss": 0.2397, "step": 63 }, { "epoch": 0.04311215897608622, "grad_norm": 0.8042862415313721, "learning_rate": 6.442953020134228e-06, "loss": 0.2963, "step": 64 }, { "epoch": 0.04378578646008757, "grad_norm": 0.6996918320655823, "learning_rate": 6.543624161073825e-06, "loss": 0.2542, "step": 65 }, { "epoch": 0.04445941394408892, "grad_norm": 0.7606627941131592, "learning_rate": 6.644295302013423e-06, "loss": 0.285, "step": 66 }, { "epoch": 0.04513304142809026, "grad_norm": 0.8591688275337219, "learning_rate": 6.74496644295302e-06, "loss": 0.2671, "step": 67 }, { "epoch": 0.04580666891209161, "grad_norm": 0.8488709330558777, "learning_rate": 6.8456375838926175e-06, "loss": 0.2751, "step": 68 }, { "epoch": 0.04648029639609296, "grad_norm": 0.7567676305770874, "learning_rate": 6.946308724832215e-06, "loss": 0.301, "step": 69 }, { "epoch": 0.04715392388009431, "grad_norm": 0.7121560573577881, "learning_rate": 7.046979865771812e-06, "loss": 0.2557, "step": 70 }, { "epoch": 0.04782755136409565, "grad_norm": 0.7666682600975037, "learning_rate": 7.147651006711409e-06, "loss": 0.2582, "step": 71 }, { "epoch": 0.048501178848097, "grad_norm": 0.7414038181304932, "learning_rate": 7.248322147651007e-06, "loss": 0.262, "step": 72 }, { "epoch": 0.04917480633209835, "grad_norm": 0.8357811570167542, "learning_rate": 7.348993288590604e-06, "loss": 0.2591, "step": 73 }, { "epoch": 0.0498484338160997, "grad_norm": 0.7933549880981445, "learning_rate": 7.449664429530202e-06, "loss": 0.282, "step": 74 }, { "epoch": 0.05052206130010104, "grad_norm": 0.7420201301574707, "learning_rate": 7.5503355704698e-06, "loss": 0.2469, "step": 75 }, { "epoch": 0.05119568878410239, "grad_norm": 0.7670828104019165, "learning_rate": 7.651006711409396e-06, "loss": 0.295, "step": 76 }, { "epoch": 0.05186931626810374, "grad_norm": 0.722752571105957, "learning_rate": 7.751677852348993e-06, "loss": 0.2301, "step": 77 }, { "epoch": 0.05254294375210509, "grad_norm": 0.7430191040039062, "learning_rate": 7.85234899328859e-06, "loss": 0.2875, "step": 78 }, { "epoch": 0.05321657123610643, "grad_norm": 0.6979767084121704, "learning_rate": 7.953020134228188e-06, "loss": 0.2326, "step": 79 }, { "epoch": 0.05389019872010778, "grad_norm": 0.7197319269180298, "learning_rate": 8.053691275167785e-06, "loss": 0.2413, "step": 80 }, { "epoch": 0.05456382620410913, "grad_norm": 0.7689131498336792, "learning_rate": 8.154362416107382e-06, "loss": 0.2868, "step": 81 }, { "epoch": 0.05523745368811048, "grad_norm": 0.7233304381370544, "learning_rate": 8.25503355704698e-06, "loss": 0.2586, "step": 82 }, { "epoch": 0.05591108117211182, "grad_norm": 0.8464373350143433, "learning_rate": 8.355704697986576e-06, "loss": 0.2998, "step": 83 }, { "epoch": 0.05658470865611317, "grad_norm": 0.8020244240760803, "learning_rate": 8.456375838926174e-06, "loss": 0.3323, "step": 84 }, { "epoch": 0.05725833614011452, "grad_norm": 0.9260913729667664, "learning_rate": 8.55704697986577e-06, "loss": 0.3353, "step": 85 }, { "epoch": 0.057931963624115866, "grad_norm": 0.824252188205719, "learning_rate": 8.65771812080537e-06, "loss": 0.2778, "step": 86 }, { "epoch": 0.05860559110811721, "grad_norm": 0.7277565598487854, "learning_rate": 8.758389261744967e-06, "loss": 0.2863, "step": 87 }, { "epoch": 0.05927921859211856, "grad_norm": 0.7575395107269287, "learning_rate": 8.859060402684564e-06, "loss": 0.2192, "step": 88 }, { "epoch": 0.059952846076119906, "grad_norm": 0.7741091251373291, "learning_rate": 8.959731543624161e-06, "loss": 0.2808, "step": 89 }, { "epoch": 0.060626473560121255, "grad_norm": 0.7291881442070007, "learning_rate": 9.060402684563759e-06, "loss": 0.2624, "step": 90 }, { "epoch": 0.0613001010441226, "grad_norm": 0.7662385106086731, "learning_rate": 9.161073825503356e-06, "loss": 0.2803, "step": 91 }, { "epoch": 0.061973728528123946, "grad_norm": 0.7009522914886475, "learning_rate": 9.261744966442953e-06, "loss": 0.26, "step": 92 }, { "epoch": 0.06264735601212529, "grad_norm": 0.8707520365715027, "learning_rate": 9.36241610738255e-06, "loss": 0.3179, "step": 93 }, { "epoch": 0.06332098349612664, "grad_norm": 0.8629103302955627, "learning_rate": 9.463087248322147e-06, "loss": 0.3065, "step": 94 }, { "epoch": 0.06399461098012799, "grad_norm": 0.8592970371246338, "learning_rate": 9.563758389261745e-06, "loss": 0.2574, "step": 95 }, { "epoch": 0.06466823846412934, "grad_norm": 0.8038861751556396, "learning_rate": 9.664429530201342e-06, "loss": 0.2699, "step": 96 }, { "epoch": 0.06534186594813068, "grad_norm": 0.7168505787849426, "learning_rate": 9.765100671140939e-06, "loss": 0.2507, "step": 97 }, { "epoch": 0.06601549343213203, "grad_norm": 0.7545929551124573, "learning_rate": 9.865771812080538e-06, "loss": 0.2922, "step": 98 }, { "epoch": 0.06668912091613338, "grad_norm": 0.7718814611434937, "learning_rate": 9.966442953020135e-06, "loss": 0.254, "step": 99 }, { "epoch": 0.06736274840013473, "grad_norm": 0.8245450854301453, "learning_rate": 1.0067114093959732e-05, "loss": 0.2869, "step": 100 }, { "epoch": 0.06736274840013473, "eval_loss": 0.2735002040863037, "eval_runtime": 104.2064, "eval_samples_per_second": 47.982, "eval_steps_per_second": 3.004, "step": 100 }, { "epoch": 0.06803637588413607, "grad_norm": 0.9367719888687134, "learning_rate": 1.016778523489933e-05, "loss": 0.3273, "step": 101 }, { "epoch": 0.06871000336813742, "grad_norm": 0.7697410583496094, "learning_rate": 1.0268456375838927e-05, "loss": 0.2493, "step": 102 }, { "epoch": 0.06938363085213876, "grad_norm": 0.7449803948402405, "learning_rate": 1.0369127516778524e-05, "loss": 0.2558, "step": 103 }, { "epoch": 0.07005725833614011, "grad_norm": 0.7809726595878601, "learning_rate": 1.0469798657718121e-05, "loss": 0.3079, "step": 104 }, { "epoch": 0.07073088582014146, "grad_norm": 0.8014216423034668, "learning_rate": 1.0570469798657718e-05, "loss": 0.2774, "step": 105 }, { "epoch": 0.07140451330414281, "grad_norm": 0.7782856225967407, "learning_rate": 1.0671140939597316e-05, "loss": 0.2874, "step": 106 }, { "epoch": 0.07207814078814416, "grad_norm": 0.7489345669746399, "learning_rate": 1.0771812080536913e-05, "loss": 0.2618, "step": 107 }, { "epoch": 0.07275176827214551, "grad_norm": 0.7881894111633301, "learning_rate": 1.087248322147651e-05, "loss": 0.2914, "step": 108 }, { "epoch": 0.07342539575614684, "grad_norm": 0.8149965405464172, "learning_rate": 1.0973154362416109e-05, "loss": 0.2904, "step": 109 }, { "epoch": 0.07409902324014819, "grad_norm": 0.8088157176971436, "learning_rate": 1.1073825503355706e-05, "loss": 0.3084, "step": 110 }, { "epoch": 0.07477265072414954, "grad_norm": 0.804861843585968, "learning_rate": 1.1174496644295303e-05, "loss": 0.2919, "step": 111 }, { "epoch": 0.07544627820815089, "grad_norm": 0.7035599946975708, "learning_rate": 1.12751677852349e-05, "loss": 0.2971, "step": 112 }, { "epoch": 0.07611990569215224, "grad_norm": 0.8036991357803345, "learning_rate": 1.1375838926174498e-05, "loss": 0.2857, "step": 113 }, { "epoch": 0.07679353317615359, "grad_norm": 0.6793683767318726, "learning_rate": 1.1476510067114095e-05, "loss": 0.2742, "step": 114 }, { "epoch": 0.07746716066015494, "grad_norm": 0.7865248918533325, "learning_rate": 1.1577181208053692e-05, "loss": 0.3156, "step": 115 }, { "epoch": 0.07814078814415629, "grad_norm": 0.6990460157394409, "learning_rate": 1.167785234899329e-05, "loss": 0.281, "step": 116 }, { "epoch": 0.07881441562815762, "grad_norm": 0.7218809723854065, "learning_rate": 1.1778523489932886e-05, "loss": 0.2584, "step": 117 }, { "epoch": 0.07948804311215897, "grad_norm": 0.6970985531806946, "learning_rate": 1.1879194630872484e-05, "loss": 0.2438, "step": 118 }, { "epoch": 0.08016167059616032, "grad_norm": 0.7687243819236755, "learning_rate": 1.1979865771812081e-05, "loss": 0.2846, "step": 119 }, { "epoch": 0.08083529808016167, "grad_norm": 0.6929764151573181, "learning_rate": 1.2080536912751678e-05, "loss": 0.2611, "step": 120 }, { "epoch": 0.08150892556416302, "grad_norm": 0.729848325252533, "learning_rate": 1.2181208053691277e-05, "loss": 0.3007, "step": 121 }, { "epoch": 0.08218255304816437, "grad_norm": 0.7301985025405884, "learning_rate": 1.2281879194630874e-05, "loss": 0.2847, "step": 122 }, { "epoch": 0.08285618053216572, "grad_norm": 0.7333296537399292, "learning_rate": 1.2382550335570471e-05, "loss": 0.2674, "step": 123 }, { "epoch": 0.08352980801616706, "grad_norm": 0.7411990165710449, "learning_rate": 1.2483221476510069e-05, "loss": 0.2777, "step": 124 }, { "epoch": 0.08420343550016841, "grad_norm": 0.6465498805046082, "learning_rate": 1.2583892617449664e-05, "loss": 0.2579, "step": 125 }, { "epoch": 0.08487706298416975, "grad_norm": 0.6950599551200867, "learning_rate": 1.2684563758389261e-05, "loss": 0.3164, "step": 126 }, { "epoch": 0.0855506904681711, "grad_norm": 0.6696597337722778, "learning_rate": 1.2785234899328858e-05, "loss": 0.2564, "step": 127 }, { "epoch": 0.08622431795217245, "grad_norm": 0.6537868976593018, "learning_rate": 1.2885906040268456e-05, "loss": 0.2375, "step": 128 }, { "epoch": 0.0868979454361738, "grad_norm": 0.7363224029541016, "learning_rate": 1.2986577181208053e-05, "loss": 0.2589, "step": 129 }, { "epoch": 0.08757157292017514, "grad_norm": 0.7354284524917603, "learning_rate": 1.308724832214765e-05, "loss": 0.3049, "step": 130 }, { "epoch": 0.0882452004041765, "grad_norm": 0.6521575450897217, "learning_rate": 1.3187919463087247e-05, "loss": 0.2385, "step": 131 }, { "epoch": 0.08891882788817784, "grad_norm": 0.6530443429946899, "learning_rate": 1.3288590604026846e-05, "loss": 0.2588, "step": 132 }, { "epoch": 0.08959245537217919, "grad_norm": 0.7331404089927673, "learning_rate": 1.3389261744966443e-05, "loss": 0.3061, "step": 133 }, { "epoch": 0.09026608285618053, "grad_norm": 0.7427138090133667, "learning_rate": 1.348993288590604e-05, "loss": 0.3513, "step": 134 }, { "epoch": 0.09093971034018188, "grad_norm": 0.6774203181266785, "learning_rate": 1.3590604026845638e-05, "loss": 0.2639, "step": 135 }, { "epoch": 0.09161333782418322, "grad_norm": 0.6679060459136963, "learning_rate": 1.3691275167785235e-05, "loss": 0.2503, "step": 136 }, { "epoch": 0.09228696530818457, "grad_norm": 0.6390411853790283, "learning_rate": 1.3791946308724832e-05, "loss": 0.2298, "step": 137 }, { "epoch": 0.09296059279218592, "grad_norm": 0.7115532159805298, "learning_rate": 1.389261744966443e-05, "loss": 0.255, "step": 138 }, { "epoch": 0.09363422027618727, "grad_norm": 0.6546367406845093, "learning_rate": 1.3993288590604027e-05, "loss": 0.2623, "step": 139 }, { "epoch": 0.09430784776018862, "grad_norm": 0.7526003122329712, "learning_rate": 1.4093959731543624e-05, "loss": 0.2701, "step": 140 }, { "epoch": 0.09498147524418997, "grad_norm": 0.7417687773704529, "learning_rate": 1.4194630872483221e-05, "loss": 0.2488, "step": 141 }, { "epoch": 0.0956551027281913, "grad_norm": 0.6994727849960327, "learning_rate": 1.4295302013422818e-05, "loss": 0.2861, "step": 142 }, { "epoch": 0.09632873021219265, "grad_norm": 0.7503766417503357, "learning_rate": 1.4395973154362415e-05, "loss": 0.3002, "step": 143 }, { "epoch": 0.097002357696194, "grad_norm": 0.6777353882789612, "learning_rate": 1.4496644295302014e-05, "loss": 0.2548, "step": 144 }, { "epoch": 0.09767598518019535, "grad_norm": 0.8131176829338074, "learning_rate": 1.4597315436241612e-05, "loss": 0.2736, "step": 145 }, { "epoch": 0.0983496126641967, "grad_norm": 0.6841787099838257, "learning_rate": 1.4697986577181209e-05, "loss": 0.2647, "step": 146 }, { "epoch": 0.09902324014819805, "grad_norm": 0.673572838306427, "learning_rate": 1.4798657718120806e-05, "loss": 0.2414, "step": 147 }, { "epoch": 0.0996968676321994, "grad_norm": 0.6950225234031677, "learning_rate": 1.4899328859060403e-05, "loss": 0.268, "step": 148 }, { "epoch": 0.10037049511620075, "grad_norm": 0.7058023810386658, "learning_rate": 1.5e-05, "loss": 0.2682, "step": 149 }, { "epoch": 0.10104412260020208, "grad_norm": 0.7642398476600647, "learning_rate": 1.4999979233262118e-05, "loss": 0.2871, "step": 150 }, { "epoch": 0.10171775008420343, "grad_norm": 0.7045179605484009, "learning_rate": 1.4999916933163468e-05, "loss": 0.2589, "step": 151 }, { "epoch": 0.10239137756820478, "grad_norm": 0.6908326148986816, "learning_rate": 1.499981310004906e-05, "loss": 0.2727, "step": 152 }, { "epoch": 0.10306500505220613, "grad_norm": 0.7265616655349731, "learning_rate": 1.4999667734493901e-05, "loss": 0.3177, "step": 153 }, { "epoch": 0.10373863253620748, "grad_norm": 0.630407452583313, "learning_rate": 1.4999480837302995e-05, "loss": 0.2636, "step": 154 }, { "epoch": 0.10441226002020883, "grad_norm": 0.6864127516746521, "learning_rate": 1.4999252409511335e-05, "loss": 0.3013, "step": 155 }, { "epoch": 0.10508588750421018, "grad_norm": 0.7556886076927185, "learning_rate": 1.4998982452383916e-05, "loss": 0.279, "step": 156 }, { "epoch": 0.10575951498821153, "grad_norm": 0.7267988324165344, "learning_rate": 1.4998670967415701e-05, "loss": 0.2528, "step": 157 }, { "epoch": 0.10643314247221286, "grad_norm": 0.6894652843475342, "learning_rate": 1.4998317956331634e-05, "loss": 0.2833, "step": 158 }, { "epoch": 0.10710676995621421, "grad_norm": 0.7065450549125671, "learning_rate": 1.4997923421086613e-05, "loss": 0.3159, "step": 159 }, { "epoch": 0.10778039744021556, "grad_norm": 0.6692951321601868, "learning_rate": 1.49974873638655e-05, "loss": 0.2747, "step": 160 }, { "epoch": 0.10845402492421691, "grad_norm": 0.589299738407135, "learning_rate": 1.4997009787083088e-05, "loss": 0.2436, "step": 161 }, { "epoch": 0.10912765240821826, "grad_norm": 0.6986613869667053, "learning_rate": 1.49964906933841e-05, "loss": 0.2893, "step": 162 }, { "epoch": 0.1098012798922196, "grad_norm": 0.6756588220596313, "learning_rate": 1.4995930085643173e-05, "loss": 0.3076, "step": 163 }, { "epoch": 0.11047490737622095, "grad_norm": 0.6988603472709656, "learning_rate": 1.4995327966964838e-05, "loss": 0.2646, "step": 164 }, { "epoch": 0.1111485348602223, "grad_norm": 0.6961201429367065, "learning_rate": 1.4994684340683506e-05, "loss": 0.2984, "step": 165 }, { "epoch": 0.11182216234422364, "grad_norm": 0.7064459323883057, "learning_rate": 1.4993999210363444e-05, "loss": 0.3186, "step": 166 }, { "epoch": 0.11249578982822499, "grad_norm": 0.6374897360801697, "learning_rate": 1.4993272579798773e-05, "loss": 0.2833, "step": 167 }, { "epoch": 0.11316941731222634, "grad_norm": 0.6672942638397217, "learning_rate": 1.4992504453013422e-05, "loss": 0.2891, "step": 168 }, { "epoch": 0.11384304479622769, "grad_norm": 0.6631248593330383, "learning_rate": 1.499169483426112e-05, "loss": 0.2512, "step": 169 }, { "epoch": 0.11451667228022903, "grad_norm": 0.7132297158241272, "learning_rate": 1.4990843728025367e-05, "loss": 0.2988, "step": 170 }, { "epoch": 0.11519029976423038, "grad_norm": 0.6612878441810608, "learning_rate": 1.4989951139019425e-05, "loss": 0.283, "step": 171 }, { "epoch": 0.11586392724823173, "grad_norm": 0.6382921934127808, "learning_rate": 1.4989017072186267e-05, "loss": 0.2597, "step": 172 }, { "epoch": 0.11653755473223308, "grad_norm": 0.5888513922691345, "learning_rate": 1.498804153269856e-05, "loss": 0.243, "step": 173 }, { "epoch": 0.11721118221623442, "grad_norm": 0.7310932874679565, "learning_rate": 1.498702452595865e-05, "loss": 0.2871, "step": 174 }, { "epoch": 0.11788480970023577, "grad_norm": 0.6769680380821228, "learning_rate": 1.4985966057598512e-05, "loss": 0.2896, "step": 175 }, { "epoch": 0.11855843718423711, "grad_norm": 0.7013587355613708, "learning_rate": 1.4984866133479729e-05, "loss": 0.2913, "step": 176 }, { "epoch": 0.11923206466823846, "grad_norm": 0.7067077159881592, "learning_rate": 1.4983724759693456e-05, "loss": 0.2931, "step": 177 }, { "epoch": 0.11990569215223981, "grad_norm": 0.6384806632995605, "learning_rate": 1.498254194256039e-05, "loss": 0.2433, "step": 178 }, { "epoch": 0.12057931963624116, "grad_norm": 0.733525276184082, "learning_rate": 1.4981317688630729e-05, "loss": 0.314, "step": 179 }, { "epoch": 0.12125294712024251, "grad_norm": 0.6598628759384155, "learning_rate": 1.4980052004684146e-05, "loss": 0.281, "step": 180 }, { "epoch": 0.12192657460424386, "grad_norm": 0.616263210773468, "learning_rate": 1.4978744897729741e-05, "loss": 0.2616, "step": 181 }, { "epoch": 0.1226002020882452, "grad_norm": 0.6175768971443176, "learning_rate": 1.4977396375006006e-05, "loss": 0.2624, "step": 182 }, { "epoch": 0.12327382957224654, "grad_norm": 0.676030695438385, "learning_rate": 1.4976006443980785e-05, "loss": 0.287, "step": 183 }, { "epoch": 0.12394745705624789, "grad_norm": 0.6331183314323425, "learning_rate": 1.4974575112351235e-05, "loss": 0.2647, "step": 184 }, { "epoch": 0.12462108454024924, "grad_norm": 0.656204104423523, "learning_rate": 1.497310238804378e-05, "loss": 0.2755, "step": 185 }, { "epoch": 0.12529471202425058, "grad_norm": 0.6582143306732178, "learning_rate": 1.4971588279214065e-05, "loss": 0.2774, "step": 186 }, { "epoch": 0.12596833950825193, "grad_norm": 0.6152216792106628, "learning_rate": 1.4970032794246918e-05, "loss": 0.2694, "step": 187 }, { "epoch": 0.12664196699225327, "grad_norm": 0.5943458676338196, "learning_rate": 1.4968435941756303e-05, "loss": 0.2698, "step": 188 }, { "epoch": 0.12731559447625462, "grad_norm": 0.7527596354484558, "learning_rate": 1.496679773058526e-05, "loss": 0.2996, "step": 189 }, { "epoch": 0.12798922196025597, "grad_norm": 0.6229069828987122, "learning_rate": 1.4965118169805868e-05, "loss": 0.275, "step": 190 }, { "epoch": 0.12866284944425732, "grad_norm": 0.620919406414032, "learning_rate": 1.4963397268719198e-05, "loss": 0.2956, "step": 191 }, { "epoch": 0.12933647692825867, "grad_norm": 0.6090366244316101, "learning_rate": 1.4961635036855249e-05, "loss": 0.258, "step": 192 }, { "epoch": 0.13001010441226002, "grad_norm": 0.5942346453666687, "learning_rate": 1.4959831483972901e-05, "loss": 0.266, "step": 193 }, { "epoch": 0.13068373189626137, "grad_norm": 0.6019350290298462, "learning_rate": 1.4957986620059866e-05, "loss": 0.256, "step": 194 }, { "epoch": 0.13135735938026272, "grad_norm": 0.6708882451057434, "learning_rate": 1.4956100455332623e-05, "loss": 0.2924, "step": 195 }, { "epoch": 0.13203098686426407, "grad_norm": 0.7132793068885803, "learning_rate": 1.4954173000236369e-05, "loss": 0.3174, "step": 196 }, { "epoch": 0.13270461434826542, "grad_norm": 0.602311909198761, "learning_rate": 1.495220426544496e-05, "loss": 0.2388, "step": 197 }, { "epoch": 0.13337824183226676, "grad_norm": 0.5862560868263245, "learning_rate": 1.495019426186085e-05, "loss": 0.2382, "step": 198 }, { "epoch": 0.1340518693162681, "grad_norm": 0.6618714332580566, "learning_rate": 1.4948143000615028e-05, "loss": 0.2654, "step": 199 }, { "epoch": 0.13472549680026946, "grad_norm": 0.6195774078369141, "learning_rate": 1.4946050493066965e-05, "loss": 0.2696, "step": 200 }, { "epoch": 0.13472549680026946, "eval_loss": 0.2768155038356781, "eval_runtime": 105.0569, "eval_samples_per_second": 47.593, "eval_steps_per_second": 2.979, "step": 200 }, { "epoch": 0.1353991242842708, "grad_norm": 0.5954621434211731, "learning_rate": 1.4943916750804537e-05, "loss": 0.2625, "step": 201 }, { "epoch": 0.13607275176827213, "grad_norm": 0.610717236995697, "learning_rate": 1.494174178564398e-05, "loss": 0.2953, "step": 202 }, { "epoch": 0.13674637925227348, "grad_norm": 0.6930943727493286, "learning_rate": 1.4939525609629809e-05, "loss": 0.2774, "step": 203 }, { "epoch": 0.13742000673627483, "grad_norm": 0.6402983069419861, "learning_rate": 1.4937268235034754e-05, "loss": 0.2814, "step": 204 }, { "epoch": 0.13809363422027618, "grad_norm": 0.6476616859436035, "learning_rate": 1.4934969674359698e-05, "loss": 0.2829, "step": 205 }, { "epoch": 0.13876726170427753, "grad_norm": 0.6163775324821472, "learning_rate": 1.49326299403336e-05, "loss": 0.2682, "step": 206 }, { "epoch": 0.13944088918827888, "grad_norm": 0.6615155935287476, "learning_rate": 1.4930249045913437e-05, "loss": 0.2656, "step": 207 }, { "epoch": 0.14011451667228023, "grad_norm": 0.6666435599327087, "learning_rate": 1.4927827004284117e-05, "loss": 0.2972, "step": 208 }, { "epoch": 0.14078814415628157, "grad_norm": 0.6047382950782776, "learning_rate": 1.4925363828858407e-05, "loss": 0.2527, "step": 209 }, { "epoch": 0.14146177164028292, "grad_norm": 0.6405648589134216, "learning_rate": 1.4922859533276882e-05, "loss": 0.2589, "step": 210 }, { "epoch": 0.14213539912428427, "grad_norm": 0.6201145648956299, "learning_rate": 1.4920314131407817e-05, "loss": 0.2419, "step": 211 }, { "epoch": 0.14280902660828562, "grad_norm": 0.6683364510536194, "learning_rate": 1.4917727637347132e-05, "loss": 0.2973, "step": 212 }, { "epoch": 0.14348265409228697, "grad_norm": 0.5999878644943237, "learning_rate": 1.4915100065418302e-05, "loss": 0.2714, "step": 213 }, { "epoch": 0.14415628157628832, "grad_norm": 0.6046174764633179, "learning_rate": 1.491243143017229e-05, "loss": 0.2841, "step": 214 }, { "epoch": 0.14482990906028967, "grad_norm": 0.6034740209579468, "learning_rate": 1.4909721746387454e-05, "loss": 0.2896, "step": 215 }, { "epoch": 0.14550353654429102, "grad_norm": 0.6835145354270935, "learning_rate": 1.4906971029069473e-05, "loss": 0.2778, "step": 216 }, { "epoch": 0.14617716402829237, "grad_norm": 0.6769616603851318, "learning_rate": 1.490417929345126e-05, "loss": 0.2697, "step": 217 }, { "epoch": 0.1468507915122937, "grad_norm": 0.6558434367179871, "learning_rate": 1.4901346554992879e-05, "loss": 0.2708, "step": 218 }, { "epoch": 0.14752441899629504, "grad_norm": 0.6363021731376648, "learning_rate": 1.489847282938146e-05, "loss": 0.297, "step": 219 }, { "epoch": 0.14819804648029639, "grad_norm": 0.6437724828720093, "learning_rate": 1.4895558132531112e-05, "loss": 0.2827, "step": 220 }, { "epoch": 0.14887167396429773, "grad_norm": 0.6295124292373657, "learning_rate": 1.4892602480582836e-05, "loss": 0.2998, "step": 221 }, { "epoch": 0.14954530144829908, "grad_norm": 0.634768545627594, "learning_rate": 1.4889605889904426e-05, "loss": 0.2686, "step": 222 }, { "epoch": 0.15021892893230043, "grad_norm": 0.624239981174469, "learning_rate": 1.4886568377090396e-05, "loss": 0.3161, "step": 223 }, { "epoch": 0.15089255641630178, "grad_norm": 0.6285136342048645, "learning_rate": 1.4883489958961875e-05, "loss": 0.3089, "step": 224 }, { "epoch": 0.15156618390030313, "grad_norm": 0.6140178442001343, "learning_rate": 1.4880370652566516e-05, "loss": 0.2888, "step": 225 }, { "epoch": 0.15223981138430448, "grad_norm": 0.5987722873687744, "learning_rate": 1.4877210475178403e-05, "loss": 0.2586, "step": 226 }, { "epoch": 0.15291343886830583, "grad_norm": 0.6315680146217346, "learning_rate": 1.487400944429796e-05, "loss": 0.2876, "step": 227 }, { "epoch": 0.15358706635230718, "grad_norm": 0.6932382583618164, "learning_rate": 1.487076757765184e-05, "loss": 0.2886, "step": 228 }, { "epoch": 0.15426069383630853, "grad_norm": 0.5736963748931885, "learning_rate": 1.4867484893192847e-05, "loss": 0.2524, "step": 229 }, { "epoch": 0.15493432132030988, "grad_norm": 0.6102257370948792, "learning_rate": 1.4864161409099814e-05, "loss": 0.2518, "step": 230 }, { "epoch": 0.15560794880431122, "grad_norm": 0.5340930819511414, "learning_rate": 1.4860797143777526e-05, "loss": 0.2466, "step": 231 }, { "epoch": 0.15628157628831257, "grad_norm": 0.6170995831489563, "learning_rate": 1.4857392115856597e-05, "loss": 0.2588, "step": 232 }, { "epoch": 0.15695520377231392, "grad_norm": 0.5439332127571106, "learning_rate": 1.4853946344193386e-05, "loss": 0.2377, "step": 233 }, { "epoch": 0.15762883125631524, "grad_norm": 0.6084430813789368, "learning_rate": 1.4850459847869866e-05, "loss": 0.2514, "step": 234 }, { "epoch": 0.1583024587403166, "grad_norm": 0.6239585280418396, "learning_rate": 1.4846932646193554e-05, "loss": 0.2892, "step": 235 }, { "epoch": 0.15897608622431794, "grad_norm": 0.6361899375915527, "learning_rate": 1.4843364758697371e-05, "loss": 0.264, "step": 236 }, { "epoch": 0.1596497137083193, "grad_norm": 0.5994705557823181, "learning_rate": 1.4839756205139555e-05, "loss": 0.2756, "step": 237 }, { "epoch": 0.16032334119232064, "grad_norm": 0.6532281041145325, "learning_rate": 1.4836107005503543e-05, "loss": 0.3262, "step": 238 }, { "epoch": 0.160996968676322, "grad_norm": 0.6311124563217163, "learning_rate": 1.483241717999786e-05, "loss": 0.3137, "step": 239 }, { "epoch": 0.16167059616032334, "grad_norm": 0.5731788873672485, "learning_rate": 1.4828686749056007e-05, "loss": 0.2476, "step": 240 }, { "epoch": 0.1623442236443247, "grad_norm": 0.5689460039138794, "learning_rate": 1.4824915733336355e-05, "loss": 0.2717, "step": 241 }, { "epoch": 0.16301785112832604, "grad_norm": 0.6340669989585876, "learning_rate": 1.4821104153722023e-05, "loss": 0.2756, "step": 242 }, { "epoch": 0.16369147861232738, "grad_norm": 0.6497682929039001, "learning_rate": 1.4817252031320766e-05, "loss": 0.3197, "step": 243 }, { "epoch": 0.16436510609632873, "grad_norm": 0.6404630541801453, "learning_rate": 1.481335938746485e-05, "loss": 0.2641, "step": 244 }, { "epoch": 0.16503873358033008, "grad_norm": 0.5862687230110168, "learning_rate": 1.480942624371095e-05, "loss": 0.261, "step": 245 }, { "epoch": 0.16571236106433143, "grad_norm": 0.6154356598854065, "learning_rate": 1.4805452621840015e-05, "loss": 0.2856, "step": 246 }, { "epoch": 0.16638598854833278, "grad_norm": 0.7411592602729797, "learning_rate": 1.4801438543857154e-05, "loss": 0.2838, "step": 247 }, { "epoch": 0.16705961603233413, "grad_norm": 0.6304882764816284, "learning_rate": 1.479738403199152e-05, "loss": 0.3102, "step": 248 }, { "epoch": 0.16773324351633548, "grad_norm": 0.5838252305984497, "learning_rate": 1.479328910869617e-05, "loss": 0.3074, "step": 249 }, { "epoch": 0.16840687100033683, "grad_norm": 0.6592857241630554, "learning_rate": 1.4789153796647957e-05, "loss": 0.2482, "step": 250 }, { "epoch": 0.16908049848433815, "grad_norm": 0.6678220629692078, "learning_rate": 1.4784978118747404e-05, "loss": 0.2858, "step": 251 }, { "epoch": 0.1697541259683395, "grad_norm": 0.7072235345840454, "learning_rate": 1.4780762098118564e-05, "loss": 0.317, "step": 252 }, { "epoch": 0.17042775345234085, "grad_norm": 0.6481045484542847, "learning_rate": 1.4776505758108901e-05, "loss": 0.3074, "step": 253 }, { "epoch": 0.1711013809363422, "grad_norm": 0.573128342628479, "learning_rate": 1.477220912228916e-05, "loss": 0.2421, "step": 254 }, { "epoch": 0.17177500842034354, "grad_norm": 0.5758487582206726, "learning_rate": 1.4767872214453241e-05, "loss": 0.2874, "step": 255 }, { "epoch": 0.1724486359043449, "grad_norm": 0.5688092112541199, "learning_rate": 1.4763495058618056e-05, "loss": 0.2897, "step": 256 }, { "epoch": 0.17312226338834624, "grad_norm": 0.607288658618927, "learning_rate": 1.4759077679023406e-05, "loss": 0.2707, "step": 257 }, { "epoch": 0.1737958908723476, "grad_norm": 0.6363064646720886, "learning_rate": 1.4754620100131838e-05, "loss": 0.2977, "step": 258 }, { "epoch": 0.17446951835634894, "grad_norm": 0.6312716007232666, "learning_rate": 1.475012234662852e-05, "loss": 0.2794, "step": 259 }, { "epoch": 0.1751431458403503, "grad_norm": 0.6589624285697937, "learning_rate": 1.4745584443421097e-05, "loss": 0.3483, "step": 260 }, { "epoch": 0.17581677332435164, "grad_norm": 0.5797691345214844, "learning_rate": 1.4741006415639555e-05, "loss": 0.3013, "step": 261 }, { "epoch": 0.176490400808353, "grad_norm": 0.5717487335205078, "learning_rate": 1.473638828863608e-05, "loss": 0.2725, "step": 262 }, { "epoch": 0.17716402829235434, "grad_norm": 0.6161592602729797, "learning_rate": 1.4731730087984924e-05, "loss": 0.3049, "step": 263 }, { "epoch": 0.17783765577635569, "grad_norm": 0.6334370970726013, "learning_rate": 1.4727031839482251e-05, "loss": 0.2844, "step": 264 }, { "epoch": 0.17851128326035703, "grad_norm": 0.576859176158905, "learning_rate": 1.472229356914601e-05, "loss": 0.244, "step": 265 }, { "epoch": 0.17918491074435838, "grad_norm": 0.6241918802261353, "learning_rate": 1.4717515303215776e-05, "loss": 0.2838, "step": 266 }, { "epoch": 0.1798585382283597, "grad_norm": 0.5989061594009399, "learning_rate": 1.4712697068152619e-05, "loss": 0.2984, "step": 267 }, { "epoch": 0.18053216571236105, "grad_norm": 0.5685368180274963, "learning_rate": 1.4707838890638941e-05, "loss": 0.2787, "step": 268 }, { "epoch": 0.1812057931963624, "grad_norm": 0.6349403262138367, "learning_rate": 1.4702940797578345e-05, "loss": 0.3078, "step": 269 }, { "epoch": 0.18187942068036375, "grad_norm": 0.6529637575149536, "learning_rate": 1.4698002816095473e-05, "loss": 0.307, "step": 270 }, { "epoch": 0.1825530481643651, "grad_norm": 0.5679558515548706, "learning_rate": 1.4693024973535863e-05, "loss": 0.25, "step": 271 }, { "epoch": 0.18322667564836645, "grad_norm": 0.5999310612678528, "learning_rate": 1.4688007297465796e-05, "loss": 0.259, "step": 272 }, { "epoch": 0.1839003031323678, "grad_norm": 0.6034629344940186, "learning_rate": 1.4682949815672146e-05, "loss": 0.3071, "step": 273 }, { "epoch": 0.18457393061636915, "grad_norm": 0.610670268535614, "learning_rate": 1.467785255616221e-05, "loss": 0.2913, "step": 274 }, { "epoch": 0.1852475581003705, "grad_norm": 0.628016471862793, "learning_rate": 1.4672715547163584e-05, "loss": 0.2839, "step": 275 }, { "epoch": 0.18592118558437185, "grad_norm": 0.6297721862792969, "learning_rate": 1.4667538817123977e-05, "loss": 0.3403, "step": 276 }, { "epoch": 0.1865948130683732, "grad_norm": 0.540552020072937, "learning_rate": 1.4662322394711067e-05, "loss": 0.2454, "step": 277 }, { "epoch": 0.18726844055237454, "grad_norm": 0.513788640499115, "learning_rate": 1.4657066308812342e-05, "loss": 0.233, "step": 278 }, { "epoch": 0.1879420680363759, "grad_norm": 0.6221415996551514, "learning_rate": 1.4651770588534937e-05, "loss": 0.2969, "step": 279 }, { "epoch": 0.18861569552037724, "grad_norm": 0.5859697461128235, "learning_rate": 1.4646435263205475e-05, "loss": 0.2771, "step": 280 }, { "epoch": 0.1892893230043786, "grad_norm": 0.5720670819282532, "learning_rate": 1.4641060362369904e-05, "loss": 0.2758, "step": 281 }, { "epoch": 0.18996295048837994, "grad_norm": 0.5609393119812012, "learning_rate": 1.4635645915793333e-05, "loss": 0.256, "step": 282 }, { "epoch": 0.19063657797238126, "grad_norm": 0.5734854340553284, "learning_rate": 1.4630191953459862e-05, "loss": 0.3233, "step": 283 }, { "epoch": 0.1913102054563826, "grad_norm": 0.570590615272522, "learning_rate": 1.4624698505572432e-05, "loss": 0.2757, "step": 284 }, { "epoch": 0.19198383294038396, "grad_norm": 0.623126208782196, "learning_rate": 1.4619165602552637e-05, "loss": 0.2964, "step": 285 }, { "epoch": 0.1926574604243853, "grad_norm": 0.5599439144134521, "learning_rate": 1.4613593275040572e-05, "loss": 0.2582, "step": 286 }, { "epoch": 0.19333108790838666, "grad_norm": 0.5614957809448242, "learning_rate": 1.4607981553894654e-05, "loss": 0.27, "step": 287 }, { "epoch": 0.194004715392388, "grad_norm": 0.5625648498535156, "learning_rate": 1.4602330470191453e-05, "loss": 0.2751, "step": 288 }, { "epoch": 0.19467834287638935, "grad_norm": 0.5504026412963867, "learning_rate": 1.4596640055225521e-05, "loss": 0.2429, "step": 289 }, { "epoch": 0.1953519703603907, "grad_norm": 0.5794048309326172, "learning_rate": 1.4590910340509224e-05, "loss": 0.2882, "step": 290 }, { "epoch": 0.19602559784439205, "grad_norm": 0.550942599773407, "learning_rate": 1.4585141357772554e-05, "loss": 0.2604, "step": 291 }, { "epoch": 0.1966992253283934, "grad_norm": 0.6088408827781677, "learning_rate": 1.4579333138962966e-05, "loss": 0.2993, "step": 292 }, { "epoch": 0.19737285281239475, "grad_norm": 0.6309805512428284, "learning_rate": 1.4573485716245193e-05, "loss": 0.297, "step": 293 }, { "epoch": 0.1980464802963961, "grad_norm": 0.6433154344558716, "learning_rate": 1.456759912200108e-05, "loss": 0.2919, "step": 294 }, { "epoch": 0.19872010778039745, "grad_norm": 0.6373067498207092, "learning_rate": 1.456167338882938e-05, "loss": 0.2719, "step": 295 }, { "epoch": 0.1993937352643988, "grad_norm": 0.5514649748802185, "learning_rate": 1.4555708549545607e-05, "loss": 0.2638, "step": 296 }, { "epoch": 0.20006736274840015, "grad_norm": 0.5804110169410706, "learning_rate": 1.4549704637181827e-05, "loss": 0.2828, "step": 297 }, { "epoch": 0.2007409902324015, "grad_norm": 0.5397315621376038, "learning_rate": 1.4543661684986484e-05, "loss": 0.2712, "step": 298 }, { "epoch": 0.20141461771640282, "grad_norm": 0.6435424089431763, "learning_rate": 1.4537579726424221e-05, "loss": 0.3095, "step": 299 }, { "epoch": 0.20208824520040417, "grad_norm": 0.5241397023200989, "learning_rate": 1.453145879517569e-05, "loss": 0.2635, "step": 300 }, { "epoch": 0.20208824520040417, "eval_loss": 0.2736159861087799, "eval_runtime": 107.1602, "eval_samples_per_second": 46.659, "eval_steps_per_second": 2.921, "step": 300 }, { "epoch": 0.20276187268440551, "grad_norm": 0.5774008631706238, "learning_rate": 1.4525298925137362e-05, "loss": 0.2752, "step": 301 }, { "epoch": 0.20343550016840686, "grad_norm": 0.5994575619697571, "learning_rate": 1.4519100150421343e-05, "loss": 0.3073, "step": 302 }, { "epoch": 0.2041091276524082, "grad_norm": 0.5691470503807068, "learning_rate": 1.4512862505355195e-05, "loss": 0.2846, "step": 303 }, { "epoch": 0.20478275513640956, "grad_norm": 0.5722606182098389, "learning_rate": 1.450658602448172e-05, "loss": 0.2549, "step": 304 }, { "epoch": 0.2054563826204109, "grad_norm": 0.632279634475708, "learning_rate": 1.45002707425588e-05, "loss": 0.3197, "step": 305 }, { "epoch": 0.20613001010441226, "grad_norm": 0.5538962483406067, "learning_rate": 1.449391669455918e-05, "loss": 0.2656, "step": 306 }, { "epoch": 0.2068036375884136, "grad_norm": 0.5925297737121582, "learning_rate": 1.4487523915670286e-05, "loss": 0.2821, "step": 307 }, { "epoch": 0.20747726507241496, "grad_norm": 0.6299713850021362, "learning_rate": 1.448109244129403e-05, "loss": 0.3116, "step": 308 }, { "epoch": 0.2081508925564163, "grad_norm": 0.6114513874053955, "learning_rate": 1.447462230704661e-05, "loss": 0.285, "step": 309 }, { "epoch": 0.20882452004041765, "grad_norm": 0.5723987817764282, "learning_rate": 1.4468113548758313e-05, "loss": 0.278, "step": 310 }, { "epoch": 0.209498147524419, "grad_norm": 0.5769573450088501, "learning_rate": 1.4461566202473322e-05, "loss": 0.2892, "step": 311 }, { "epoch": 0.21017177500842035, "grad_norm": 0.6040593981742859, "learning_rate": 1.4454980304449506e-05, "loss": 0.3123, "step": 312 }, { "epoch": 0.2108454024924217, "grad_norm": 0.5362566113471985, "learning_rate": 1.4448355891158235e-05, "loss": 0.24, "step": 313 }, { "epoch": 0.21151902997642305, "grad_norm": 0.560070812702179, "learning_rate": 1.4441692999284159e-05, "loss": 0.2663, "step": 314 }, { "epoch": 0.21219265746042437, "grad_norm": 0.6649965047836304, "learning_rate": 1.443499166572502e-05, "loss": 0.3441, "step": 315 }, { "epoch": 0.21286628494442572, "grad_norm": 0.5337359309196472, "learning_rate": 1.4428251927591445e-05, "loss": 0.253, "step": 316 }, { "epoch": 0.21353991242842707, "grad_norm": 0.6185274720191956, "learning_rate": 1.4421473822206729e-05, "loss": 0.305, "step": 317 }, { "epoch": 0.21421353991242842, "grad_norm": 0.5125210881233215, "learning_rate": 1.4414657387106646e-05, "loss": 0.2774, "step": 318 }, { "epoch": 0.21488716739642977, "grad_norm": 0.5758813619613647, "learning_rate": 1.4407802660039226e-05, "loss": 0.2484, "step": 319 }, { "epoch": 0.21556079488043112, "grad_norm": 0.5220269560813904, "learning_rate": 1.4400909678964556e-05, "loss": 0.2399, "step": 320 }, { "epoch": 0.21623442236443247, "grad_norm": 0.5919392704963684, "learning_rate": 1.4393978482054561e-05, "loss": 0.2924, "step": 321 }, { "epoch": 0.21690804984843381, "grad_norm": 0.5359899997711182, "learning_rate": 1.4387009107692808e-05, "loss": 0.2493, "step": 322 }, { "epoch": 0.21758167733243516, "grad_norm": 0.568356454372406, "learning_rate": 1.4380001594474267e-05, "loss": 0.2877, "step": 323 }, { "epoch": 0.2182553048164365, "grad_norm": 0.5183501243591309, "learning_rate": 1.4372955981205127e-05, "loss": 0.262, "step": 324 }, { "epoch": 0.21892893230043786, "grad_norm": 0.5353648662567139, "learning_rate": 1.436587230690256e-05, "loss": 0.269, "step": 325 }, { "epoch": 0.2196025597844392, "grad_norm": 0.5863710641860962, "learning_rate": 1.4358750610794522e-05, "loss": 0.2933, "step": 326 }, { "epoch": 0.22027618726844056, "grad_norm": 0.5193360447883606, "learning_rate": 1.4351590932319506e-05, "loss": 0.2539, "step": 327 }, { "epoch": 0.2209498147524419, "grad_norm": 0.521597146987915, "learning_rate": 1.4344393311126367e-05, "loss": 0.24, "step": 328 }, { "epoch": 0.22162344223644326, "grad_norm": 0.5621289014816284, "learning_rate": 1.4337157787074063e-05, "loss": 0.2647, "step": 329 }, { "epoch": 0.2222970697204446, "grad_norm": 0.6134183406829834, "learning_rate": 1.432988440023146e-05, "loss": 0.2846, "step": 330 }, { "epoch": 0.22297069720444593, "grad_norm": 0.5819990634918213, "learning_rate": 1.4322573190877091e-05, "loss": 0.2725, "step": 331 }, { "epoch": 0.22364432468844728, "grad_norm": 0.6009438037872314, "learning_rate": 1.4315224199498952e-05, "loss": 0.2507, "step": 332 }, { "epoch": 0.22431795217244863, "grad_norm": 0.5484105944633484, "learning_rate": 1.4307837466794258e-05, "loss": 0.2715, "step": 333 }, { "epoch": 0.22499157965644997, "grad_norm": 0.5025244951248169, "learning_rate": 1.4300413033669241e-05, "loss": 0.2257, "step": 334 }, { "epoch": 0.22566520714045132, "grad_norm": 0.5583484172821045, "learning_rate": 1.4292950941238898e-05, "loss": 0.3015, "step": 335 }, { "epoch": 0.22633883462445267, "grad_norm": 0.5975006222724915, "learning_rate": 1.4285451230826783e-05, "loss": 0.2924, "step": 336 }, { "epoch": 0.22701246210845402, "grad_norm": 0.6017155051231384, "learning_rate": 1.4277913943964763e-05, "loss": 0.2928, "step": 337 }, { "epoch": 0.22768608959245537, "grad_norm": 0.5619384050369263, "learning_rate": 1.4270339122392808e-05, "loss": 0.2744, "step": 338 }, { "epoch": 0.22835971707645672, "grad_norm": 0.576554536819458, "learning_rate": 1.4262726808058735e-05, "loss": 0.3019, "step": 339 }, { "epoch": 0.22903334456045807, "grad_norm": 0.5621641874313354, "learning_rate": 1.4255077043117994e-05, "loss": 0.2801, "step": 340 }, { "epoch": 0.22970697204445942, "grad_norm": 0.5104705095291138, "learning_rate": 1.424738986993343e-05, "loss": 0.2572, "step": 341 }, { "epoch": 0.23038059952846077, "grad_norm": 0.5731213688850403, "learning_rate": 1.4239665331075048e-05, "loss": 0.2545, "step": 342 }, { "epoch": 0.23105422701246212, "grad_norm": 0.6381127238273621, "learning_rate": 1.4231903469319772e-05, "loss": 0.3023, "step": 343 }, { "epoch": 0.23172785449646346, "grad_norm": 0.5358138680458069, "learning_rate": 1.4224104327651213e-05, "loss": 0.2597, "step": 344 }, { "epoch": 0.2324014819804648, "grad_norm": 0.5517827272415161, "learning_rate": 1.4216267949259437e-05, "loss": 0.2669, "step": 345 }, { "epoch": 0.23307510946446616, "grad_norm": 0.5380638241767883, "learning_rate": 1.4208394377540712e-05, "loss": 0.2706, "step": 346 }, { "epoch": 0.23374873694846748, "grad_norm": 0.6162987351417542, "learning_rate": 1.4200483656097278e-05, "loss": 0.2721, "step": 347 }, { "epoch": 0.23442236443246883, "grad_norm": 0.6142714619636536, "learning_rate": 1.4192535828737102e-05, "loss": 0.3158, "step": 348 }, { "epoch": 0.23509599191647018, "grad_norm": 0.6231828331947327, "learning_rate": 1.4184550939473644e-05, "loss": 0.3022, "step": 349 }, { "epoch": 0.23576961940047153, "grad_norm": 0.5371239185333252, "learning_rate": 1.4176529032525584e-05, "loss": 0.2372, "step": 350 }, { "epoch": 0.23644324688447288, "grad_norm": 0.5987442135810852, "learning_rate": 1.4168470152316624e-05, "loss": 0.2856, "step": 351 }, { "epoch": 0.23711687436847423, "grad_norm": 0.5490831732749939, "learning_rate": 1.41603743434752e-05, "loss": 0.2352, "step": 352 }, { "epoch": 0.23779050185247558, "grad_norm": 0.5611885786056519, "learning_rate": 1.415224165083426e-05, "loss": 0.2763, "step": 353 }, { "epoch": 0.23846412933647693, "grad_norm": 0.5451275706291199, "learning_rate": 1.4144072119431e-05, "loss": 0.2725, "step": 354 }, { "epoch": 0.23913775682047828, "grad_norm": 0.5789247155189514, "learning_rate": 1.413586579450662e-05, "loss": 0.2604, "step": 355 }, { "epoch": 0.23981138430447962, "grad_norm": 0.6164606213569641, "learning_rate": 1.4127622721506087e-05, "loss": 0.2932, "step": 356 }, { "epoch": 0.24048501178848097, "grad_norm": 0.5564325451850891, "learning_rate": 1.4119342946077864e-05, "loss": 0.2735, "step": 357 }, { "epoch": 0.24115863927248232, "grad_norm": 0.6473014950752258, "learning_rate": 1.4111026514073657e-05, "loss": 0.2808, "step": 358 }, { "epoch": 0.24183226675648367, "grad_norm": 0.5950415730476379, "learning_rate": 1.4102673471548186e-05, "loss": 0.2819, "step": 359 }, { "epoch": 0.24250589424048502, "grad_norm": 0.576295793056488, "learning_rate": 1.4094283864758896e-05, "loss": 0.2818, "step": 360 }, { "epoch": 0.24317952172448637, "grad_norm": 0.5290201306343079, "learning_rate": 1.4085857740165727e-05, "loss": 0.2731, "step": 361 }, { "epoch": 0.24385314920848772, "grad_norm": 0.5469079613685608, "learning_rate": 1.4077395144430845e-05, "loss": 0.2533, "step": 362 }, { "epoch": 0.24452677669248907, "grad_norm": 0.553629457950592, "learning_rate": 1.4068896124418383e-05, "loss": 0.2784, "step": 363 }, { "epoch": 0.2452004041764904, "grad_norm": 0.5426369905471802, "learning_rate": 1.4060360727194188e-05, "loss": 0.2687, "step": 364 }, { "epoch": 0.24587403166049174, "grad_norm": 0.5466113686561584, "learning_rate": 1.4051789000025555e-05, "loss": 0.2721, "step": 365 }, { "epoch": 0.2465476591444931, "grad_norm": 0.5685258507728577, "learning_rate": 1.4043180990380968e-05, "loss": 0.283, "step": 366 }, { "epoch": 0.24722128662849444, "grad_norm": 0.5648797154426575, "learning_rate": 1.4034536745929835e-05, "loss": 0.2579, "step": 367 }, { "epoch": 0.24789491411249578, "grad_norm": 0.5363840460777283, "learning_rate": 1.4025856314542223e-05, "loss": 0.2577, "step": 368 }, { "epoch": 0.24856854159649713, "grad_norm": 0.5171375870704651, "learning_rate": 1.40171397442886e-05, "loss": 0.2351, "step": 369 }, { "epoch": 0.24924216908049848, "grad_norm": 0.646500825881958, "learning_rate": 1.4008387083439554e-05, "loss": 0.3039, "step": 370 }, { "epoch": 0.24991579656449983, "grad_norm": 0.5827479362487793, "learning_rate": 1.3999598380465552e-05, "loss": 0.2913, "step": 371 }, { "epoch": 0.25058942404850115, "grad_norm": 0.5602329969406128, "learning_rate": 1.3990773684036636e-05, "loss": 0.2822, "step": 372 }, { "epoch": 0.2512630515325025, "grad_norm": 0.5731973648071289, "learning_rate": 1.3981913043022187e-05, "loss": 0.2638, "step": 373 }, { "epoch": 0.25193667901650385, "grad_norm": 0.6127945780754089, "learning_rate": 1.397301650649063e-05, "loss": 0.314, "step": 374 }, { "epoch": 0.2526103065005052, "grad_norm": 0.5554071664810181, "learning_rate": 1.396408412370918e-05, "loss": 0.2575, "step": 375 }, { "epoch": 0.25328393398450655, "grad_norm": 0.5913053750991821, "learning_rate": 1.3955115944143558e-05, "loss": 0.2669, "step": 376 }, { "epoch": 0.2539575614685079, "grad_norm": 0.6104479432106018, "learning_rate": 1.3946112017457715e-05, "loss": 0.2575, "step": 377 }, { "epoch": 0.25463118895250925, "grad_norm": 0.6109972596168518, "learning_rate": 1.393707239351357e-05, "loss": 0.3141, "step": 378 }, { "epoch": 0.2553048164365106, "grad_norm": 0.605560302734375, "learning_rate": 1.3927997122370724e-05, "loss": 0.2869, "step": 379 }, { "epoch": 0.25597844392051194, "grad_norm": 0.5215985774993896, "learning_rate": 1.3918886254286182e-05, "loss": 0.2464, "step": 380 }, { "epoch": 0.2566520714045133, "grad_norm": 0.5480206608772278, "learning_rate": 1.3909739839714081e-05, "loss": 0.2713, "step": 381 }, { "epoch": 0.25732569888851464, "grad_norm": 0.5150758028030396, "learning_rate": 1.3900557929305408e-05, "loss": 0.2537, "step": 382 }, { "epoch": 0.257999326372516, "grad_norm": 0.606860876083374, "learning_rate": 1.3891340573907715e-05, "loss": 0.2929, "step": 383 }, { "epoch": 0.25867295385651734, "grad_norm": 0.5383312106132507, "learning_rate": 1.3882087824564841e-05, "loss": 0.2778, "step": 384 }, { "epoch": 0.2593465813405187, "grad_norm": 0.5356404185295105, "learning_rate": 1.3872799732516635e-05, "loss": 0.2318, "step": 385 }, { "epoch": 0.26002020882452004, "grad_norm": 0.5665723085403442, "learning_rate": 1.386347634919866e-05, "loss": 0.2898, "step": 386 }, { "epoch": 0.2606938363085214, "grad_norm": 0.5390300750732422, "learning_rate": 1.3854117726241922e-05, "loss": 0.2789, "step": 387 }, { "epoch": 0.26136746379252274, "grad_norm": 0.5479271411895752, "learning_rate": 1.3844723915472568e-05, "loss": 0.2552, "step": 388 }, { "epoch": 0.2620410912765241, "grad_norm": 0.6038428544998169, "learning_rate": 1.3835294968911615e-05, "loss": 0.3018, "step": 389 }, { "epoch": 0.26271471876052543, "grad_norm": 0.5380761027336121, "learning_rate": 1.3825830938774653e-05, "loss": 0.2683, "step": 390 }, { "epoch": 0.2633883462445268, "grad_norm": 0.5072317719459534, "learning_rate": 1.3816331877471562e-05, "loss": 0.2728, "step": 391 }, { "epoch": 0.26406197372852813, "grad_norm": 0.5953329205513, "learning_rate": 1.3806797837606206e-05, "loss": 0.2644, "step": 392 }, { "epoch": 0.2647356012125295, "grad_norm": 0.5941304564476013, "learning_rate": 1.3797228871976162e-05, "loss": 0.2841, "step": 393 }, { "epoch": 0.26540922869653083, "grad_norm": 0.6646502614021301, "learning_rate": 1.378762503357242e-05, "loss": 0.2966, "step": 394 }, { "epoch": 0.2660828561805322, "grad_norm": 0.545456051826477, "learning_rate": 1.377798637557908e-05, "loss": 0.2481, "step": 395 }, { "epoch": 0.26675648366453353, "grad_norm": 0.5886520147323608, "learning_rate": 1.3768312951373076e-05, "loss": 0.2735, "step": 396 }, { "epoch": 0.2674301111485349, "grad_norm": 0.5731514096260071, "learning_rate": 1.3758604814523863e-05, "loss": 0.2953, "step": 397 }, { "epoch": 0.2681037386325362, "grad_norm": 0.5029922723770142, "learning_rate": 1.3748862018793131e-05, "loss": 0.228, "step": 398 }, { "epoch": 0.2687773661165376, "grad_norm": 0.557115375995636, "learning_rate": 1.3739084618134502e-05, "loss": 0.2861, "step": 399 }, { "epoch": 0.2694509936005389, "grad_norm": 0.5246098041534424, "learning_rate": 1.3729272666693235e-05, "loss": 0.2705, "step": 400 }, { "epoch": 0.2694509936005389, "eval_loss": 0.2706840932369232, "eval_runtime": 105.373, "eval_samples_per_second": 47.451, "eval_steps_per_second": 2.97, "step": 400 }, { "epoch": 0.2701246210845403, "grad_norm": 0.5355361104011536, "learning_rate": 1.371942621880592e-05, "loss": 0.249, "step": 401 }, { "epoch": 0.2707982485685416, "grad_norm": 0.5726237297058105, "learning_rate": 1.3709545329000187e-05, "loss": 0.2849, "step": 402 }, { "epoch": 0.27147187605254297, "grad_norm": 0.5560792088508606, "learning_rate": 1.3699630051994395e-05, "loss": 0.2397, "step": 403 }, { "epoch": 0.27214550353654426, "grad_norm": 0.509462833404541, "learning_rate": 1.3689680442697332e-05, "loss": 0.2412, "step": 404 }, { "epoch": 0.2728191310205456, "grad_norm": 0.5348261594772339, "learning_rate": 1.3679696556207913e-05, "loss": 0.2588, "step": 405 }, { "epoch": 0.27349275850454696, "grad_norm": 0.5228528380393982, "learning_rate": 1.3669678447814871e-05, "loss": 0.2482, "step": 406 }, { "epoch": 0.2741663859885483, "grad_norm": 0.5533547401428223, "learning_rate": 1.3659626172996459e-05, "loss": 0.2581, "step": 407 }, { "epoch": 0.27484001347254966, "grad_norm": 0.538163959980011, "learning_rate": 1.3649539787420126e-05, "loss": 0.2444, "step": 408 }, { "epoch": 0.275513640956551, "grad_norm": 0.6091170907020569, "learning_rate": 1.3639419346942227e-05, "loss": 0.2963, "step": 409 }, { "epoch": 0.27618726844055236, "grad_norm": 0.5507506728172302, "learning_rate": 1.3629264907607709e-05, "loss": 0.2835, "step": 410 }, { "epoch": 0.2768608959245537, "grad_norm": 0.5167334079742432, "learning_rate": 1.361907652564979e-05, "loss": 0.2751, "step": 411 }, { "epoch": 0.27753452340855506, "grad_norm": 0.6182762384414673, "learning_rate": 1.3608854257489656e-05, "loss": 0.2953, "step": 412 }, { "epoch": 0.2782081508925564, "grad_norm": 0.6356998085975647, "learning_rate": 1.3598598159736155e-05, "loss": 0.2586, "step": 413 }, { "epoch": 0.27888177837655775, "grad_norm": 0.5957326889038086, "learning_rate": 1.358830828918547e-05, "loss": 0.283, "step": 414 }, { "epoch": 0.2795554058605591, "grad_norm": 0.5173368453979492, "learning_rate": 1.3577984702820811e-05, "loss": 0.2403, "step": 415 }, { "epoch": 0.28022903334456045, "grad_norm": 0.5449368357658386, "learning_rate": 1.3567627457812107e-05, "loss": 0.2641, "step": 416 }, { "epoch": 0.2809026608285618, "grad_norm": 0.6340479850769043, "learning_rate": 1.355723661151567e-05, "loss": 0.3286, "step": 417 }, { "epoch": 0.28157628831256315, "grad_norm": 0.49671491980552673, "learning_rate": 1.3546812221473898e-05, "loss": 0.2585, "step": 418 }, { "epoch": 0.2822499157965645, "grad_norm": 0.5974727272987366, "learning_rate": 1.3536354345414944e-05, "loss": 0.2674, "step": 419 }, { "epoch": 0.28292354328056585, "grad_norm": 0.5984825491905212, "learning_rate": 1.35258630412524e-05, "loss": 0.2548, "step": 420 }, { "epoch": 0.2835971707645672, "grad_norm": 0.5152942538261414, "learning_rate": 1.3515338367084975e-05, "loss": 0.2323, "step": 421 }, { "epoch": 0.28427079824856855, "grad_norm": 0.5210486054420471, "learning_rate": 1.3504780381196178e-05, "loss": 0.2538, "step": 422 }, { "epoch": 0.2849444257325699, "grad_norm": 0.6852086782455444, "learning_rate": 1.3494189142053988e-05, "loss": 0.3409, "step": 423 }, { "epoch": 0.28561805321657124, "grad_norm": 0.5637288689613342, "learning_rate": 1.3483564708310535e-05, "loss": 0.2435, "step": 424 }, { "epoch": 0.2862916807005726, "grad_norm": 0.565467357635498, "learning_rate": 1.3472907138801775e-05, "loss": 0.2699, "step": 425 }, { "epoch": 0.28696530818457394, "grad_norm": 0.6443371176719666, "learning_rate": 1.346221649254716e-05, "loss": 0.3226, "step": 426 }, { "epoch": 0.2876389356685753, "grad_norm": 0.5877301096916199, "learning_rate": 1.3451492828749317e-05, "loss": 0.2626, "step": 427 }, { "epoch": 0.28831256315257664, "grad_norm": 0.635368824005127, "learning_rate": 1.3440736206793717e-05, "loss": 0.2808, "step": 428 }, { "epoch": 0.288986190636578, "grad_norm": 0.5623096823692322, "learning_rate": 1.3429946686248346e-05, "loss": 0.2583, "step": 429 }, { "epoch": 0.28965981812057934, "grad_norm": 0.5355499386787415, "learning_rate": 1.341912432686338e-05, "loss": 0.2425, "step": 430 }, { "epoch": 0.2903334456045807, "grad_norm": 0.5870991349220276, "learning_rate": 1.3408269188570837e-05, "loss": 0.2638, "step": 431 }, { "epoch": 0.29100707308858204, "grad_norm": 0.5296127796173096, "learning_rate": 1.3397381331484273e-05, "loss": 0.2587, "step": 432 }, { "epoch": 0.2916807005725834, "grad_norm": 0.5635933876037598, "learning_rate": 1.3386460815898427e-05, "loss": 0.2966, "step": 433 }, { "epoch": 0.29235432805658473, "grad_norm": 0.5246622562408447, "learning_rate": 1.3375507702288894e-05, "loss": 0.2513, "step": 434 }, { "epoch": 0.2930279555405861, "grad_norm": 0.6050205826759338, "learning_rate": 1.3364522051311793e-05, "loss": 0.3016, "step": 435 }, { "epoch": 0.2937015830245874, "grad_norm": 0.5831138491630554, "learning_rate": 1.3353503923803424e-05, "loss": 0.312, "step": 436 }, { "epoch": 0.2943752105085887, "grad_norm": 0.5354754328727722, "learning_rate": 1.3342453380779939e-05, "loss": 0.2743, "step": 437 }, { "epoch": 0.2950488379925901, "grad_norm": 0.6059128642082214, "learning_rate": 1.3331370483437e-05, "loss": 0.2836, "step": 438 }, { "epoch": 0.2957224654765914, "grad_norm": 0.6208754181861877, "learning_rate": 1.332025529314944e-05, "loss": 0.3069, "step": 439 }, { "epoch": 0.29639609296059277, "grad_norm": 0.5791683197021484, "learning_rate": 1.3309107871470922e-05, "loss": 0.2904, "step": 440 }, { "epoch": 0.2970697204445941, "grad_norm": 0.5765690803527832, "learning_rate": 1.3297928280133606e-05, "loss": 0.3015, "step": 441 }, { "epoch": 0.29774334792859547, "grad_norm": 0.5978572368621826, "learning_rate": 1.3286716581047791e-05, "loss": 0.2827, "step": 442 }, { "epoch": 0.2984169754125968, "grad_norm": 0.5690959692001343, "learning_rate": 1.3275472836301592e-05, "loss": 0.2819, "step": 443 }, { "epoch": 0.29909060289659817, "grad_norm": 0.5888264775276184, "learning_rate": 1.3264197108160582e-05, "loss": 0.297, "step": 444 }, { "epoch": 0.2997642303805995, "grad_norm": 0.566338837146759, "learning_rate": 1.3252889459067452e-05, "loss": 0.2703, "step": 445 }, { "epoch": 0.30043785786460087, "grad_norm": 0.5249893665313721, "learning_rate": 1.3241549951641663e-05, "loss": 0.252, "step": 446 }, { "epoch": 0.3011114853486022, "grad_norm": 0.6007825136184692, "learning_rate": 1.3230178648679102e-05, "loss": 0.2696, "step": 447 }, { "epoch": 0.30178511283260356, "grad_norm": 0.5482873916625977, "learning_rate": 1.3218775613151737e-05, "loss": 0.2523, "step": 448 }, { "epoch": 0.3024587403166049, "grad_norm": 0.6056886315345764, "learning_rate": 1.3207340908207258e-05, "loss": 0.2616, "step": 449 }, { "epoch": 0.30313236780060626, "grad_norm": 0.5885447859764099, "learning_rate": 1.319587459716874e-05, "loss": 0.2976, "step": 450 }, { "epoch": 0.3038059952846076, "grad_norm": 0.5747894644737244, "learning_rate": 1.318437674353428e-05, "loss": 0.2898, "step": 451 }, { "epoch": 0.30447962276860896, "grad_norm": 0.569401741027832, "learning_rate": 1.3172847410976658e-05, "loss": 0.3104, "step": 452 }, { "epoch": 0.3051532502526103, "grad_norm": 0.5612210631370544, "learning_rate": 1.3161286663342972e-05, "loss": 0.2825, "step": 453 }, { "epoch": 0.30582687773661166, "grad_norm": 0.5914261937141418, "learning_rate": 1.3149694564654295e-05, "loss": 0.2781, "step": 454 }, { "epoch": 0.306500505220613, "grad_norm": 0.5259233713150024, "learning_rate": 1.3138071179105314e-05, "loss": 0.2542, "step": 455 }, { "epoch": 0.30717413270461436, "grad_norm": 0.5168178081512451, "learning_rate": 1.3126416571063972e-05, "loss": 0.2514, "step": 456 }, { "epoch": 0.3078477601886157, "grad_norm": 0.5078200101852417, "learning_rate": 1.3114730805071123e-05, "loss": 0.2422, "step": 457 }, { "epoch": 0.30852138767261705, "grad_norm": 0.5727274417877197, "learning_rate": 1.3103013945840166e-05, "loss": 0.2809, "step": 458 }, { "epoch": 0.3091950151566184, "grad_norm": 0.5502845048904419, "learning_rate": 1.309126605825668e-05, "loss": 0.2552, "step": 459 }, { "epoch": 0.30986864264061975, "grad_norm": 0.5696067214012146, "learning_rate": 1.3079487207378084e-05, "loss": 0.2959, "step": 460 }, { "epoch": 0.3105422701246211, "grad_norm": 0.5644879341125488, "learning_rate": 1.3067677458433258e-05, "loss": 0.2713, "step": 461 }, { "epoch": 0.31121589760862245, "grad_norm": 0.5638664364814758, "learning_rate": 1.3055836876822196e-05, "loss": 0.2687, "step": 462 }, { "epoch": 0.3118895250926238, "grad_norm": 0.5337838530540466, "learning_rate": 1.3043965528115625e-05, "loss": 0.2238, "step": 463 }, { "epoch": 0.31256315257662515, "grad_norm": 0.5844706892967224, "learning_rate": 1.3032063478054666e-05, "loss": 0.268, "step": 464 }, { "epoch": 0.3132367800606265, "grad_norm": 0.6730402112007141, "learning_rate": 1.3020130792550456e-05, "loss": 0.2976, "step": 465 }, { "epoch": 0.31391040754462785, "grad_norm": 0.5756520628929138, "learning_rate": 1.3008167537683776e-05, "loss": 0.2859, "step": 466 }, { "epoch": 0.3145840350286292, "grad_norm": 0.5855886340141296, "learning_rate": 1.2996173779704704e-05, "loss": 0.2997, "step": 467 }, { "epoch": 0.3152576625126305, "grad_norm": 0.5359857082366943, "learning_rate": 1.2984149585032237e-05, "loss": 0.2814, "step": 468 }, { "epoch": 0.31593128999663184, "grad_norm": 0.5448024868965149, "learning_rate": 1.2972095020253912e-05, "loss": 0.2681, "step": 469 }, { "epoch": 0.3166049174806332, "grad_norm": 0.518844723701477, "learning_rate": 1.296001015212547e-05, "loss": 0.2538, "step": 470 }, { "epoch": 0.31727854496463453, "grad_norm": 0.5422329306602478, "learning_rate": 1.2947895047570446e-05, "loss": 0.2346, "step": 471 }, { "epoch": 0.3179521724486359, "grad_norm": 0.5567420721054077, "learning_rate": 1.2935749773679833e-05, "loss": 0.259, "step": 472 }, { "epoch": 0.31862579993263723, "grad_norm": 0.5199055671691895, "learning_rate": 1.2923574397711684e-05, "loss": 0.2273, "step": 473 }, { "epoch": 0.3192994274166386, "grad_norm": 0.552947461605072, "learning_rate": 1.291136898709076e-05, "loss": 0.2541, "step": 474 }, { "epoch": 0.31997305490063993, "grad_norm": 0.537124752998352, "learning_rate": 1.2899133609408146e-05, "loss": 0.2709, "step": 475 }, { "epoch": 0.3206466823846413, "grad_norm": 0.5493146777153015, "learning_rate": 1.2886868332420873e-05, "loss": 0.2838, "step": 476 }, { "epoch": 0.32132030986864263, "grad_norm": 0.6109126806259155, "learning_rate": 1.2874573224051556e-05, "loss": 0.3088, "step": 477 }, { "epoch": 0.321993937352644, "grad_norm": 0.5879717469215393, "learning_rate": 1.2862248352388005e-05, "loss": 0.282, "step": 478 }, { "epoch": 0.3226675648366453, "grad_norm": 0.5227838754653931, "learning_rate": 1.2849893785682852e-05, "loss": 0.2646, "step": 479 }, { "epoch": 0.3233411923206467, "grad_norm": 0.4744933545589447, "learning_rate": 1.2837509592353181e-05, "loss": 0.2219, "step": 480 }, { "epoch": 0.324014819804648, "grad_norm": 0.508222758769989, "learning_rate": 1.2825095840980133e-05, "loss": 0.2698, "step": 481 }, { "epoch": 0.3246884472886494, "grad_norm": 0.5351443290710449, "learning_rate": 1.2812652600308544e-05, "loss": 0.2617, "step": 482 }, { "epoch": 0.3253620747726507, "grad_norm": 0.5842475295066833, "learning_rate": 1.2800179939246552e-05, "loss": 0.2496, "step": 483 }, { "epoch": 0.32603570225665207, "grad_norm": 0.5165258646011353, "learning_rate": 1.2787677926865216e-05, "loss": 0.2399, "step": 484 }, { "epoch": 0.3267093297406534, "grad_norm": 0.5721768736839294, "learning_rate": 1.2775146632398142e-05, "loss": 0.2754, "step": 485 }, { "epoch": 0.32738295722465477, "grad_norm": 0.47171083092689514, "learning_rate": 1.2762586125241093e-05, "loss": 0.2107, "step": 486 }, { "epoch": 0.3280565847086561, "grad_norm": 0.5318099856376648, "learning_rate": 1.2749996474951603e-05, "loss": 0.2422, "step": 487 }, { "epoch": 0.32873021219265747, "grad_norm": 0.5478540062904358, "learning_rate": 1.2737377751248598e-05, "loss": 0.2634, "step": 488 }, { "epoch": 0.3294038396766588, "grad_norm": 0.4972551167011261, "learning_rate": 1.2724730024012002e-05, "loss": 0.232, "step": 489 }, { "epoch": 0.33007746716066017, "grad_norm": 0.6141415238380432, "learning_rate": 1.2712053363282363e-05, "loss": 0.2998, "step": 490 }, { "epoch": 0.3307510946446615, "grad_norm": 0.5177733302116394, "learning_rate": 1.2699347839260448e-05, "loss": 0.2574, "step": 491 }, { "epoch": 0.33142472212866286, "grad_norm": 0.5531916618347168, "learning_rate": 1.268661352230687e-05, "loss": 0.2719, "step": 492 }, { "epoch": 0.3320983496126642, "grad_norm": 0.5089963674545288, "learning_rate": 1.2673850482941687e-05, "loss": 0.2508, "step": 493 }, { "epoch": 0.33277197709666556, "grad_norm": 0.557072103023529, "learning_rate": 1.2661058791844016e-05, "loss": 0.2823, "step": 494 }, { "epoch": 0.3334456045806669, "grad_norm": 0.6557756662368774, "learning_rate": 1.2648238519851644e-05, "loss": 0.2821, "step": 495 }, { "epoch": 0.33411923206466826, "grad_norm": 0.5633836984634399, "learning_rate": 1.2635389737960632e-05, "loss": 0.2576, "step": 496 }, { "epoch": 0.3347928595486696, "grad_norm": 0.594456136226654, "learning_rate": 1.262251251732492e-05, "loss": 0.2985, "step": 497 }, { "epoch": 0.33546648703267096, "grad_norm": 0.5753186345100403, "learning_rate": 1.2609606929255942e-05, "loss": 0.2775, "step": 498 }, { "epoch": 0.3361401145166723, "grad_norm": 0.6262162327766418, "learning_rate": 1.259667304522222e-05, "loss": 0.3254, "step": 499 }, { "epoch": 0.33681374200067365, "grad_norm": 0.5529574155807495, "learning_rate": 1.2583710936848977e-05, "loss": 0.2711, "step": 500 }, { "epoch": 0.33681374200067365, "eval_loss": 0.2681807279586792, "eval_runtime": 104.7062, "eval_samples_per_second": 47.753, "eval_steps_per_second": 2.989, "step": 500 }, { "epoch": 0.33748736948467495, "grad_norm": 0.6187270283699036, "learning_rate": 1.2570720675917734e-05, "loss": 0.3082, "step": 501 }, { "epoch": 0.3381609969686763, "grad_norm": 0.5153407454490662, "learning_rate": 1.2557702334365916e-05, "loss": 0.26, "step": 502 }, { "epoch": 0.33883462445267765, "grad_norm": 0.5447744727134705, "learning_rate": 1.2544655984286451e-05, "loss": 0.2641, "step": 503 }, { "epoch": 0.339508251936679, "grad_norm": 0.5450101494789124, "learning_rate": 1.253158169792738e-05, "loss": 0.276, "step": 504 }, { "epoch": 0.34018187942068034, "grad_norm": 0.6855320930480957, "learning_rate": 1.2518479547691437e-05, "loss": 0.3589, "step": 505 }, { "epoch": 0.3408555069046817, "grad_norm": 0.52507483959198, "learning_rate": 1.250534960613567e-05, "loss": 0.2489, "step": 506 }, { "epoch": 0.34152913438868304, "grad_norm": 0.5259436964988708, "learning_rate": 1.2492191945971028e-05, "loss": 0.2568, "step": 507 }, { "epoch": 0.3422027618726844, "grad_norm": 0.5746189951896667, "learning_rate": 1.2479006640061958e-05, "loss": 0.2878, "step": 508 }, { "epoch": 0.34287638935668574, "grad_norm": 0.5484218001365662, "learning_rate": 1.2465793761426005e-05, "loss": 0.3059, "step": 509 }, { "epoch": 0.3435500168406871, "grad_norm": 0.5747763514518738, "learning_rate": 1.24525533832334e-05, "loss": 0.2505, "step": 510 }, { "epoch": 0.34422364432468844, "grad_norm": 0.5692996382713318, "learning_rate": 1.2439285578806678e-05, "loss": 0.3077, "step": 511 }, { "epoch": 0.3448972718086898, "grad_norm": 0.5282084345817566, "learning_rate": 1.2425990421620235e-05, "loss": 0.2763, "step": 512 }, { "epoch": 0.34557089929269114, "grad_norm": 0.4825171232223511, "learning_rate": 1.241266798529995e-05, "loss": 0.2423, "step": 513 }, { "epoch": 0.3462445267766925, "grad_norm": 0.5359032154083252, "learning_rate": 1.239931834362277e-05, "loss": 0.2796, "step": 514 }, { "epoch": 0.34691815426069383, "grad_norm": 0.473827600479126, "learning_rate": 1.2385941570516297e-05, "loss": 0.2531, "step": 515 }, { "epoch": 0.3475917817446952, "grad_norm": 0.4639384150505066, "learning_rate": 1.2372537740058382e-05, "loss": 0.2326, "step": 516 }, { "epoch": 0.34826540922869653, "grad_norm": 0.5909863710403442, "learning_rate": 1.2359106926476714e-05, "loss": 0.2824, "step": 517 }, { "epoch": 0.3489390367126979, "grad_norm": 0.5261175036430359, "learning_rate": 1.234564920414841e-05, "loss": 0.2757, "step": 518 }, { "epoch": 0.34961266419669923, "grad_norm": 0.577748715877533, "learning_rate": 1.2332164647599599e-05, "loss": 0.2619, "step": 519 }, { "epoch": 0.3502862916807006, "grad_norm": 0.5614107251167297, "learning_rate": 1.2318653331505015e-05, "loss": 0.2928, "step": 520 }, { "epoch": 0.35095991916470193, "grad_norm": 0.5660324692726135, "learning_rate": 1.2305115330687585e-05, "loss": 0.2797, "step": 521 }, { "epoch": 0.3516335466487033, "grad_norm": 0.5362821817398071, "learning_rate": 1.2291550720117997e-05, "loss": 0.2931, "step": 522 }, { "epoch": 0.3523071741327046, "grad_norm": 0.5424318909645081, "learning_rate": 1.2277959574914317e-05, "loss": 0.2709, "step": 523 }, { "epoch": 0.352980801616706, "grad_norm": 0.5283873081207275, "learning_rate": 1.226434197034154e-05, "loss": 0.2478, "step": 524 }, { "epoch": 0.3536544291007073, "grad_norm": 0.5451403260231018, "learning_rate": 1.2250697981811195e-05, "loss": 0.2684, "step": 525 }, { "epoch": 0.3543280565847087, "grad_norm": 0.5320309400558472, "learning_rate": 1.2237027684880914e-05, "loss": 0.2678, "step": 526 }, { "epoch": 0.35500168406871, "grad_norm": 0.558335542678833, "learning_rate": 1.2223331155254026e-05, "loss": 0.2715, "step": 527 }, { "epoch": 0.35567531155271137, "grad_norm": 0.5011473298072815, "learning_rate": 1.220960846877913e-05, "loss": 0.2535, "step": 528 }, { "epoch": 0.3563489390367127, "grad_norm": 0.5432257056236267, "learning_rate": 1.2195859701449672e-05, "loss": 0.2802, "step": 529 }, { "epoch": 0.35702256652071407, "grad_norm": 0.5836246013641357, "learning_rate": 1.2182084929403531e-05, "loss": 0.3088, "step": 530 }, { "epoch": 0.3576961940047154, "grad_norm": 0.5858445167541504, "learning_rate": 1.2168284228922597e-05, "loss": 0.2751, "step": 531 }, { "epoch": 0.35836982148871677, "grad_norm": 0.556725800037384, "learning_rate": 1.2154457676432344e-05, "loss": 0.2693, "step": 532 }, { "epoch": 0.35904344897271806, "grad_norm": 0.5822067260742188, "learning_rate": 1.2140605348501409e-05, "loss": 0.3145, "step": 533 }, { "epoch": 0.3597170764567194, "grad_norm": 0.5754439830780029, "learning_rate": 1.212672732184117e-05, "loss": 0.3009, "step": 534 }, { "epoch": 0.36039070394072076, "grad_norm": 0.5826534032821655, "learning_rate": 1.2112823673305317e-05, "loss": 0.3112, "step": 535 }, { "epoch": 0.3610643314247221, "grad_norm": 0.5259435176849365, "learning_rate": 1.209889447988943e-05, "loss": 0.2572, "step": 536 }, { "epoch": 0.36173795890872346, "grad_norm": 0.5303089022636414, "learning_rate": 1.2084939818730554e-05, "loss": 0.2745, "step": 537 }, { "epoch": 0.3624115863927248, "grad_norm": 0.4945959150791168, "learning_rate": 1.2070959767106762e-05, "loss": 0.2624, "step": 538 }, { "epoch": 0.36308521387672615, "grad_norm": 0.5212944149971008, "learning_rate": 1.2056954402436743e-05, "loss": 0.2367, "step": 539 }, { "epoch": 0.3637588413607275, "grad_norm": 0.5474100708961487, "learning_rate": 1.2042923802279356e-05, "loss": 0.2922, "step": 540 }, { "epoch": 0.36443246884472885, "grad_norm": 0.5586138963699341, "learning_rate": 1.2028868044333218e-05, "loss": 0.2779, "step": 541 }, { "epoch": 0.3651060963287302, "grad_norm": 0.4587612450122833, "learning_rate": 1.2014787206436256e-05, "loss": 0.2291, "step": 542 }, { "epoch": 0.36577972381273155, "grad_norm": 0.5979660749435425, "learning_rate": 1.200068136656529e-05, "loss": 0.2663, "step": 543 }, { "epoch": 0.3664533512967329, "grad_norm": 0.5004269480705261, "learning_rate": 1.1986550602835595e-05, "loss": 0.2325, "step": 544 }, { "epoch": 0.36712697878073425, "grad_norm": 0.5056456327438354, "learning_rate": 1.1972394993500466e-05, "loss": 0.2691, "step": 545 }, { "epoch": 0.3678006062647356, "grad_norm": 0.5447576642036438, "learning_rate": 1.1958214616950794e-05, "loss": 0.272, "step": 546 }, { "epoch": 0.36847423374873695, "grad_norm": 0.5720804929733276, "learning_rate": 1.1944009551714623e-05, "loss": 0.2651, "step": 547 }, { "epoch": 0.3691478612327383, "grad_norm": 0.5342965722084045, "learning_rate": 1.1929779876456713e-05, "loss": 0.2681, "step": 548 }, { "epoch": 0.36982148871673964, "grad_norm": 0.5355931520462036, "learning_rate": 1.191552566997812e-05, "loss": 0.2504, "step": 549 }, { "epoch": 0.370495116200741, "grad_norm": 0.6217589378356934, "learning_rate": 1.1901247011215733e-05, "loss": 0.2704, "step": 550 }, { "epoch": 0.37116874368474234, "grad_norm": 0.6108464002609253, "learning_rate": 1.1886943979241874e-05, "loss": 0.2995, "step": 551 }, { "epoch": 0.3718423711687437, "grad_norm": 0.5349010229110718, "learning_rate": 1.187261665326382e-05, "loss": 0.2873, "step": 552 }, { "epoch": 0.37251599865274504, "grad_norm": 0.5306320786476135, "learning_rate": 1.1858265112623388e-05, "loss": 0.2546, "step": 553 }, { "epoch": 0.3731896261367464, "grad_norm": 0.5984854102134705, "learning_rate": 1.18438894367965e-05, "loss": 0.3019, "step": 554 }, { "epoch": 0.37386325362074774, "grad_norm": 0.5498750805854797, "learning_rate": 1.1829489705392727e-05, "loss": 0.2702, "step": 555 }, { "epoch": 0.3745368811047491, "grad_norm": 0.5973288416862488, "learning_rate": 1.1815065998154849e-05, "loss": 0.2947, "step": 556 }, { "epoch": 0.37521050858875044, "grad_norm": 0.5865532755851746, "learning_rate": 1.180061839495843e-05, "loss": 0.3207, "step": 557 }, { "epoch": 0.3758841360727518, "grad_norm": 0.5075846314430237, "learning_rate": 1.1786146975811359e-05, "loss": 0.2474, "step": 558 }, { "epoch": 0.37655776355675313, "grad_norm": 0.5501227378845215, "learning_rate": 1.1771651820853417e-05, "loss": 0.274, "step": 559 }, { "epoch": 0.3772313910407545, "grad_norm": 0.5292581915855408, "learning_rate": 1.1757133010355821e-05, "loss": 0.2546, "step": 560 }, { "epoch": 0.37790501852475583, "grad_norm": 0.5926501750946045, "learning_rate": 1.1742590624720796e-05, "loss": 0.2847, "step": 561 }, { "epoch": 0.3785786460087572, "grad_norm": 0.5264430046081543, "learning_rate": 1.1728024744481117e-05, "loss": 0.2634, "step": 562 }, { "epoch": 0.37925227349275853, "grad_norm": 0.5014563798904419, "learning_rate": 1.171343545029967e-05, "loss": 0.2301, "step": 563 }, { "epoch": 0.3799259009767599, "grad_norm": 0.48584073781967163, "learning_rate": 1.1698822822969001e-05, "loss": 0.2482, "step": 564 }, { "epoch": 0.38059952846076117, "grad_norm": 0.5884197354316711, "learning_rate": 1.1684186943410867e-05, "loss": 0.286, "step": 565 }, { "epoch": 0.3812731559447625, "grad_norm": 0.556430459022522, "learning_rate": 1.16695278926758e-05, "loss": 0.2496, "step": 566 }, { "epoch": 0.38194678342876387, "grad_norm": 0.5392268300056458, "learning_rate": 1.165484575194264e-05, "loss": 0.2786, "step": 567 }, { "epoch": 0.3826204109127652, "grad_norm": 0.5491148233413696, "learning_rate": 1.1640140602518102e-05, "loss": 0.2289, "step": 568 }, { "epoch": 0.38329403839676657, "grad_norm": 0.5565954446792603, "learning_rate": 1.162541252583631e-05, "loss": 0.2614, "step": 569 }, { "epoch": 0.3839676658807679, "grad_norm": 0.5307971239089966, "learning_rate": 1.1610661603458363e-05, "loss": 0.2577, "step": 570 }, { "epoch": 0.38464129336476927, "grad_norm": 0.5446802377700806, "learning_rate": 1.159588791707187e-05, "loss": 0.292, "step": 571 }, { "epoch": 0.3853149208487706, "grad_norm": 0.5837084054946899, "learning_rate": 1.1581091548490505e-05, "loss": 0.2771, "step": 572 }, { "epoch": 0.38598854833277196, "grad_norm": 0.5611515045166016, "learning_rate": 1.156627257965355e-05, "loss": 0.2602, "step": 573 }, { "epoch": 0.3866621758167733, "grad_norm": 0.5338358879089355, "learning_rate": 1.155143109262544e-05, "loss": 0.2573, "step": 574 }, { "epoch": 0.38733580330077466, "grad_norm": 0.4791894853115082, "learning_rate": 1.1536567169595316e-05, "loss": 0.2411, "step": 575 }, { "epoch": 0.388009430784776, "grad_norm": 0.5701311826705933, "learning_rate": 1.1521680892876563e-05, "loss": 0.2973, "step": 576 }, { "epoch": 0.38868305826877736, "grad_norm": 0.4976153075695038, "learning_rate": 1.1506772344906356e-05, "loss": 0.2716, "step": 577 }, { "epoch": 0.3893566857527787, "grad_norm": 0.5492983460426331, "learning_rate": 1.1491841608245204e-05, "loss": 0.2621, "step": 578 }, { "epoch": 0.39003031323678006, "grad_norm": 0.5490813255310059, "learning_rate": 1.1476888765576493e-05, "loss": 0.2687, "step": 579 }, { "epoch": 0.3907039407207814, "grad_norm": 0.5402075052261353, "learning_rate": 1.1461913899706025e-05, "loss": 0.3112, "step": 580 }, { "epoch": 0.39137756820478276, "grad_norm": 0.5017600059509277, "learning_rate": 1.1446917093561564e-05, "loss": 0.2242, "step": 581 }, { "epoch": 0.3920511956887841, "grad_norm": 0.5590758919715881, "learning_rate": 1.1431898430192375e-05, "loss": 0.2569, "step": 582 }, { "epoch": 0.39272482317278545, "grad_norm": 0.5497624278068542, "learning_rate": 1.1416857992768764e-05, "loss": 0.3114, "step": 583 }, { "epoch": 0.3933984506567868, "grad_norm": 0.5833696126937866, "learning_rate": 1.1401795864581616e-05, "loss": 0.2999, "step": 584 }, { "epoch": 0.39407207814078815, "grad_norm": 0.5114924907684326, "learning_rate": 1.1386712129041937e-05, "loss": 0.2428, "step": 585 }, { "epoch": 0.3947457056247895, "grad_norm": 0.5477609038352966, "learning_rate": 1.1371606869680388e-05, "loss": 0.2722, "step": 586 }, { "epoch": 0.39541933310879085, "grad_norm": 0.5121515393257141, "learning_rate": 1.1356480170146826e-05, "loss": 0.2376, "step": 587 }, { "epoch": 0.3960929605927922, "grad_norm": 0.502560019493103, "learning_rate": 1.1341332114209838e-05, "loss": 0.2737, "step": 588 }, { "epoch": 0.39676658807679355, "grad_norm": 0.5239719748497009, "learning_rate": 1.1326162785756281e-05, "loss": 0.2563, "step": 589 }, { "epoch": 0.3974402155607949, "grad_norm": 0.5645294189453125, "learning_rate": 1.131097226879081e-05, "loss": 0.308, "step": 590 }, { "epoch": 0.39811384304479625, "grad_norm": 0.5425258278846741, "learning_rate": 1.1295760647435424e-05, "loss": 0.2388, "step": 591 }, { "epoch": 0.3987874705287976, "grad_norm": 0.5374796390533447, "learning_rate": 1.1280528005928988e-05, "loss": 0.2774, "step": 592 }, { "epoch": 0.39946109801279894, "grad_norm": 0.5628758072853088, "learning_rate": 1.1265274428626775e-05, "loss": 0.2689, "step": 593 }, { "epoch": 0.4001347254968003, "grad_norm": 0.5226148366928101, "learning_rate": 1.125e-05, "loss": 0.2713, "step": 594 }, { "epoch": 0.40080835298080164, "grad_norm": 0.5630069971084595, "learning_rate": 1.1234704804635342e-05, "loss": 0.3279, "step": 595 }, { "epoch": 0.401481980464803, "grad_norm": 0.508704423904419, "learning_rate": 1.1219388927234482e-05, "loss": 0.2623, "step": 596 }, { "epoch": 0.40215560794880434, "grad_norm": 0.5345742702484131, "learning_rate": 1.1204052452613638e-05, "loss": 0.2865, "step": 597 }, { "epoch": 0.40282923543280563, "grad_norm": 0.5258358120918274, "learning_rate": 1.1188695465703092e-05, "loss": 0.2721, "step": 598 }, { "epoch": 0.403502862916807, "grad_norm": 0.5306556820869446, "learning_rate": 1.1173318051546713e-05, "loss": 0.2753, "step": 599 }, { "epoch": 0.40417649040080833, "grad_norm": 0.49859175086021423, "learning_rate": 1.1157920295301498e-05, "loss": 0.2594, "step": 600 }, { "epoch": 0.40417649040080833, "eval_loss": 0.2652011811733246, "eval_runtime": 105.8884, "eval_samples_per_second": 47.22, "eval_steps_per_second": 2.956, "step": 600 }, { "epoch": 0.4048501178848097, "grad_norm": 0.558407723903656, "learning_rate": 1.114250228223709e-05, "loss": 0.256, "step": 601 }, { "epoch": 0.40552374536881103, "grad_norm": 0.508040726184845, "learning_rate": 1.1127064097735315e-05, "loss": 0.2575, "step": 602 }, { "epoch": 0.4061973728528124, "grad_norm": 0.5474634766578674, "learning_rate": 1.1111605827289698e-05, "loss": 0.2805, "step": 603 }, { "epoch": 0.4068710003368137, "grad_norm": 0.519263505935669, "learning_rate": 1.1096127556505e-05, "loss": 0.2534, "step": 604 }, { "epoch": 0.4075446278208151, "grad_norm": 0.5802994966506958, "learning_rate": 1.1080629371096738e-05, "loss": 0.2756, "step": 605 }, { "epoch": 0.4082182553048164, "grad_norm": 0.5730322599411011, "learning_rate": 1.1065111356890712e-05, "loss": 0.2888, "step": 606 }, { "epoch": 0.4088918827888178, "grad_norm": 0.5447918176651001, "learning_rate": 1.1049573599822537e-05, "loss": 0.2848, "step": 607 }, { "epoch": 0.4095655102728191, "grad_norm": 0.5072281360626221, "learning_rate": 1.1034016185937149e-05, "loss": 0.2972, "step": 608 }, { "epoch": 0.41023913775682047, "grad_norm": 0.6098499298095703, "learning_rate": 1.1018439201388346e-05, "loss": 0.299, "step": 609 }, { "epoch": 0.4109127652408218, "grad_norm": 0.594445526599884, "learning_rate": 1.1002842732438301e-05, "loss": 0.2778, "step": 610 }, { "epoch": 0.41158639272482317, "grad_norm": 0.5406931638717651, "learning_rate": 1.0987226865457091e-05, "loss": 0.2948, "step": 611 }, { "epoch": 0.4122600202088245, "grad_norm": 0.5487210750579834, "learning_rate": 1.0971591686922211e-05, "loss": 0.256, "step": 612 }, { "epoch": 0.41293364769282587, "grad_norm": 0.5063245296478271, "learning_rate": 1.0955937283418104e-05, "loss": 0.2481, "step": 613 }, { "epoch": 0.4136072751768272, "grad_norm": 0.5232447981834412, "learning_rate": 1.0940263741635678e-05, "loss": 0.2436, "step": 614 }, { "epoch": 0.41428090266082856, "grad_norm": 0.5449836254119873, "learning_rate": 1.092457114837182e-05, "loss": 0.2621, "step": 615 }, { "epoch": 0.4149545301448299, "grad_norm": 0.5582854151725769, "learning_rate": 1.090885959052892e-05, "loss": 0.2885, "step": 616 }, { "epoch": 0.41562815762883126, "grad_norm": 0.5433541536331177, "learning_rate": 1.0893129155114396e-05, "loss": 0.2659, "step": 617 }, { "epoch": 0.4163017851128326, "grad_norm": 0.5937801599502563, "learning_rate": 1.0877379929240198e-05, "loss": 0.2968, "step": 618 }, { "epoch": 0.41697541259683396, "grad_norm": 0.4904331564903259, "learning_rate": 1.0861612000122341e-05, "loss": 0.2508, "step": 619 }, { "epoch": 0.4176490400808353, "grad_norm": 0.5370484590530396, "learning_rate": 1.0845825455080411e-05, "loss": 0.2564, "step": 620 }, { "epoch": 0.41832266756483666, "grad_norm": 0.535376250743866, "learning_rate": 1.0830020381537088e-05, "loss": 0.2796, "step": 621 }, { "epoch": 0.418996295048838, "grad_norm": 0.5508119463920593, "learning_rate": 1.0814196867017656e-05, "loss": 0.281, "step": 622 }, { "epoch": 0.41966992253283936, "grad_norm": 0.525283694267273, "learning_rate": 1.079835499914952e-05, "loss": 0.2306, "step": 623 }, { "epoch": 0.4203435500168407, "grad_norm": 0.5157189965248108, "learning_rate": 1.078249486566173e-05, "loss": 0.2679, "step": 624 }, { "epoch": 0.42101717750084205, "grad_norm": 0.6008614301681519, "learning_rate": 1.0766616554384477e-05, "loss": 0.2815, "step": 625 }, { "epoch": 0.4216908049848434, "grad_norm": 0.5147749185562134, "learning_rate": 1.0750720153248626e-05, "loss": 0.2587, "step": 626 }, { "epoch": 0.42236443246884475, "grad_norm": 0.5508129596710205, "learning_rate": 1.073480575028521e-05, "loss": 0.2788, "step": 627 }, { "epoch": 0.4230380599528461, "grad_norm": 0.5465036034584045, "learning_rate": 1.0718873433624966e-05, "loss": 0.2606, "step": 628 }, { "epoch": 0.42371168743684745, "grad_norm": 0.5761625170707703, "learning_rate": 1.070292329149782e-05, "loss": 0.3149, "step": 629 }, { "epoch": 0.42438531492084874, "grad_norm": 0.5194136500358582, "learning_rate": 1.0686955412232419e-05, "loss": 0.2305, "step": 630 }, { "epoch": 0.4250589424048501, "grad_norm": 0.5823161602020264, "learning_rate": 1.0670969884255636e-05, "loss": 0.2495, "step": 631 }, { "epoch": 0.42573256988885144, "grad_norm": 0.5550847053527832, "learning_rate": 1.0654966796092073e-05, "loss": 0.2539, "step": 632 }, { "epoch": 0.4264061973728528, "grad_norm": 0.5327949523925781, "learning_rate": 1.0638946236363578e-05, "loss": 0.2655, "step": 633 }, { "epoch": 0.42707982485685414, "grad_norm": 0.5146956443786621, "learning_rate": 1.0622908293788758e-05, "loss": 0.2599, "step": 634 }, { "epoch": 0.4277534523408555, "grad_norm": 0.5790160894393921, "learning_rate": 1.0606853057182481e-05, "loss": 0.298, "step": 635 }, { "epoch": 0.42842707982485684, "grad_norm": 0.5627730488777161, "learning_rate": 1.059078061545538e-05, "loss": 0.2622, "step": 636 }, { "epoch": 0.4291007073088582, "grad_norm": 0.619365394115448, "learning_rate": 1.0574691057613376e-05, "loss": 0.2905, "step": 637 }, { "epoch": 0.42977433479285954, "grad_norm": 0.5521032810211182, "learning_rate": 1.0558584472757167e-05, "loss": 0.2705, "step": 638 }, { "epoch": 0.4304479622768609, "grad_norm": 0.5045711398124695, "learning_rate": 1.0542460950081747e-05, "loss": 0.2289, "step": 639 }, { "epoch": 0.43112158976086223, "grad_norm": 0.5129411816596985, "learning_rate": 1.0526320578875909e-05, "loss": 0.2572, "step": 640 }, { "epoch": 0.4317952172448636, "grad_norm": 0.5294272899627686, "learning_rate": 1.0510163448521747e-05, "loss": 0.2702, "step": 641 }, { "epoch": 0.43246884472886493, "grad_norm": 0.5448393225669861, "learning_rate": 1.0493989648494165e-05, "loss": 0.2808, "step": 642 }, { "epoch": 0.4331424722128663, "grad_norm": 0.5107436776161194, "learning_rate": 1.0477799268360384e-05, "loss": 0.248, "step": 643 }, { "epoch": 0.43381609969686763, "grad_norm": 0.5598347187042236, "learning_rate": 1.0461592397779435e-05, "loss": 0.2342, "step": 644 }, { "epoch": 0.434489727180869, "grad_norm": 0.5707139372825623, "learning_rate": 1.0445369126501676e-05, "loss": 0.2764, "step": 645 }, { "epoch": 0.4351633546648703, "grad_norm": 0.48345211148262024, "learning_rate": 1.0429129544368283e-05, "loss": 0.2215, "step": 646 }, { "epoch": 0.4358369821488717, "grad_norm": 0.5131022930145264, "learning_rate": 1.0412873741310763e-05, "loss": 0.2423, "step": 647 }, { "epoch": 0.436510609632873, "grad_norm": 0.5428949594497681, "learning_rate": 1.0396601807350452e-05, "loss": 0.2331, "step": 648 }, { "epoch": 0.4371842371168744, "grad_norm": 0.47753867506980896, "learning_rate": 1.038031383259801e-05, "loss": 0.2552, "step": 649 }, { "epoch": 0.4378578646008757, "grad_norm": 0.48779332637786865, "learning_rate": 1.0364009907252937e-05, "loss": 0.2499, "step": 650 }, { "epoch": 0.4385314920848771, "grad_norm": 0.4910006523132324, "learning_rate": 1.0347690121603047e-05, "loss": 0.2498, "step": 651 }, { "epoch": 0.4392051195688784, "grad_norm": 0.5575456023216248, "learning_rate": 1.0331354566024005e-05, "loss": 0.2503, "step": 652 }, { "epoch": 0.43987874705287977, "grad_norm": 0.5806515216827393, "learning_rate": 1.0315003330978799e-05, "loss": 0.254, "step": 653 }, { "epoch": 0.4405523745368811, "grad_norm": 0.5564923882484436, "learning_rate": 1.0298636507017241e-05, "loss": 0.2804, "step": 654 }, { "epoch": 0.44122600202088247, "grad_norm": 0.5716164708137512, "learning_rate": 1.0282254184775473e-05, "loss": 0.2844, "step": 655 }, { "epoch": 0.4418996295048838, "grad_norm": 0.5606719255447388, "learning_rate": 1.0265856454975473e-05, "loss": 0.2576, "step": 656 }, { "epoch": 0.44257325698888517, "grad_norm": 0.5467285513877869, "learning_rate": 1.0249443408424535e-05, "loss": 0.2782, "step": 657 }, { "epoch": 0.4432468844728865, "grad_norm": 0.569665253162384, "learning_rate": 1.0233015136014773e-05, "loss": 0.272, "step": 658 }, { "epoch": 0.44392051195688786, "grad_norm": 0.5965842604637146, "learning_rate": 1.021657172872262e-05, "loss": 0.3023, "step": 659 }, { "epoch": 0.4445941394408892, "grad_norm": 0.5759636163711548, "learning_rate": 1.0200113277608326e-05, "loss": 0.2621, "step": 660 }, { "epoch": 0.44526776692489056, "grad_norm": 0.5999960899353027, "learning_rate": 1.0183639873815448e-05, "loss": 0.2976, "step": 661 }, { "epoch": 0.44594139440889186, "grad_norm": 0.5440315008163452, "learning_rate": 1.0167151608570346e-05, "loss": 0.2889, "step": 662 }, { "epoch": 0.4466150218928932, "grad_norm": 0.4932374358177185, "learning_rate": 1.0150648573181685e-05, "loss": 0.2271, "step": 663 }, { "epoch": 0.44728864937689455, "grad_norm": 0.5871284604072571, "learning_rate": 1.0134130859039921e-05, "loss": 0.3202, "step": 664 }, { "epoch": 0.4479622768608959, "grad_norm": 0.5287674069404602, "learning_rate": 1.0117598557616796e-05, "loss": 0.2486, "step": 665 }, { "epoch": 0.44863590434489725, "grad_norm": 0.588444709777832, "learning_rate": 1.0101051760464837e-05, "loss": 0.2555, "step": 666 }, { "epoch": 0.4493095318288986, "grad_norm": 0.5376453399658203, "learning_rate": 1.0084490559216843e-05, "loss": 0.2506, "step": 667 }, { "epoch": 0.44998315931289995, "grad_norm": 0.5496957898139954, "learning_rate": 1.006791504558538e-05, "loss": 0.2616, "step": 668 }, { "epoch": 0.4506567867969013, "grad_norm": 0.523008406162262, "learning_rate": 1.0051325311362278e-05, "loss": 0.2597, "step": 669 }, { "epoch": 0.45133041428090265, "grad_norm": 0.5686816573143005, "learning_rate": 1.0034721448418105e-05, "loss": 0.2665, "step": 670 }, { "epoch": 0.452004041764904, "grad_norm": 0.5065593719482422, "learning_rate": 1.0018103548701688e-05, "loss": 0.2566, "step": 671 }, { "epoch": 0.45267766924890535, "grad_norm": 0.5687103867530823, "learning_rate": 1.0001471704239577e-05, "loss": 0.2628, "step": 672 }, { "epoch": 0.4533512967329067, "grad_norm": 0.5782075524330139, "learning_rate": 9.984826007135544e-06, "loss": 0.2732, "step": 673 }, { "epoch": 0.45402492421690804, "grad_norm": 0.5679803490638733, "learning_rate": 9.968166549570075e-06, "loss": 0.2664, "step": 674 }, { "epoch": 0.4546985517009094, "grad_norm": 0.5293748378753662, "learning_rate": 9.951493423799866e-06, "loss": 0.2498, "step": 675 }, { "epoch": 0.45537217918491074, "grad_norm": 0.5444015264511108, "learning_rate": 9.934806722157294e-06, "loss": 0.2549, "step": 676 }, { "epoch": 0.4560458066689121, "grad_norm": 0.5367648601531982, "learning_rate": 9.918106537049921e-06, "loss": 0.2623, "step": 677 }, { "epoch": 0.45671943415291344, "grad_norm": 0.5820662975311279, "learning_rate": 9.901392960959983e-06, "loss": 0.2771, "step": 678 }, { "epoch": 0.4573930616369148, "grad_norm": 0.5573861598968506, "learning_rate": 9.884666086443862e-06, "loss": 0.2614, "step": 679 }, { "epoch": 0.45806668912091614, "grad_norm": 0.6296043992042542, "learning_rate": 9.867926006131597e-06, "loss": 0.3102, "step": 680 }, { "epoch": 0.4587403166049175, "grad_norm": 0.5795363187789917, "learning_rate": 9.851172812726344e-06, "loss": 0.3059, "step": 681 }, { "epoch": 0.45941394408891884, "grad_norm": 0.48046785593032837, "learning_rate": 9.834406599003885e-06, "loss": 0.2323, "step": 682 }, { "epoch": 0.4600875715729202, "grad_norm": 0.4878872036933899, "learning_rate": 9.817627457812105e-06, "loss": 0.2467, "step": 683 }, { "epoch": 0.46076119905692153, "grad_norm": 0.5333375334739685, "learning_rate": 9.800835482070479e-06, "loss": 0.2282, "step": 684 }, { "epoch": 0.4614348265409229, "grad_norm": 0.543725848197937, "learning_rate": 9.784030764769553e-06, "loss": 0.2427, "step": 685 }, { "epoch": 0.46210845402492423, "grad_norm": 0.5145445466041565, "learning_rate": 9.76721339897044e-06, "loss": 0.2291, "step": 686 }, { "epoch": 0.4627820815089256, "grad_norm": 0.5099066495895386, "learning_rate": 9.75038347780429e-06, "loss": 0.245, "step": 687 }, { "epoch": 0.46345570899292693, "grad_norm": 0.5599386096000671, "learning_rate": 9.73354109447179e-06, "loss": 0.2994, "step": 688 }, { "epoch": 0.4641293364769283, "grad_norm": 0.5298258662223816, "learning_rate": 9.716686342242632e-06, "loss": 0.231, "step": 689 }, { "epoch": 0.4648029639609296, "grad_norm": 0.5349884033203125, "learning_rate": 9.69981931445501e-06, "loss": 0.2436, "step": 690 }, { "epoch": 0.465476591444931, "grad_norm": 0.5078858137130737, "learning_rate": 9.682940104515097e-06, "loss": 0.2735, "step": 691 }, { "epoch": 0.4661502189289323, "grad_norm": 0.5433405637741089, "learning_rate": 9.666048805896524e-06, "loss": 0.2472, "step": 692 }, { "epoch": 0.4668238464129337, "grad_norm": 0.5337989926338196, "learning_rate": 9.649145512139876e-06, "loss": 0.2815, "step": 693 }, { "epoch": 0.46749747389693497, "grad_norm": 0.491817831993103, "learning_rate": 9.632230316852153e-06, "loss": 0.2712, "step": 694 }, { "epoch": 0.4681711013809363, "grad_norm": 0.5814330577850342, "learning_rate": 9.615303313706271e-06, "loss": 0.2931, "step": 695 }, { "epoch": 0.46884472886493767, "grad_norm": 0.5358330607414246, "learning_rate": 9.598364596440534e-06, "loss": 0.2546, "step": 696 }, { "epoch": 0.469518356348939, "grad_norm": 0.5111145377159119, "learning_rate": 9.581414258858116e-06, "loss": 0.2607, "step": 697 }, { "epoch": 0.47019198383294036, "grad_norm": 0.5266521573066711, "learning_rate": 9.564452394826538e-06, "loss": 0.2554, "step": 698 }, { "epoch": 0.4708656113169417, "grad_norm": 0.5091780424118042, "learning_rate": 9.54747909827716e-06, "loss": 0.2723, "step": 699 }, { "epoch": 0.47153923880094306, "grad_norm": 0.5414915680885315, "learning_rate": 9.530494463204646e-06, "loss": 0.2577, "step": 700 }, { "epoch": 0.47153923880094306, "eval_loss": 0.26179420948028564, "eval_runtime": 105.0708, "eval_samples_per_second": 47.587, "eval_steps_per_second": 2.979, "step": 700 }, { "epoch": 0.4722128662849444, "grad_norm": 0.505789577960968, "learning_rate": 9.513498583666456e-06, "loss": 0.2448, "step": 701 }, { "epoch": 0.47288649376894576, "grad_norm": 0.46454617381095886, "learning_rate": 9.496491553782314e-06, "loss": 0.221, "step": 702 }, { "epoch": 0.4735601212529471, "grad_norm": 0.5358849763870239, "learning_rate": 9.479473467733697e-06, "loss": 0.2872, "step": 703 }, { "epoch": 0.47423374873694846, "grad_norm": 0.5496987700462341, "learning_rate": 9.462444419763306e-06, "loss": 0.2464, "step": 704 }, { "epoch": 0.4749073762209498, "grad_norm": 0.5485591292381287, "learning_rate": 9.445404504174546e-06, "loss": 0.2695, "step": 705 }, { "epoch": 0.47558100370495116, "grad_norm": 0.5437228679656982, "learning_rate": 9.42835381533101e-06, "loss": 0.2823, "step": 706 }, { "epoch": 0.4762546311889525, "grad_norm": 0.5094515085220337, "learning_rate": 9.411292447655948e-06, "loss": 0.2401, "step": 707 }, { "epoch": 0.47692825867295385, "grad_norm": 0.5395442843437195, "learning_rate": 9.394220495631744e-06, "loss": 0.2659, "step": 708 }, { "epoch": 0.4776018861569552, "grad_norm": 0.4930800795555115, "learning_rate": 9.377138053799399e-06, "loss": 0.2383, "step": 709 }, { "epoch": 0.47827551364095655, "grad_norm": 0.5237337350845337, "learning_rate": 9.360045216758008e-06, "loss": 0.2527, "step": 710 }, { "epoch": 0.4789491411249579, "grad_norm": 0.5243161916732788, "learning_rate": 9.342942079164223e-06, "loss": 0.2515, "step": 711 }, { "epoch": 0.47962276860895925, "grad_norm": 0.5414012670516968, "learning_rate": 9.325828735731747e-06, "loss": 0.275, "step": 712 }, { "epoch": 0.4802963960929606, "grad_norm": 0.547073245048523, "learning_rate": 9.308705281230796e-06, "loss": 0.276, "step": 713 }, { "epoch": 0.48097002357696195, "grad_norm": 0.49008458852767944, "learning_rate": 9.291571810487584e-06, "loss": 0.246, "step": 714 }, { "epoch": 0.4816436510609633, "grad_norm": 0.5415433645248413, "learning_rate": 9.27442841838379e-06, "loss": 0.2658, "step": 715 }, { "epoch": 0.48231727854496464, "grad_norm": 0.5856931209564209, "learning_rate": 9.257275199856032e-06, "loss": 0.2675, "step": 716 }, { "epoch": 0.482990906028966, "grad_norm": 0.5154370665550232, "learning_rate": 9.24011224989535e-06, "loss": 0.2422, "step": 717 }, { "epoch": 0.48366453351296734, "grad_norm": 0.5306107401847839, "learning_rate": 9.222939663546677e-06, "loss": 0.2687, "step": 718 }, { "epoch": 0.4843381609969687, "grad_norm": 0.4880635142326355, "learning_rate": 9.2057575359083e-06, "loss": 0.2276, "step": 719 }, { "epoch": 0.48501178848097004, "grad_norm": 0.6055603623390198, "learning_rate": 9.18856596213135e-06, "loss": 0.2907, "step": 720 }, { "epoch": 0.4856854159649714, "grad_norm": 0.5602757930755615, "learning_rate": 9.171365037419272e-06, "loss": 0.2511, "step": 721 }, { "epoch": 0.48635904344897274, "grad_norm": 0.5492405295372009, "learning_rate": 9.15415485702729e-06, "loss": 0.246, "step": 722 }, { "epoch": 0.4870326709329741, "grad_norm": 0.6091371178627014, "learning_rate": 9.136935516261887e-06, "loss": 0.3003, "step": 723 }, { "epoch": 0.48770629841697544, "grad_norm": 0.5400590300559998, "learning_rate": 9.119707110480272e-06, "loss": 0.2576, "step": 724 }, { "epoch": 0.4883799259009768, "grad_norm": 0.5183984041213989, "learning_rate": 9.10246973508985e-06, "loss": 0.2519, "step": 725 }, { "epoch": 0.48905355338497813, "grad_norm": 0.5791885256767273, "learning_rate": 9.08522348554771e-06, "loss": 0.269, "step": 726 }, { "epoch": 0.48972718086897943, "grad_norm": 0.5196906328201294, "learning_rate": 9.067968457360073e-06, "loss": 0.2681, "step": 727 }, { "epoch": 0.4904008083529808, "grad_norm": 0.5393977165222168, "learning_rate": 9.050704746081779e-06, "loss": 0.2487, "step": 728 }, { "epoch": 0.4910744358369821, "grad_norm": 0.5441868305206299, "learning_rate": 9.033432447315751e-06, "loss": 0.2603, "step": 729 }, { "epoch": 0.4917480633209835, "grad_norm": 0.4999203383922577, "learning_rate": 9.016151656712473e-06, "loss": 0.2569, "step": 730 }, { "epoch": 0.4924216908049848, "grad_norm": 0.5059922933578491, "learning_rate": 8.998862469969452e-06, "loss": 0.2428, "step": 731 }, { "epoch": 0.4930953182889862, "grad_norm": 0.5794141292572021, "learning_rate": 8.981564982830683e-06, "loss": 0.2901, "step": 732 }, { "epoch": 0.4937689457729875, "grad_norm": 0.5344904065132141, "learning_rate": 8.964259291086141e-06, "loss": 0.278, "step": 733 }, { "epoch": 0.49444257325698887, "grad_norm": 0.5577378273010254, "learning_rate": 8.946945490571227e-06, "loss": 0.2753, "step": 734 }, { "epoch": 0.4951162007409902, "grad_norm": 0.48888590931892395, "learning_rate": 8.92962367716625e-06, "loss": 0.2565, "step": 735 }, { "epoch": 0.49578982822499157, "grad_norm": 0.5605798363685608, "learning_rate": 8.912293946795895e-06, "loss": 0.274, "step": 736 }, { "epoch": 0.4964634557089929, "grad_norm": 0.5351974964141846, "learning_rate": 8.894956395428685e-06, "loss": 0.259, "step": 737 }, { "epoch": 0.49713708319299427, "grad_norm": 0.530037522315979, "learning_rate": 8.877611119076454e-06, "loss": 0.2468, "step": 738 }, { "epoch": 0.4978107106769956, "grad_norm": 0.5955355763435364, "learning_rate": 8.860258213793819e-06, "loss": 0.2702, "step": 739 }, { "epoch": 0.49848433816099696, "grad_norm": 0.5594556927680969, "learning_rate": 8.842897775677645e-06, "loss": 0.2796, "step": 740 }, { "epoch": 0.4991579656449983, "grad_norm": 0.5318235158920288, "learning_rate": 8.825529900866507e-06, "loss": 0.2721, "step": 741 }, { "epoch": 0.49983159312899966, "grad_norm": 0.6066297888755798, "learning_rate": 8.808154685540164e-06, "loss": 0.2814, "step": 742 }, { "epoch": 0.500505220613001, "grad_norm": 0.520949125289917, "learning_rate": 8.790772225919031e-06, "loss": 0.2479, "step": 743 }, { "epoch": 0.5011788480970023, "grad_norm": 0.532832682132721, "learning_rate": 8.77338261826364e-06, "loss": 0.2717, "step": 744 }, { "epoch": 0.5018524755810037, "grad_norm": 0.4917290210723877, "learning_rate": 8.755985958874096e-06, "loss": 0.2331, "step": 745 }, { "epoch": 0.502526103065005, "grad_norm": 0.6336959004402161, "learning_rate": 8.73858234408957e-06, "loss": 0.3059, "step": 746 }, { "epoch": 0.5031997305490064, "grad_norm": 0.5722649693489075, "learning_rate": 8.72117187028774e-06, "loss": 0.2682, "step": 747 }, { "epoch": 0.5038733580330077, "grad_norm": 0.47712576389312744, "learning_rate": 8.70375463388427e-06, "loss": 0.2468, "step": 748 }, { "epoch": 0.504546985517009, "grad_norm": 0.49866771697998047, "learning_rate": 8.68633073133228e-06, "loss": 0.2609, "step": 749 }, { "epoch": 0.5052206130010104, "grad_norm": 0.5410306453704834, "learning_rate": 8.6689002591218e-06, "loss": 0.2733, "step": 750 }, { "epoch": 0.5058942404850117, "grad_norm": 0.5518447160720825, "learning_rate": 8.651463313779241e-06, "loss": 0.2525, "step": 751 }, { "epoch": 0.5065678679690131, "grad_norm": 0.5311466455459595, "learning_rate": 8.634019991866863e-06, "loss": 0.275, "step": 752 }, { "epoch": 0.5072414954530144, "grad_norm": 0.5381631255149841, "learning_rate": 8.61657038998224e-06, "loss": 0.275, "step": 753 }, { "epoch": 0.5079151229370158, "grad_norm": 0.48526835441589355, "learning_rate": 8.599114604757716e-06, "loss": 0.2431, "step": 754 }, { "epoch": 0.5085887504210171, "grad_norm": 0.5347431302070618, "learning_rate": 8.581652732859887e-06, "loss": 0.2731, "step": 755 }, { "epoch": 0.5092623779050185, "grad_norm": 0.5098583102226257, "learning_rate": 8.56418487098905e-06, "loss": 0.294, "step": 756 }, { "epoch": 0.5099360053890198, "grad_norm": 0.499496191740036, "learning_rate": 8.54671111587867e-06, "loss": 0.2294, "step": 757 }, { "epoch": 0.5106096328730212, "grad_norm": 0.5586072206497192, "learning_rate": 8.529231564294858e-06, "loss": 0.2506, "step": 758 }, { "epoch": 0.5112832603570225, "grad_norm": 0.5203363299369812, "learning_rate": 8.51174631303581e-06, "loss": 0.2505, "step": 759 }, { "epoch": 0.5119568878410239, "grad_norm": 0.5142697095870972, "learning_rate": 8.494255458931304e-06, "loss": 0.2456, "step": 760 }, { "epoch": 0.5126305153250252, "grad_norm": 0.4652908444404602, "learning_rate": 8.476759098842129e-06, "loss": 0.2085, "step": 761 }, { "epoch": 0.5133041428090266, "grad_norm": 0.5014703273773193, "learning_rate": 8.459257329659571e-06, "loss": 0.239, "step": 762 }, { "epoch": 0.5139777702930279, "grad_norm": 0.5147262215614319, "learning_rate": 8.441750248304872e-06, "loss": 0.2727, "step": 763 }, { "epoch": 0.5146513977770293, "grad_norm": 0.564335823059082, "learning_rate": 8.424237951728689e-06, "loss": 0.2983, "step": 764 }, { "epoch": 0.5153250252610306, "grad_norm": 0.5217107534408569, "learning_rate": 8.406720536910568e-06, "loss": 0.238, "step": 765 }, { "epoch": 0.515998652745032, "grad_norm": 0.529780924320221, "learning_rate": 8.389198100858385e-06, "loss": 0.271, "step": 766 }, { "epoch": 0.5166722802290333, "grad_norm": 0.5005664229393005, "learning_rate": 8.371670740607833e-06, "loss": 0.265, "step": 767 }, { "epoch": 0.5173459077130347, "grad_norm": 0.4695169925689697, "learning_rate": 8.354138553221869e-06, "loss": 0.225, "step": 768 }, { "epoch": 0.518019535197036, "grad_norm": 0.6260945200920105, "learning_rate": 8.336601635790184e-06, "loss": 0.2725, "step": 769 }, { "epoch": 0.5186931626810374, "grad_norm": 0.5363501310348511, "learning_rate": 8.319060085428664e-06, "loss": 0.2631, "step": 770 }, { "epoch": 0.5193667901650387, "grad_norm": 0.5340143442153931, "learning_rate": 8.301513999278851e-06, "loss": 0.2829, "step": 771 }, { "epoch": 0.5200404176490401, "grad_norm": 0.5355620384216309, "learning_rate": 8.283963474507402e-06, "loss": 0.2675, "step": 772 }, { "epoch": 0.5207140451330414, "grad_norm": 0.5030906796455383, "learning_rate": 8.266408608305555e-06, "loss": 0.2243, "step": 773 }, { "epoch": 0.5213876726170428, "grad_norm": 0.5517938137054443, "learning_rate": 8.248849497888598e-06, "loss": 0.2554, "step": 774 }, { "epoch": 0.5220613001010441, "grad_norm": 0.47788354754447937, "learning_rate": 8.231286240495305e-06, "loss": 0.2258, "step": 775 }, { "epoch": 0.5227349275850455, "grad_norm": 0.550268828868866, "learning_rate": 8.213718933387438e-06, "loss": 0.2586, "step": 776 }, { "epoch": 0.5234085550690468, "grad_norm": 0.5247451066970825, "learning_rate": 8.196147673849165e-06, "loss": 0.2491, "step": 777 }, { "epoch": 0.5240821825530482, "grad_norm": 0.49666067957878113, "learning_rate": 8.17857255918655e-06, "loss": 0.2501, "step": 778 }, { "epoch": 0.5247558100370495, "grad_norm": 0.5575336217880249, "learning_rate": 8.160993686727015e-06, "loss": 0.3047, "step": 779 }, { "epoch": 0.5254294375210509, "grad_norm": 0.5327598452568054, "learning_rate": 8.143411153818773e-06, "loss": 0.289, "step": 780 }, { "epoch": 0.5261030650050522, "grad_norm": 0.4978947043418884, "learning_rate": 8.125825057830323e-06, "loss": 0.2817, "step": 781 }, { "epoch": 0.5267766924890536, "grad_norm": 0.5068449378013611, "learning_rate": 8.108235496149892e-06, "loss": 0.2549, "step": 782 }, { "epoch": 0.5274503199730549, "grad_norm": 0.5815426111221313, "learning_rate": 8.090642566184896e-06, "loss": 0.3215, "step": 783 }, { "epoch": 0.5281239474570563, "grad_norm": 0.528716504573822, "learning_rate": 8.073046365361404e-06, "loss": 0.2405, "step": 784 }, { "epoch": 0.5287975749410576, "grad_norm": 0.5129048824310303, "learning_rate": 8.0554469911236e-06, "loss": 0.2696, "step": 785 }, { "epoch": 0.529471202425059, "grad_norm": 0.5234351754188538, "learning_rate": 8.037844540933245e-06, "loss": 0.2608, "step": 786 }, { "epoch": 0.5301448299090603, "grad_norm": 0.531194269657135, "learning_rate": 8.020239112269131e-06, "loss": 0.2826, "step": 787 }, { "epoch": 0.5308184573930617, "grad_norm": 0.5546161532402039, "learning_rate": 8.002630802626538e-06, "loss": 0.2635, "step": 788 }, { "epoch": 0.531492084877063, "grad_norm": 0.5576707124710083, "learning_rate": 7.985019709516714e-06, "loss": 0.2591, "step": 789 }, { "epoch": 0.5321657123610644, "grad_norm": 0.5075989961624146, "learning_rate": 7.967405930466305e-06, "loss": 0.2751, "step": 790 }, { "epoch": 0.5328393398450657, "grad_norm": 0.547538161277771, "learning_rate": 7.94978956301685e-06, "loss": 0.2767, "step": 791 }, { "epoch": 0.5335129673290671, "grad_norm": 0.6105408072471619, "learning_rate": 7.932170704724202e-06, "loss": 0.3202, "step": 792 }, { "epoch": 0.5341865948130684, "grad_norm": 0.517285943031311, "learning_rate": 7.914549453158025e-06, "loss": 0.2497, "step": 793 }, { "epoch": 0.5348602222970698, "grad_norm": 0.5324558615684509, "learning_rate": 7.896925905901223e-06, "loss": 0.2804, "step": 794 }, { "epoch": 0.5355338497810711, "grad_norm": 0.5467241406440735, "learning_rate": 7.879300160549423e-06, "loss": 0.274, "step": 795 }, { "epoch": 0.5362074772650725, "grad_norm": 0.5673408508300781, "learning_rate": 7.86167231471042e-06, "loss": 0.2681, "step": 796 }, { "epoch": 0.5368811047490738, "grad_norm": 0.5435929298400879, "learning_rate": 7.844042466003643e-06, "loss": 0.2456, "step": 797 }, { "epoch": 0.5375547322330751, "grad_norm": 0.5365129113197327, "learning_rate": 7.826410712059607e-06, "loss": 0.2433, "step": 798 }, { "epoch": 0.5382283597170765, "grad_norm": 0.556115984916687, "learning_rate": 7.808777150519384e-06, "loss": 0.2723, "step": 799 }, { "epoch": 0.5389019872010778, "grad_norm": 0.6075104475021362, "learning_rate": 7.791141879034055e-06, "loss": 0.3197, "step": 800 }, { "epoch": 0.5389019872010778, "eval_loss": 0.25853946805000305, "eval_runtime": 105.3349, "eval_samples_per_second": 47.468, "eval_steps_per_second": 2.971, "step": 800 }, { "epoch": 0.5395756146850792, "grad_norm": 0.5173077583312988, "learning_rate": 7.773504995264167e-06, "loss": 0.2458, "step": 801 }, { "epoch": 0.5402492421690805, "grad_norm": 0.5317369699478149, "learning_rate": 7.755866596879203e-06, "loss": 0.2535, "step": 802 }, { "epoch": 0.5409228696530819, "grad_norm": 0.5028438568115234, "learning_rate": 7.738226781557024e-06, "loss": 0.2558, "step": 803 }, { "epoch": 0.5415964971370832, "grad_norm": 0.4917846918106079, "learning_rate": 7.720585646983346e-06, "loss": 0.2567, "step": 804 }, { "epoch": 0.5422701246210846, "grad_norm": 0.5413616299629211, "learning_rate": 7.702943290851183e-06, "loss": 0.3068, "step": 805 }, { "epoch": 0.5429437521050859, "grad_norm": 0.5557405352592468, "learning_rate": 7.685299810860319e-06, "loss": 0.2807, "step": 806 }, { "epoch": 0.5436173795890872, "grad_norm": 0.5536317229270935, "learning_rate": 7.667655304716762e-06, "loss": 0.2535, "step": 807 }, { "epoch": 0.5442910070730885, "grad_norm": 0.6285427808761597, "learning_rate": 7.650009870132202e-06, "loss": 0.2687, "step": 808 }, { "epoch": 0.5449646345570899, "grad_norm": 0.5142940282821655, "learning_rate": 7.632363604823466e-06, "loss": 0.2328, "step": 809 }, { "epoch": 0.5456382620410912, "grad_norm": 0.5419033765792847, "learning_rate": 7.614716606511986e-06, "loss": 0.2687, "step": 810 }, { "epoch": 0.5463118895250926, "grad_norm": 0.5078312158584595, "learning_rate": 7.597068972923254e-06, "loss": 0.2429, "step": 811 }, { "epoch": 0.5469855170090939, "grad_norm": 0.5140127539634705, "learning_rate": 7.579420801786278e-06, "loss": 0.2358, "step": 812 }, { "epoch": 0.5476591444930953, "grad_norm": 0.5336434841156006, "learning_rate": 7.561772190833041e-06, "loss": 0.2561, "step": 813 }, { "epoch": 0.5483327719770966, "grad_norm": 0.4892539978027344, "learning_rate": 7.544123237797967e-06, "loss": 0.2447, "step": 814 }, { "epoch": 0.549006399461098, "grad_norm": 0.5128865838050842, "learning_rate": 7.526474040417368e-06, "loss": 0.2305, "step": 815 }, { "epoch": 0.5496800269450993, "grad_norm": 0.5284186601638794, "learning_rate": 7.508824696428914e-06, "loss": 0.2665, "step": 816 }, { "epoch": 0.5503536544291007, "grad_norm": 0.49982714653015137, "learning_rate": 7.491175303571087e-06, "loss": 0.2361, "step": 817 }, { "epoch": 0.551027281913102, "grad_norm": 0.5274138450622559, "learning_rate": 7.473525959582631e-06, "loss": 0.2542, "step": 818 }, { "epoch": 0.5517009093971034, "grad_norm": 0.5714825987815857, "learning_rate": 7.4558767622020345e-06, "loss": 0.287, "step": 819 }, { "epoch": 0.5523745368811047, "grad_norm": 0.5137256979942322, "learning_rate": 7.438227809166959e-06, "loss": 0.2416, "step": 820 }, { "epoch": 0.5530481643651061, "grad_norm": 0.5832123756408691, "learning_rate": 7.4205791982137215e-06, "loss": 0.2589, "step": 821 }, { "epoch": 0.5537217918491074, "grad_norm": 0.6384348273277283, "learning_rate": 7.402931027076746e-06, "loss": 0.3011, "step": 822 }, { "epoch": 0.5543954193331088, "grad_norm": 0.5485447645187378, "learning_rate": 7.385283393488017e-06, "loss": 0.2596, "step": 823 }, { "epoch": 0.5550690468171101, "grad_norm": 0.5725424885749817, "learning_rate": 7.367636395176536e-06, "loss": 0.278, "step": 824 }, { "epoch": 0.5557426743011115, "grad_norm": 0.49892446398735046, "learning_rate": 7.349990129867802e-06, "loss": 0.2308, "step": 825 }, { "epoch": 0.5564163017851128, "grad_norm": 0.5304402709007263, "learning_rate": 7.332344695283239e-06, "loss": 0.2661, "step": 826 }, { "epoch": 0.5570899292691142, "grad_norm": 0.5314590334892273, "learning_rate": 7.314700189139683e-06, "loss": 0.2545, "step": 827 }, { "epoch": 0.5577635567531155, "grad_norm": 0.5156052112579346, "learning_rate": 7.297056709148819e-06, "loss": 0.2513, "step": 828 }, { "epoch": 0.5584371842371169, "grad_norm": 0.5569677352905273, "learning_rate": 7.279414353016655e-06, "loss": 0.2701, "step": 829 }, { "epoch": 0.5591108117211182, "grad_norm": 0.5068705081939697, "learning_rate": 7.261773218442978e-06, "loss": 0.2578, "step": 830 }, { "epoch": 0.5597844392051196, "grad_norm": 0.5413905382156372, "learning_rate": 7.244133403120797e-06, "loss": 0.2657, "step": 831 }, { "epoch": 0.5604580666891209, "grad_norm": 0.5509982109069824, "learning_rate": 7.226495004735833e-06, "loss": 0.2421, "step": 832 }, { "epoch": 0.5611316941731223, "grad_norm": 0.5037456750869751, "learning_rate": 7.208858120965949e-06, "loss": 0.2366, "step": 833 }, { "epoch": 0.5618053216571236, "grad_norm": 0.45753926038742065, "learning_rate": 7.191222849480618e-06, "loss": 0.2295, "step": 834 }, { "epoch": 0.562478949141125, "grad_norm": 0.5005747079849243, "learning_rate": 7.1735892879403955e-06, "loss": 0.2431, "step": 835 }, { "epoch": 0.5631525766251263, "grad_norm": 0.6139580607414246, "learning_rate": 7.155957533996361e-06, "loss": 0.2954, "step": 836 }, { "epoch": 0.5638262041091276, "grad_norm": 0.4900098443031311, "learning_rate": 7.1383276852895805e-06, "loss": 0.2472, "step": 837 }, { "epoch": 0.564499831593129, "grad_norm": 0.5588510632514954, "learning_rate": 7.120699839450578e-06, "loss": 0.2963, "step": 838 }, { "epoch": 0.5651734590771303, "grad_norm": 0.45477819442749023, "learning_rate": 7.103074094098776e-06, "loss": 0.2459, "step": 839 }, { "epoch": 0.5658470865611317, "grad_norm": 0.5369901061058044, "learning_rate": 7.085450546841977e-06, "loss": 0.2378, "step": 840 }, { "epoch": 0.566520714045133, "grad_norm": 0.5580633878707886, "learning_rate": 7.0678292952757986e-06, "loss": 0.2466, "step": 841 }, { "epoch": 0.5671943415291344, "grad_norm": 0.5392370223999023, "learning_rate": 7.050210436983152e-06, "loss": 0.2847, "step": 842 }, { "epoch": 0.5678679690131357, "grad_norm": 0.5429926514625549, "learning_rate": 7.032594069533694e-06, "loss": 0.2589, "step": 843 }, { "epoch": 0.5685415964971371, "grad_norm": 0.529365062713623, "learning_rate": 7.0149802904832865e-06, "loss": 0.2692, "step": 844 }, { "epoch": 0.5692152239811384, "grad_norm": 0.5019341707229614, "learning_rate": 6.997369197373462e-06, "loss": 0.2501, "step": 845 }, { "epoch": 0.5698888514651398, "grad_norm": 0.5088992714881897, "learning_rate": 6.979760887730873e-06, "loss": 0.2741, "step": 846 }, { "epoch": 0.5705624789491411, "grad_norm": 0.5390922427177429, "learning_rate": 6.962155459066755e-06, "loss": 0.2653, "step": 847 }, { "epoch": 0.5712361064331425, "grad_norm": 0.5300227403640747, "learning_rate": 6.9445530088764015e-06, "loss": 0.2356, "step": 848 }, { "epoch": 0.5719097339171438, "grad_norm": 0.5471487641334534, "learning_rate": 6.926953634638598e-06, "loss": 0.2434, "step": 849 }, { "epoch": 0.5725833614011452, "grad_norm": 0.49165770411491394, "learning_rate": 6.909357433815104e-06, "loss": 0.2539, "step": 850 }, { "epoch": 0.5732569888851465, "grad_norm": 0.5154786705970764, "learning_rate": 6.891764503850109e-06, "loss": 0.2525, "step": 851 }, { "epoch": 0.5739306163691479, "grad_norm": 0.5185630321502686, "learning_rate": 6.874174942169674e-06, "loss": 0.2709, "step": 852 }, { "epoch": 0.5746042438531492, "grad_norm": 0.5015746355056763, "learning_rate": 6.856588846181228e-06, "loss": 0.2522, "step": 853 }, { "epoch": 0.5752778713371506, "grad_norm": 0.5378702282905579, "learning_rate": 6.839006313272989e-06, "loss": 0.2634, "step": 854 }, { "epoch": 0.5759514988211519, "grad_norm": 0.5816572308540344, "learning_rate": 6.82142744081345e-06, "loss": 0.3396, "step": 855 }, { "epoch": 0.5766251263051533, "grad_norm": 0.5909308791160583, "learning_rate": 6.803852326150838e-06, "loss": 0.2834, "step": 856 }, { "epoch": 0.5772987537891546, "grad_norm": 0.5006569623947144, "learning_rate": 6.786281066612564e-06, "loss": 0.212, "step": 857 }, { "epoch": 0.577972381273156, "grad_norm": 0.5730767846107483, "learning_rate": 6.768713759504694e-06, "loss": 0.2998, "step": 858 }, { "epoch": 0.5786460087571573, "grad_norm": 0.5159865617752075, "learning_rate": 6.751150502111406e-06, "loss": 0.2685, "step": 859 }, { "epoch": 0.5793196362411587, "grad_norm": 0.5225328803062439, "learning_rate": 6.733591391694444e-06, "loss": 0.2404, "step": 860 }, { "epoch": 0.57999326372516, "grad_norm": 0.540481686592102, "learning_rate": 6.7160365254926005e-06, "loss": 0.265, "step": 861 }, { "epoch": 0.5806668912091614, "grad_norm": 0.5876161456108093, "learning_rate": 6.698486000721151e-06, "loss": 0.2758, "step": 862 }, { "epoch": 0.5813405186931627, "grad_norm": 0.5269771218299866, "learning_rate": 6.680939914571336e-06, "loss": 0.2497, "step": 863 }, { "epoch": 0.5820141461771641, "grad_norm": 0.5683711171150208, "learning_rate": 6.663398364209817e-06, "loss": 0.2895, "step": 864 }, { "epoch": 0.5826877736611654, "grad_norm": 0.5690784454345703, "learning_rate": 6.645861446778131e-06, "loss": 0.2927, "step": 865 }, { "epoch": 0.5833614011451668, "grad_norm": 0.4923837184906006, "learning_rate": 6.628329259392169e-06, "loss": 0.2294, "step": 866 }, { "epoch": 0.5840350286291681, "grad_norm": 0.5871672630310059, "learning_rate": 6.610801899141618e-06, "loss": 0.2883, "step": 867 }, { "epoch": 0.5847086561131695, "grad_norm": 0.5314139127731323, "learning_rate": 6.593279463089433e-06, "loss": 0.2698, "step": 868 }, { "epoch": 0.5853822835971708, "grad_norm": 0.4713616967201233, "learning_rate": 6.575762048271311e-06, "loss": 0.2551, "step": 869 }, { "epoch": 0.5860559110811722, "grad_norm": 0.5604876279830933, "learning_rate": 6.558249751695129e-06, "loss": 0.2507, "step": 870 }, { "epoch": 0.5867295385651735, "grad_norm": 0.5332925319671631, "learning_rate": 6.54074267034043e-06, "loss": 0.2921, "step": 871 }, { "epoch": 0.5874031660491748, "grad_norm": 0.5870206356048584, "learning_rate": 6.523240901157874e-06, "loss": 0.305, "step": 872 }, { "epoch": 0.5880767935331761, "grad_norm": 0.5209013223648071, "learning_rate": 6.505744541068696e-06, "loss": 0.2504, "step": 873 }, { "epoch": 0.5887504210171774, "grad_norm": 0.5347055196762085, "learning_rate": 6.488253686964189e-06, "loss": 0.26, "step": 874 }, { "epoch": 0.5894240485011788, "grad_norm": 0.5568848252296448, "learning_rate": 6.470768435705146e-06, "loss": 0.2506, "step": 875 }, { "epoch": 0.5900976759851801, "grad_norm": 0.4880235493183136, "learning_rate": 6.45328888412133e-06, "loss": 0.2549, "step": 876 }, { "epoch": 0.5907713034691815, "grad_norm": 0.5328478217124939, "learning_rate": 6.435815129010952e-06, "loss": 0.2892, "step": 877 }, { "epoch": 0.5914449309531828, "grad_norm": 0.5507891178131104, "learning_rate": 6.418347267140113e-06, "loss": 0.295, "step": 878 }, { "epoch": 0.5921185584371842, "grad_norm": 0.5917878150939941, "learning_rate": 6.400885395242284e-06, "loss": 0.2775, "step": 879 }, { "epoch": 0.5927921859211855, "grad_norm": 0.5396655201911926, "learning_rate": 6.383429610017763e-06, "loss": 0.2601, "step": 880 }, { "epoch": 0.5934658134051869, "grad_norm": 0.5640776753425598, "learning_rate": 6.3659800081331375e-06, "loss": 0.2532, "step": 881 }, { "epoch": 0.5941394408891882, "grad_norm": 0.5693733096122742, "learning_rate": 6.348536686220761e-06, "loss": 0.276, "step": 882 }, { "epoch": 0.5948130683731896, "grad_norm": 0.49299901723861694, "learning_rate": 6.331099740878201e-06, "loss": 0.2197, "step": 883 }, { "epoch": 0.5954866958571909, "grad_norm": 0.5112996697425842, "learning_rate": 6.3136692686677204e-06, "loss": 0.2685, "step": 884 }, { "epoch": 0.5961603233411923, "grad_norm": 0.5770703554153442, "learning_rate": 6.2962453661157305e-06, "loss": 0.2439, "step": 885 }, { "epoch": 0.5968339508251936, "grad_norm": 0.5604544878005981, "learning_rate": 6.2788281297122605e-06, "loss": 0.2603, "step": 886 }, { "epoch": 0.597507578309195, "grad_norm": 0.5164006948471069, "learning_rate": 6.261417655910432e-06, "loss": 0.2419, "step": 887 }, { "epoch": 0.5981812057931963, "grad_norm": 0.5085450410842896, "learning_rate": 6.244014041125906e-06, "loss": 0.2714, "step": 888 }, { "epoch": 0.5988548332771977, "grad_norm": 0.5820232629776001, "learning_rate": 6.226617381736361e-06, "loss": 0.2909, "step": 889 }, { "epoch": 0.599528460761199, "grad_norm": 0.5919815301895142, "learning_rate": 6.209227774080969e-06, "loss": 0.3283, "step": 890 }, { "epoch": 0.6002020882452004, "grad_norm": 0.5612049102783203, "learning_rate": 6.191845314459836e-06, "loss": 0.2623, "step": 891 }, { "epoch": 0.6008757157292017, "grad_norm": 0.5206785798072815, "learning_rate": 6.174470099133495e-06, "loss": 0.2391, "step": 892 }, { "epoch": 0.6015493432132031, "grad_norm": 0.5109294652938843, "learning_rate": 6.157102224322357e-06, "loss": 0.2435, "step": 893 }, { "epoch": 0.6022229706972044, "grad_norm": 0.5124114155769348, "learning_rate": 6.13974178620618e-06, "loss": 0.2508, "step": 894 }, { "epoch": 0.6028965981812058, "grad_norm": 0.538691520690918, "learning_rate": 6.1223888809235475e-06, "loss": 0.2742, "step": 895 }, { "epoch": 0.6035702256652071, "grad_norm": 0.4782629609107971, "learning_rate": 6.105043604571319e-06, "loss": 0.215, "step": 896 }, { "epoch": 0.6042438531492085, "grad_norm": 0.48708873987197876, "learning_rate": 6.087706053204106e-06, "loss": 0.2685, "step": 897 }, { "epoch": 0.6049174806332098, "grad_norm": 0.5199108719825745, "learning_rate": 6.070376322833751e-06, "loss": 0.2522, "step": 898 }, { "epoch": 0.6055911081172112, "grad_norm": 0.5264055728912354, "learning_rate": 6.053054509428774e-06, "loss": 0.2702, "step": 899 }, { "epoch": 0.6062647356012125, "grad_norm": 0.5014949440956116, "learning_rate": 6.035740708913861e-06, "loss": 0.2592, "step": 900 }, { "epoch": 0.6062647356012125, "eval_loss": 0.255189448595047, "eval_runtime": 106.7863, "eval_samples_per_second": 46.823, "eval_steps_per_second": 2.931, "step": 900 } ], "logging_steps": 1, "max_steps": 1484, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.93684278954505e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }