{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.006976358863899959, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00013952717727799919, "grad_norm": 0.2509855329990387, "learning_rate": 1e-05, "loss": 1.1406, "step": 1 }, { "epoch": 0.00013952717727799919, "eval_loss": 1.3329986333847046, "eval_runtime": 390.6167, "eval_samples_per_second": 30.902, "eval_steps_per_second": 15.452, "step": 1 }, { "epoch": 0.00027905435455599837, "grad_norm": 0.28208601474761963, "learning_rate": 2e-05, "loss": 1.2395, "step": 2 }, { "epoch": 0.00041858153183399753, "grad_norm": 0.2632408142089844, "learning_rate": 3e-05, "loss": 1.1857, "step": 3 }, { "epoch": 0.0005581087091119967, "grad_norm": 0.26442626118659973, "learning_rate": 4e-05, "loss": 1.2206, "step": 4 }, { "epoch": 0.0006976358863899959, "grad_norm": 0.2703617513179779, "learning_rate": 5e-05, "loss": 1.2395, "step": 5 }, { "epoch": 0.0008371630636679951, "grad_norm": 0.2943132817745209, "learning_rate": 6e-05, "loss": 1.3794, "step": 6 }, { "epoch": 0.0009766902409459942, "grad_norm": 0.3263092041015625, "learning_rate": 7e-05, "loss": 1.2371, "step": 7 }, { "epoch": 0.0011162174182239935, "grad_norm": 0.28822728991508484, "learning_rate": 8e-05, "loss": 1.1173, "step": 8 }, { "epoch": 0.0012557445955019926, "grad_norm": 0.3245331048965454, "learning_rate": 9e-05, "loss": 1.114, "step": 9 }, { "epoch": 0.0013952717727799917, "grad_norm": 0.36671578884124756, "learning_rate": 0.0001, "loss": 1.3002, "step": 10 }, { "epoch": 0.001534798950057991, "grad_norm": 0.35024723410606384, "learning_rate": 9.98458666866564e-05, "loss": 1.1694, "step": 11 }, { "epoch": 0.0016743261273359901, "grad_norm": 0.3728810250759125, "learning_rate": 9.938441702975689e-05, "loss": 1.2388, "step": 12 }, { "epoch": 0.0018138533046139895, "grad_norm": 0.42505979537963867, "learning_rate": 9.861849601988383e-05, "loss": 1.1116, "step": 13 }, { "epoch": 0.0018138533046139895, "eval_loss": 1.2273173332214355, "eval_runtime": 293.5985, "eval_samples_per_second": 41.114, "eval_steps_per_second": 20.559, "step": 13 }, { "epoch": 0.0019533804818919883, "grad_norm": 0.37883734703063965, "learning_rate": 9.755282581475769e-05, "loss": 1.2951, "step": 14 }, { "epoch": 0.002092907659169988, "grad_norm": 0.32293638586997986, "learning_rate": 9.619397662556435e-05, "loss": 1.0587, "step": 15 }, { "epoch": 0.002232434836447987, "grad_norm": 0.3402497172355652, "learning_rate": 9.45503262094184e-05, "loss": 1.1641, "step": 16 }, { "epoch": 0.002371962013725986, "grad_norm": 0.3069184720516205, "learning_rate": 9.263200821770461e-05, "loss": 1.1247, "step": 17 }, { "epoch": 0.002511489191003985, "grad_norm": 0.2706042230129242, "learning_rate": 9.045084971874738e-05, "loss": 1.0705, "step": 18 }, { "epoch": 0.0026510163682819843, "grad_norm": 0.29615721106529236, "learning_rate": 8.802029828000156e-05, "loss": 1.1871, "step": 19 }, { "epoch": 0.0027905435455599834, "grad_norm": 0.23046638071537018, "learning_rate": 8.535533905932738e-05, "loss": 1.2344, "step": 20 }, { "epoch": 0.002930070722837983, "grad_norm": 0.2650396227836609, "learning_rate": 8.247240241650918e-05, "loss": 1.141, "step": 21 }, { "epoch": 0.003069597900115982, "grad_norm": 0.5365617275238037, "learning_rate": 7.938926261462366e-05, "loss": 1.171, "step": 22 }, { "epoch": 0.003209125077393981, "grad_norm": 0.24520841240882874, "learning_rate": 7.612492823579745e-05, "loss": 1.0765, "step": 23 }, { "epoch": 0.0033486522546719803, "grad_norm": 0.25750038027763367, "learning_rate": 7.269952498697734e-05, "loss": 1.0613, "step": 24 }, { "epoch": 0.0034881794319499794, "grad_norm": 0.2683405578136444, "learning_rate": 6.91341716182545e-05, "loss": 1.1566, "step": 25 }, { "epoch": 0.003627706609227979, "grad_norm": 0.2555634081363678, "learning_rate": 6.545084971874738e-05, "loss": 1.0815, "step": 26 }, { "epoch": 0.003627706609227979, "eval_loss": 1.1382033824920654, "eval_runtime": 293.8371, "eval_samples_per_second": 41.081, "eval_steps_per_second": 20.542, "step": 26 }, { "epoch": 0.003767233786505978, "grad_norm": 0.22712597250938416, "learning_rate": 6.167226819279528e-05, "loss": 1.0138, "step": 27 }, { "epoch": 0.003906760963783977, "grad_norm": 0.2388458102941513, "learning_rate": 5.782172325201155e-05, "loss": 1.1088, "step": 28 }, { "epoch": 0.004046288141061977, "grad_norm": 0.2228991836309433, "learning_rate": 5.392295478639225e-05, "loss": 1.0923, "step": 29 }, { "epoch": 0.004185815318339976, "grad_norm": 0.23074015974998474, "learning_rate": 5e-05, "loss": 1.1228, "step": 30 }, { "epoch": 0.004325342495617975, "grad_norm": 0.23124848306179047, "learning_rate": 4.607704521360776e-05, "loss": 1.1397, "step": 31 }, { "epoch": 0.004464869672895974, "grad_norm": 0.23561260104179382, "learning_rate": 4.2178276747988446e-05, "loss": 1.1264, "step": 32 }, { "epoch": 0.004604396850173973, "grad_norm": 0.20774266123771667, "learning_rate": 3.832773180720475e-05, "loss": 1.0459, "step": 33 }, { "epoch": 0.004743924027451972, "grad_norm": 0.22472628951072693, "learning_rate": 3.4549150281252636e-05, "loss": 1.0317, "step": 34 }, { "epoch": 0.004883451204729971, "grad_norm": 0.2344016581773758, "learning_rate": 3.086582838174551e-05, "loss": 1.0939, "step": 35 }, { "epoch": 0.00502297838200797, "grad_norm": 0.23103342950344086, "learning_rate": 2.7300475013022663e-05, "loss": 0.9759, "step": 36 }, { "epoch": 0.0051625055592859695, "grad_norm": 0.3125031590461731, "learning_rate": 2.3875071764202563e-05, "loss": 1.1596, "step": 37 }, { "epoch": 0.005302032736563969, "grad_norm": 0.23054346442222595, "learning_rate": 2.061073738537635e-05, "loss": 0.9864, "step": 38 }, { "epoch": 0.005441559913841968, "grad_norm": 0.2276124805212021, "learning_rate": 1.7527597583490822e-05, "loss": 1.0919, "step": 39 }, { "epoch": 0.005441559913841968, "eval_loss": 1.1175899505615234, "eval_runtime": 294.0449, "eval_samples_per_second": 41.052, "eval_steps_per_second": 20.527, "step": 39 }, { "epoch": 0.005581087091119967, "grad_norm": 0.2677469253540039, "learning_rate": 1.4644660940672627e-05, "loss": 1.1697, "step": 40 }, { "epoch": 0.005720614268397967, "grad_norm": 0.21872298419475555, "learning_rate": 1.1979701719998453e-05, "loss": 1.0556, "step": 41 }, { "epoch": 0.005860141445675966, "grad_norm": 0.25124436616897583, "learning_rate": 9.549150281252633e-06, "loss": 0.9909, "step": 42 }, { "epoch": 0.005999668622953965, "grad_norm": 0.23902897536754608, "learning_rate": 7.367991782295391e-06, "loss": 1.168, "step": 43 }, { "epoch": 0.006139195800231964, "grad_norm": 0.21240876615047455, "learning_rate": 5.449673790581611e-06, "loss": 1.0213, "step": 44 }, { "epoch": 0.006278722977509963, "grad_norm": 0.24014732241630554, "learning_rate": 3.8060233744356633e-06, "loss": 1.0644, "step": 45 }, { "epoch": 0.006418250154787962, "grad_norm": 0.21955840289592743, "learning_rate": 2.4471741852423237e-06, "loss": 0.952, "step": 46 }, { "epoch": 0.006557777332065961, "grad_norm": 0.21186882257461548, "learning_rate": 1.3815039801161721e-06, "loss": 0.9789, "step": 47 }, { "epoch": 0.0066973045093439605, "grad_norm": 0.26049795746803284, "learning_rate": 6.15582970243117e-07, "loss": 0.993, "step": 48 }, { "epoch": 0.00683683168662196, "grad_norm": 0.22005507349967957, "learning_rate": 1.5413331334360182e-07, "loss": 0.9553, "step": 49 }, { "epoch": 0.006976358863899959, "grad_norm": 0.21095995604991913, "learning_rate": 0.0, "loss": 1.0059, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.69912849629184e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }