{ "best_metric": 1.613356113433838, "best_model_checkpoint": "output/checkpoint-6000", "epoch": 1.8028846153846154, "eval_steps": 2000, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030048076923076924, "grad_norm": 2.3787589073181152, "learning_rate": 2e-05, "loss": 3.6173, "step": 100 }, { "epoch": 0.06009615384615385, "grad_norm": 1.996222734451294, "learning_rate": 4e-05, "loss": 1.7498, "step": 200 }, { "epoch": 0.09014423076923077, "grad_norm": 1.5797070264816284, "learning_rate": 6e-05, "loss": 1.7224, "step": 300 }, { "epoch": 0.1201923076923077, "grad_norm": 1.4705523252487183, "learning_rate": 8e-05, "loss": 1.6887, "step": 400 }, { "epoch": 0.1502403846153846, "grad_norm": 1.807113766670227, "learning_rate": 0.0001, "loss": 1.6883, "step": 500 }, { "epoch": 0.18028846153846154, "grad_norm": 1.211004376411438, "learning_rate": 9.99725705593595e-05, "loss": 1.6908, "step": 600 }, { "epoch": 0.21033653846153846, "grad_norm": 1.194908857345581, "learning_rate": 9.989031233240653e-05, "loss": 1.6865, "step": 700 }, { "epoch": 0.2403846153846154, "grad_norm": 1.125349998474121, "learning_rate": 9.975331557102723e-05, "loss": 1.6796, "step": 800 }, { "epoch": 0.2704326923076923, "grad_norm": 1.2960246801376343, "learning_rate": 9.9561730585003e-05, "loss": 1.6697, "step": 900 }, { "epoch": 0.3004807692307692, "grad_norm": 1.1631627082824707, "learning_rate": 9.931576757709384e-05, "loss": 1.6675, "step": 1000 }, { "epoch": 0.33052884615384615, "grad_norm": 1.0823206901550293, "learning_rate": 9.901569641240883e-05, "loss": 1.6532, "step": 1100 }, { "epoch": 0.3605769230769231, "grad_norm": 1.1251226663589478, "learning_rate": 9.866184632231592e-05, "loss": 1.6625, "step": 1200 }, { "epoch": 0.390625, "grad_norm": 1.27994704246521, "learning_rate": 9.825460554321679e-05, "loss": 1.6463, "step": 1300 }, { "epoch": 0.4206730769230769, "grad_norm": 1.3841296434402466, "learning_rate": 9.779442089058252e-05, "loss": 1.6462, "step": 1400 }, { "epoch": 0.45072115384615385, "grad_norm": 1.1315901279449463, "learning_rate": 9.728179726871762e-05, "loss": 1.6452, "step": 1500 }, { "epoch": 0.4807692307692308, "grad_norm": 1.2269665002822876, "learning_rate": 9.671729711679036e-05, "loss": 1.6449, "step": 1600 }, { "epoch": 0.5108173076923077, "grad_norm": 1.135764479637146, "learning_rate": 9.610153979173711e-05, "loss": 1.6367, "step": 1700 }, { "epoch": 0.5408653846153846, "grad_norm": 1.2129323482513428, "learning_rate": 9.543520088871773e-05, "loss": 1.6271, "step": 1800 }, { "epoch": 0.5709134615384616, "grad_norm": 1.2092684507369995, "learning_rate": 9.471901149986767e-05, "loss": 1.6353, "step": 1900 }, { "epoch": 0.6009615384615384, "grad_norm": 1.2054263353347778, "learning_rate": 9.39537574121601e-05, "loss": 1.6386, "step": 2000 }, { "epoch": 0.6009615384615384, "eval_loss": 1.627854585647583, "eval_runtime": 242.6417, "eval_samples_per_second": 73.141, "eval_steps_per_second": 9.145, "step": 2000 }, { "epoch": 0.6310096153846154, "grad_norm": 1.4155864715576172, "learning_rate": 9.314027824525798e-05, "loss": 1.6322, "step": 2100 }, { "epoch": 0.6610576923076923, "grad_norm": 1.2875721454620361, "learning_rate": 9.22794665303021e-05, "loss": 1.6205, "step": 2200 }, { "epoch": 0.6911057692307693, "grad_norm": 1.2036750316619873, "learning_rate": 9.137226673064603e-05, "loss": 1.6201, "step": 2300 }, { "epoch": 0.7211538461538461, "grad_norm": 1.3741754293441772, "learning_rate": 9.04196742056119e-05, "loss": 1.6197, "step": 2400 }, { "epoch": 0.7512019230769231, "grad_norm": 1.3001148700714111, "learning_rate": 8.942273411840452e-05, "loss": 1.6285, "step": 2500 }, { "epoch": 0.78125, "grad_norm": 1.3223652839660645, "learning_rate": 8.838254028938162e-05, "loss": 1.6323, "step": 2600 }, { "epoch": 0.8112980769230769, "grad_norm": 1.399418592453003, "learning_rate": 8.730023399593876e-05, "loss": 1.6184, "step": 2700 }, { "epoch": 0.8413461538461539, "grad_norm": 1.2166377305984497, "learning_rate": 8.617700272032516e-05, "loss": 1.6165, "step": 2800 }, { "epoch": 0.8713942307692307, "grad_norm": 1.208473563194275, "learning_rate": 8.501407884676479e-05, "loss": 1.616, "step": 2900 }, { "epoch": 0.9014423076923077, "grad_norm": 1.2277066707611084, "learning_rate": 8.381273830931207e-05, "loss": 1.6122, "step": 3000 }, { "epoch": 0.9314903846153846, "grad_norm": 1.204302191734314, "learning_rate": 8.257429919192542e-05, "loss": 1.6186, "step": 3100 }, { "epoch": 0.9615384615384616, "grad_norm": 1.2529523372650146, "learning_rate": 8.130012028229512e-05, "loss": 1.6164, "step": 3200 }, { "epoch": 0.9915865384615384, "grad_norm": 1.513980507850647, "learning_rate": 7.999159958101186e-05, "loss": 1.5971, "step": 3300 }, { "epoch": 1.0216346153846154, "grad_norm": 1.420433521270752, "learning_rate": 7.865017276771173e-05, "loss": 1.4976, "step": 3400 }, { "epoch": 1.0516826923076923, "grad_norm": 1.4616764783859253, "learning_rate": 7.727731162588074e-05, "loss": 1.4486, "step": 3500 }, { "epoch": 1.0817307692307692, "grad_norm": 1.4156601428985596, "learning_rate": 7.587452242804676e-05, "loss": 1.4467, "step": 3600 }, { "epoch": 1.1117788461538463, "grad_norm": 1.3352571725845337, "learning_rate": 7.444334428313112e-05, "loss": 1.4516, "step": 3700 }, { "epoch": 1.1418269230769231, "grad_norm": 1.4259686470031738, "learning_rate": 7.298534744777267e-05, "loss": 1.4466, "step": 3800 }, { "epoch": 1.171875, "grad_norm": 1.4755374193191528, "learning_rate": 7.150213160347743e-05, "loss": 1.446, "step": 3900 }, { "epoch": 1.2019230769230769, "grad_norm": 1.4399892091751099, "learning_rate": 6.999532410148371e-05, "loss": 1.4331, "step": 4000 }, { "epoch": 1.2019230769230769, "eval_loss": 1.6244958639144897, "eval_runtime": 248.0586, "eval_samples_per_second": 71.544, "eval_steps_per_second": 8.945, "step": 4000 }, { "epoch": 1.2319711538461537, "grad_norm": 1.629622220993042, "learning_rate": 6.846657817726882e-05, "loss": 1.4356, "step": 4100 }, { "epoch": 1.2620192307692308, "grad_norm": 1.5240803956985474, "learning_rate": 6.691757113665606e-05, "loss": 1.4403, "step": 4200 }, { "epoch": 1.2920673076923077, "grad_norm": 1.947218894958496, "learning_rate": 6.535000251551231e-05, "loss": 1.452, "step": 4300 }, { "epoch": 1.3221153846153846, "grad_norm": 1.6493359804153442, "learning_rate": 6.376559221505535e-05, "loss": 1.4435, "step": 4400 }, { "epoch": 1.3521634615384617, "grad_norm": 1.6366957426071167, "learning_rate": 6.216607861481659e-05, "loss": 1.4385, "step": 4500 }, { "epoch": 1.3822115384615383, "grad_norm": 1.679699182510376, "learning_rate": 6.055321666533013e-05, "loss": 1.4509, "step": 4600 }, { "epoch": 1.4122596153846154, "grad_norm": 1.5405994653701782, "learning_rate": 5.8928775962640146e-05, "loss": 1.4375, "step": 4700 }, { "epoch": 1.4423076923076923, "grad_norm": 1.5734689235687256, "learning_rate": 5.7294538806739775e-05, "loss": 1.4315, "step": 4800 }, { "epoch": 1.4723557692307692, "grad_norm": 1.6284011602401733, "learning_rate": 5.565229824607143e-05, "loss": 1.4457, "step": 4900 }, { "epoch": 1.5024038461538463, "grad_norm": 1.5765283107757568, "learning_rate": 5.400385611023416e-05, "loss": 1.4374, "step": 5000 }, { "epoch": 1.5324519230769231, "grad_norm": 1.7722498178482056, "learning_rate": 5.235102103305654e-05, "loss": 1.4513, "step": 5100 }, { "epoch": 1.5625, "grad_norm": 1.6080434322357178, "learning_rate": 5.0695606468204095e-05, "loss": 1.4322, "step": 5200 }, { "epoch": 1.5925480769230769, "grad_norm": 1.7113045454025269, "learning_rate": 4.90394286994985e-05, "loss": 1.4372, "step": 5300 }, { "epoch": 1.6225961538461537, "grad_norm": 1.595045566558838, "learning_rate": 4.738430484813162e-05, "loss": 1.4391, "step": 5400 }, { "epoch": 1.6526442307692308, "grad_norm": 1.5261282920837402, "learning_rate": 4.5732050878960816e-05, "loss": 1.4375, "step": 5500 }, { "epoch": 1.6826923076923077, "grad_norm": 1.6319518089294434, "learning_rate": 4.40844796080729e-05, "loss": 1.4269, "step": 5600 }, { "epoch": 1.7127403846153846, "grad_norm": 1.6978216171264648, "learning_rate": 4.244339871380291e-05, "loss": 1.4261, "step": 5700 }, { "epoch": 1.7427884615384617, "grad_norm": 1.748810887336731, "learning_rate": 4.0810608753389864e-05, "loss": 1.4349, "step": 5800 }, { "epoch": 1.7728365384615383, "grad_norm": 1.4340012073516846, "learning_rate": 3.9187901187445675e-05, "loss": 1.4349, "step": 5900 }, { "epoch": 1.8028846153846154, "grad_norm": 1.7568167448043823, "learning_rate": 3.757705641440461e-05, "loss": 1.4318, "step": 6000 }, { "epoch": 1.8028846153846154, "eval_loss": 1.613356113433838, "eval_runtime": 246.395, "eval_samples_per_second": 72.027, "eval_steps_per_second": 9.006, "step": 6000 } ], "logging_steps": 100, "max_steps": 9984, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.830080251756462e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }