Clean up the DeepSpeed proxy model at the end of training (#1675)
Browse files
- src/axolotl/train.py +7 -0
src/axolotl/train.py
CHANGED
@@ -197,6 +197,13 @@ def train(
|
|
197 |
trainer.accelerator.wait_for_everyone()
|
198 |
unwrapped_model = trainer.accelerator.unwrap_model(trainer.model_wrapped)
|
199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
# Saves the whole/unpartitioned fp16 model when in ZeRO Stage-3 to the output directory if
|
201 |
# `stage3_gather_16bit_weights_on_model_save` is True in DeepSpeed Config file or
|
202 |
# `zero3_save_16bit_model` is True in DeepSpeed Plugin.
|
|
|
197 |
trainer.accelerator.wait_for_everyone()
|
198 |
unwrapped_model = trainer.accelerator.unwrap_model(trainer.model_wrapped)
|
199 |
|
200 |
+
# the trainer saved a model.safetensors file in the output directory,
|
201 |
+
# but it is a proxy model and should be deleted
|
202 |
+
if os.path.exists(os.path.join(cfg.output_dir, "model.safetensors")):
|
203 |
+
LOG.info(f"Deleting {os.path.join(cfg.output_dir, 'model.safetensors')}")
|
204 |
+
LOG.info("This is a proxy model and should be deleted")
|
205 |
+
os.remove(os.path.join(cfg.output_dir, "model.safetensors"))
|
206 |
+
|
207 |
# Saves the whole/unpartitioned fp16 model when in ZeRO Stage-3 to the output directory if
|
208 |
# `stage3_gather_16bit_weights_on_model_save` is True in DeepSpeed Config file or
|
209 |
# `zero3_save_16bit_model` is True in DeepSpeed Plugin.
|