winglian committed on
Commit
d4f6c65
·
unverified ·
1 Parent(s): a944f7b

clean up the deepspeed proxy model at the end of training (#1675)

Browse files
Files changed (1) hide show
  1. src/axolotl/train.py +7 -0
src/axolotl/train.py CHANGED
@@ -197,6 +197,13 @@ def train(
197
  trainer.accelerator.wait_for_everyone()
198
  unwrapped_model = trainer.accelerator.unwrap_model(trainer.model_wrapped)
199
 
 
 
 
 
 
 
 
200
  # Saves the whole/unpartitioned fp16 model when in ZeRO Stage-3 to the output directory if
201
  # `stage3_gather_16bit_weights_on_model_save` is True in DeepSpeed Config file or
202
  # `zero3_save_16bit_model` is True in DeepSpeed Plugin.
 
197
  trainer.accelerator.wait_for_everyone()
198
  unwrapped_model = trainer.accelerator.unwrap_model(trainer.model_wrapped)
199
 
200
+ # the trainer saved a model.safetensors file in the output directory,
201
+ # but it is a proxy model and should be deleted
202
+ if os.path.exists(os.path.join(cfg.output_dir, "model.safetensors")):
203
+ LOG.info(f"Deleting {os.path.join(cfg.output_dir, 'model.safetensors')}")
204
+ LOG.info("This is a proxy model and should be deleted")
205
+ os.remove(os.path.join(cfg.output_dir, "model.safetensors"))
206
+
207
  # Saves the whole/unpartitioned fp16 model when in ZeRO Stage-3 to the output directory if
208
  # `stage3_gather_16bit_weights_on_model_save` is True in DeepSpeed Config file or
209
  # `zero3_save_16bit_model` is True in DeepSpeed Plugin.